In [10]:
import pandas as pd

# Load the dataset
csv_dataset_path = 'dataset.csv'
xlsx_dataset_path = 'dataset.xlsx'

# Read the CSV file
csv_df = pd.read_csv(csv_dataset_path)

# Read the Excel file
xlsx_df = pd.read_excel(xlsx_dataset_path)

# Display the first few rows of both datasets to understand their structure
csv_df.head(), xlsx_df.head()


(                                                Link            EN_title  \
 0  https://www.imvbox.com/watch-persian-movie-ira...   Local Anaesthetic   
 1  https://www.imvbox.com/watch-persian-movie-ira...         Disturbance   
 2  https://www.imvbox.com/watch-persian-movie-ira...           Highlight   
 3  https://www.imvbox.com/watch-persian-movie-ira...               Gilda   
 4  https://www.imvbox.com/watch-persian-movie-ira...  Atmosphere Station   
 
      PENGLISH_title   PERSIAN_title  \
 0  Bi Hessie Mozeie    بی‌حسی موضعی   
 1         Ashoftegi        آشفته گی   
 2           Haylayt         هایلایت   
 3            Geelda           گیلدا   
 4  Istgahe Atmosfer  ایستگاه اتمسفر   
 
                                            Content_1  \
 0  جلال‌، دانشجوی سابق رشته فلسفه، متوجه می‌شود خ...   
 1  «آشفته‌گی» رئالیستی و اجتماعی نیست. یک فیلم اس...   
 2  یک تصادف اتومبیل آدم‌هایی را در تقابل با هم قر...   
 3  گیلدا ماجرای زنی به نام «گیلدا» را روایت می کن...   
 4  این ف

In [6]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

# Predict
y_pred_lr = lr_model.predict(X_test_tfidf)

# Evaluate
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Logistic Regression Accuracy: {accuracy_lr:.4f}')
print(classification_report(y_test, y_pred_lr, target_names=label_encoder.classes_))

Logistic Regression Accuracy: 0.3452
                          precision    recall  f1-score   support

                  Action       0.00      0.00      0.00        15
               Adventure       0.00      0.00      0.00        10
       Arts & Literature       0.00      0.00      0.00         7
                  Comedy       0.34      0.94      0.50        54
                   Crime       0.00      0.00      0.00        13
    Culture & Traditions       0.00      0.00      0.00         5
                   Drama       0.37      0.17      0.24        40
                  Family       0.00      0.00      0.00         5
                 History       0.00      0.00      0.00         4
Human Interest & Society       0.00      0.00      0.00         9
                 Romance       0.00      0.00      0.00         2
                     War       0.00      0.00      0.00         4

                accuracy                           0.35       168
               macro avg       0.06  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# Handle missing values by dropping rows with missing content or title
cleaned_df = csv_df.dropna(subset=['Content_1', 'Content_2', 'PERSIAN_title', 'PENGLISH_title', 'Time'])

# Select top 5 genres with the highest counts
top_genres = cleaned_df['Genre'].value_counts().nlargest(5).index

# Filter the dataset to only include rows with the selected genres
balanced_df = cleaned_df[cleaned_df['Genre'].isin(top_genres)]

# Re-check the genre distribution after filtering
balanced_genre_distribution = balanced_df['Genre'].value_counts()

# Split the dataset into training (80%), validation (10%), and test (10%)
from sklearn.model_selection import train_test_split

# Split into train (90%) and test (10%)
train_val_df, test_df = train_test_split(balanced_df, test_size=0.1, stratify=balanced_df['Genre'], random_state=42)

# Further split train into training (80%) and validation (10%)
train_df, val_df = train_test_split(train_val_df, test_size=0.1111, stratify=train_val_df['Genre'], random_state=42)

balanced_genre_distribution, train_df.shape, val_df.shape, test_df.shape


(Genre
 Drama        537
 Comedy       177
 Action        56
 Crime         46
 Adventure     37
 Name: count, dtype: int64,
 (681, 10),
 (86, 10),
 (86, 10))

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import numpy as np

# Prepare the data
X_train = train_df['Content_1']
y_train = train_df['Genre']

X_test = test_df['Content_1']
y_test = test_df['Genre']

# Transform text data into TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Define models
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": LinearSVC()
}

# Evaluate models using cross-validation and report mean and std of accuracy, precision, recall, and f1
results = []
for model_name, model in models.items():
    cv_accuracy = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
    cv_precision = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='precision_macro')
    cv_recall = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='recall_macro')
    cv_f1 = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='f1_macro')
    
    results.append({
        "Model": model_name,
        "Accuracy Mean": cv_accuracy.mean(),
        "Accuracy Std": cv_accuracy.std(),
        "Precision Mean": cv_precision.mean(),
        "Precision Std": cv_precision.std(),
        "Recall Mean": cv_recall.mean(),
        "Recall Std": cv_recall.std(),
        "F1 Score Mean": cv_f1.mean(),
        "F1 Score Std": cv_f1.std()
    })

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)

# Display the results
print(results_df)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                 Model  Accuracy Mean  Accuracy Std  Precision Mean  \
0          Naive Bayes       0.629959      0.003057        0.125992   
1  Logistic Regression       0.631430      0.001846        0.166177   
2           Linear SVM       0.628521      0.014199        0.288494   

   Precision Std  Recall Mean  Recall Std  F1 Score Mean  F1 Score Std  
0       0.000611     0.200000    0.000000       0.154594      0.000461  
1       0.079875     0.201429    0.002857       0.157493      0.005430  
2       0.071546     0.218031    0.007969       0.194966      0.015130  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
