
## 1. Imports

In [29]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import classification_report

## 2. Dataset load

In [30]:
# Training file to load. Uncomment one of the alternatives to switch dataset size.
#FILE_NAME = "train.txt"
#FILE_NAME = "train_subset_1000.txt"
#FILE_NAME = "train_subset_2500.txt"
FILE_NAME = "train_subset_5000.txt"

# Read inside a context manager so the handle is closed as soon as the file
# has been consumed instead of staying open for the life of the kernel.
with open(FILE_NAME, "r", encoding="utf-8") as file:
    # Strip only the line terminator so the last column (the plot) does not
    # carry a trailing newline; every other character is left untouched.
    data = [line.rstrip("\n").split("\t") for line in file]

# Records are tab-separated: index 2 is the genre and index 4 is the plot
# text (i.e. the 3rd and 5th fields of each line).
df = pd.DataFrame(
    [[row[2], row[4]] for row in data],
    columns=['genre', 'plot'],
)
print(df.head())


     genre                                               plot
0  romance  Sekhar (Tarun) is a graduate from IIM and work...
1   horror  Kris Fowles (Katie Cassidy) goes to the Spring...
2   horror  Cynthia is traumatized by the death of her bab...
3    crime  Four friends, Gangu (Jackie Shroff), Abdul (Na...
4    drama  Crisis in a middle-class family when the son f...


## 3. Pre-processing

In [31]:
# One shared stemmer instance, reused for every token.
porter_stemmer=PorterStemmer()
def processSentence(s):
    """Stem every whitespace-separated token of ``s`` with the Porter stemmer.

    Args:
        s: Raw text, e.g. one plot summary.

    Returns:
        The stemmed tokens joined by single spaces. An empty or
        whitespace-only input yields an empty string.
    """
    # str.split() without arguments splits on runs of whitespace and never
    # yields empty tokens, so leading/trailing whitespace (such as a line's
    # final newline) does not turn into stray spaces in the result.
    words=s.split()
    stemmed_words=[porter_stemmer.stem(word=w) for w in words]
    return ' '.join(stemmed_words)

# Stem every plot summary; the result keeps the row order of df["plot"].
x_processed = list(map(processSentence, df["plot"]))

# Distinct genre names in sorted order; reused later as the label order of
# the classification report.
labels = sorted(df["genre"].unique())

print(labels)


['action', 'animation', 'comedy', 'crime', 'drama', 'horror', 'romance', 'sci-fi', 'western']


## 4. Create Vectors

In [32]:
# Both vectorizers use the same vocabulary settings: uni- to tri-grams,
# scikit-learn's built-in English stop-word list, a minimum document
# frequency of 0.1 %, and at most 2500 features.
# NOTE(review): the stop-word list is applied to already-stemmed text, so
# stop words whose stem differs from the original form may not be removed.
vectorizer_options = dict(ngram_range=(1,3), stop_words='english', min_df=0.001, max_features=2500)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
count_vectorizer = CountVectorizer(**vectorizer_options)

# Fit each vectorizer on all processed plots and densify the result.
# X holds the TF-IDF weights, Z the raw term counts.
X = tfidf_vectorizer.fit_transform(x_processed).toarray()
Z = count_vectorizer.fit_transform(x_processed).toarray()

## Train/test split


In [33]:
# Split row positions rather than the matrices themselves, so the same
# partition can be applied to both the TF-IDF (X) and the count (Z) features.
indices = range(len(df))
print(indices)
train_indices, test_test, y_train, y_test = train_test_split(
    indices,
    df["genre"],
    random_state=42,
    test_size=0.2,
)

# `test_test` holds the row positions of the held-out 20 %.
X_train = X[train_indices]
X_test = X[test_test]
Z_train = Z[train_indices]
Z_test = Z[test_test]

range(0, 5000)


## Train with classic classifiers

In [25]:
from sklearn.multiclass import OneVsRestClassifier

print("==== SVM ====")
# One linear-kernel SVC per genre, combined one-vs-rest.
# SVC also supports the 'poly', 'rbf', 'sigmoid' and 'precomputed' kernels.
base_svm = svm.SVC(kernel='linear')
clf = OneVsRestClassifier(base_svm)

# The printed heading below is Portuguese for "Results with tfidf".
print("Resultados com tfidf")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-genre precision/recall/F1 on the held-out split, in `labels` order.
report = classification_report(y_true=y_test, y_pred=y_pred, labels=labels, zero_division=1.)
print(report)


==== SVM ====
Resultados com tfidf
              precision    recall  f1-score   support

      action       0.59      0.55      0.57       135
   animation       0.66      0.71      0.68        65
      comedy       0.46      0.45      0.46       132
       crime       0.51      0.31      0.38        72
       drama       0.50      0.51      0.50       215
      horror       0.66      0.88      0.75       128
     romance       0.59      0.54      0.57       118
      sci-fi       0.53      0.31      0.39        29
     western       0.77      0.88      0.82       106

    accuracy                           0.59      1000
   macro avg       0.59      0.57      0.57      1000
weighted avg       0.58      0.59      0.58      1000



In [26]:
# Print how many classes the fitted one-vs-rest model knows about.
print(clf.classes_.shape)

# The vocabulary and the cut-off are the same for every class, so compute
# them once instead of on every loop iteration.
feature_names = tfidf_vectorizer.get_feature_names_out()
top_n = 10

for idx, class_name in enumerate(clf.classes_):
    print(f"==== {class_name} ====")
    # estimators_[idx] is the binary estimator fitted for this class; with a
    # linear kernel its coef_ has a single row of per-feature weights.
    class_coefficients = clf.estimators_[idx].coef_[0]

    # Rank features by absolute weight so strongly negative indicators are
    # listed alongside strongly positive ones.
    sorted_features = sorted(
        zip(feature_names, class_coefficients),
        key=lambda x: abs(x[1]),
        reverse=True,
    )
    most_important_features = sorted_features[:top_n]
    for feature, importance in most_important_features:
        print(f"{feature}: {importance}")


(9,)
==== action ====
kill: 2.7651499564530493
fight: 2.1511714932782646
cia: 2.108259557249397
prakash: 1.9925356557862446
cop: 1.9166866942305292
polic offic: 1.9040456304253066
polic: 1.8418609244772561
drug: 1.7664959450512412
gabe: 1.5916543045801033
minist: 1.576811516629089
==== animation ====
anim: 2.2729028330494496
bug: 2.24534544053614
kill: -1.8590274013296755
cat: 1.8165281482958773
daffi: 1.7007480302539295
magic: 1.613071075275546
pooh: 1.6050961920142817
donald: 1.483892359909592
human: 1.42473190075629
jerri: 1.3606338219374896
==== comedy ====
kill: -2.7830302324655958
rival: 2.37584194349544
barri: 2.01102648942123
riley: 1.857233055300335
murder: -1.8422187360248305
shop: 1.7567490416043114
arriv: 1.664964852351757
hometown: 1.6341815027575026
party: 1.6175988728914121
win: 1.5791803189602946
==== crime ====
crimin: 2.525988326846936
suspect: 1.9196826347308786
murder: 1.780019110140009
nightclub: 1.7398407115517174
robbery: 1.532851733743666
chan: 1.316943396582603

## K-Fold Cross-Validation on the Training Data

In [27]:
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Shuffled k-fold splitter with a fixed seed so the folds are reproducible.
k = 5
random_state = 42
kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

# Plain accuracy plus weighted-average precision / recall / F1.
scoring = {'accuracy': make_scorer(accuracy_score)}
for metric_name, metric_fn in (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
):
    scoring[metric_name] = make_scorer(metric_fn, average='weighted', zero_division=1)

# Cross-validate the classifier on the full feature matrix and genre column.
# NOTE(review): X comes from a TF-IDF vectorizer fitted on all rows, so each
# fold's features were built with knowledge of its validation rows; wrapping
# vectorizer and classifier in a Pipeline would avoid that. Confirm intent.
results = cross_validate(clf, X, df['genre'], cv=kf, scoring=scoring, return_train_score=True)

# Report mean and standard deviation across folds for every metric.
for title, key in (
    ("Accuracy", "test_accuracy"),
    ("Precision", "test_precision"),
    ("Recall", "test_recall"),
    ("F1-Score", "test_f1"),
):
    fold_scores = results[key]
    print(f"{title}: {fold_scores.mean()} ± {fold_scores.std()}")

Accuracy: 0.61 ± 0.011575836902790236
Precision: 0.6048592632513433 ± 0.013961732066700988
Recall: 0.61 ± 0.011575836902790236
F1-Score: 0.6041980413230765 ± 0.014128737557797479


## Predict on the Unlabeled Test Set and Save Results

In [28]:
# 7. Load test data (without genre labels)
# Use a context manager so the input handle is closed once the lines are read.
with open("test_no_labels.txt", "r", encoding="utf-8") as test_file:
    test_data = test_file.readlines()

# Preprocess the test data with the same stemming step as the training data.
# NOTE(review): each full line is used as the text. If this file has
# tab-separated columns like the training file, confirm whether only the
# plot column should be fed to the model.
x_test_no_labels_processed = [processSentence(x) for x in test_data]

# Transform (not re-fit) with the vectorizer fitted on the training plots so
# the feature columns line up with what the classifier was trained on.
X_test_no_labels = tfidf_vectorizer.transform(x_test_no_labels_processed).toarray()

# 8. Predict the genres for the test data
y_pred_test_no_labels = clf.predict(X_test_no_labels)

# 9. Save the predictions to results.txt, one genre per line in input order
with open("results.txt", "w", encoding="utf-8") as results_file:
    for prediction in y_pred_test_no_labels:
        results_file.write(prediction + "\n")