
## 1. Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import classification_report

## 2. Dataset load

In [2]:
# Training file to load. Alternatives kept for quick experiments:
#   "train_subset_1000.txt", "train_subset_2500.txt", "train.txt"
FILE_NAME = "train_subset_5000.txt"

# Read inside a context manager so the handle is always closed
# (previously the file object was opened and never closed).
with open(FILE_NAME, "r", encoding="utf-8") as file:
    data = file.readlines()

# Each non-blank line is one tab-separated record. Blank lines are skipped
# because they would otherwise fail with IndexError on the column lookup below.
data = [i.split("\t") for i in data if i.strip()]

# Keep the fields at index 2 (genre) and index 4 (plot) of every record.
df = [[row[2], row[4]] for row in data]
df = pd.DataFrame(df, columns = ['genre', 'plot'])
print(df.head())


     genre                                               plot
0  romance  Sekhar (Tarun) is a graduate from IIM and work...
1   horror  Kris Fowles (Katie Cassidy) goes to the Spring...
2   horror  Cynthia is traumatized by the death of her bab...
3    crime  Four friends, Gangu (Jackie Shroff), Abdul (Na...
4    drama  Crisis in a middle-class family when the son f...


## 3. Pre-processing

In [3]:
# Single stemmer instance shared by every call to processSentence.
porter_stemmer=PorterStemmer()
def processSentence(s):
    """Return ``s`` with each whitespace-delimited token Porter-stemmed.

    The string is split on runs of whitespace, every resulting piece is
    passed through ``porter_stemmer.stem``, and the stemmed pieces are
    joined back together with single spaces.
    """
    stemmed_tokens = []
    for token in re.split("\\s+", s):
        stemmed_tokens.append(porter_stemmer.stem(word=token))
    return ' '.join(stemmed_tokens)

# Stem every plot text; order matches the rows of df.
x_processed = list(map(processSentence, df["plot"]))

# Sorted list of the distinct genre names, used later as the report label order.
unique_genres = np.unique(df["genre"])
labels = unique_genres.tolist()

print(labels)


['action', 'animation', 'comedy', 'crime', 'drama', 'horror', 'romance', 'sci-fi', 'western']


## 4. Create Vectors

In [4]:
# Both vectorizers are configured identically; only the weighting differs
# (TF-IDF weights vs. raw term counts).
vectorizer_params = dict(ngram_range=(1,3), stop_words='english', min_df=0.001, max_features=2500)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_params)
count_vectorizer = CountVectorizer(**vectorizer_params)

# NOTE(review): both vectorizers are fit on every row of x_processed, i.e.
# before the train/test split, so held-out rows contribute to the vocabulary
# and IDF statistics. Consider fitting on the training rows only.
tfidf_sparse = tfidf_vectorizer.fit_transform(x_processed)
count_sparse = count_vectorizer.fit_transform(x_processed)

# Dense copies of the feature matrices: X = TF-IDF, Z = counts.
X = tfidf_sparse.toarray()
Z = count_sparse.toarray()

## 5. Train/test split


In [5]:
# Split row positions instead of the matrices themselves, so the exact same
# partition can be applied to both X (TF-IDF) and Z (counts).
indices = range(len(df))
print(indices)
train_indices, test_test, y_train, y_test = train_test_split(
    indices,
    df["genre"],
    test_size=0.2,
    random_state=42,
)

# `test_test` holds the row positions of the held-out 20%.
X_train = X[train_indices]
X_test = X[test_test]
Z_train = Z[train_indices]
Z_test = Z[test_test]

range(0, 8041)


## 6. Train with classic classifiers

In [6]:
from sklearn.multiclass import OneVsRestClassifier

print("==== SVM ====")
# One linear-kernel SVC is trained per genre (one-vs-rest).
# SVC also accepts kernel='poly', 'rbf', 'sigmoid' or 'precomputed'; the plain
# multiclass svm.SVC(kernel='linear') can be swapped in for comparison.
base_svc = svm.SVC(kernel='linear')
clf = OneVsRestClassifier(base_svc)

print("Resultados com tfidf")
# Fit on the TF-IDF training rows, then score the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
svm_report = classification_report(y_test, y_pred, labels=labels, zero_division=1.)
print(svm_report)


==== SVM ====
Resultados com tfidf
              precision    recall  f1-score   support

      action       0.58      0.59      0.59       202
   animation       0.81      0.68      0.74       115
      comedy       0.49      0.48      0.49       223
       crime       0.59      0.38      0.46       108
       drama       0.51      0.56      0.53       319
      horror       0.73      0.83      0.78       223
     romance       0.59      0.52      0.55       186
      sci-fi       0.50      0.44      0.47        39
     western       0.86      0.93      0.89       194

    accuracy                           0.62      1609
   macro avg       0.63      0.60      0.61      1609
weighted avg       0.62      0.62      0.62      1609



In [7]:
# Print available classes
print(clf.classes_.shape)

# Loop-invariant values, computed once instead of once per class.
feature_names = tfidf_vectorizer.get_feature_names_out()
top_n = 10

# For each genre, list the top_n features with the largest absolute
# coefficient in that genre's one-vs-rest linear SVC. The sign is kept in the
# printout: positive values push towards the genre, negative values away.
for idx, class_name in enumerate(clf.classes_):
    print(f"==== {class_name} ====")
    # The estimator at position idx is paired with clf.classes_[idx];
    # coef_[0] is the single weight row of that binary classifier.
    class_coefficients = clf.estimators_[idx].coef_[0]

    feature_importance = list(zip(feature_names, class_coefficients))
    sorted_features = sorted(feature_importance, key=lambda x: abs(x[1]), reverse=True)
    most_important_features = sorted_features[:top_n]
    for feature, importance in most_important_features:
        print(f"{feature}: {importance}")


(9,)
==== action ====
cia: 2.8021754711165308
kill: 2.7102591602766744
fight: 2.3630672845830403
cop: 2.1778162361073443
kidnap: 2.164113228953754
reveng: 2.087059808568574
terrorist: 2.0541809845544807
elimin: 1.9373311329906275
polic: 1.8995863062180611
prakash: 1.8797506685098555
==== animation ====
bug: 2.831226288902737
anim: 2.186765167236875
kill: -2.1823134799872004
daffi: 2.0740935524103072
cat: 1.989470935205464
conan: 1.9712058244142328
king: 1.5491512073148843
donald: 1.5390652663764965
herd: 1.4688285668642365
jerri: 1.4207498202024973
==== comedy ====
kill: -2.827987741312822
cloth: 2.1059708661407024
harold: 2.0730965165455864
party: 2.0519030203664275
policeman: 1.9457971113241168
riley: 1.831411662357698
murder: -1.7783065874608641
death: -1.7604837323599403
drunk: 1.7504530259616216
dead: -1.7233867918783514
==== crime ====
crimin: 2.235474723076241
detect: 2.1241909298015016
suspect: 1.953302705416882
murder: 1.8756132844646185
dead: 1.8500345610594269
polic: 1.77946

## 7. Perform K-Fold Cross-Validation on Train Set

In [None]:
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Initialize k-Fold Cross-Validation
k = 5  # Set number of folds
random_state = 42
kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

# Metrics to compute on every fold. Precision/recall/F1 are weighted by class
# support; zero_division=1 scores a class with no predictions as 1 instead of
# emitting a warning.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted', zero_division=1),
    'recall': make_scorer(recall_score, average='weighted', zero_division=1),
    'f1': make_scorer(f1_score, average='weighted', zero_division=1)
}

# Cross-validate on the training split only. Previously the full X / df['genre']
# was passed, so the held-out test rows took part in model selection and the
# test report was no longer an independent estimate.
results = cross_validate(clf, X_train, y_train, cv=kf, scoring=scoring, return_train_score=True)

# Display mean ± standard deviation across folds for each metric
for title, metric in (
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-Score", "f1"),
):
    fold_scores = results[f"test_{metric}"]
    print(f"{title}: {fold_scores.mean()} ± {fold_scores.std()}")

## 8. Predict on Unlabeled Test Set and Save Results

In [28]:
# Load test data (without genre labels). The context manager closes the file
# as soon as the lines are read (previously the handle was never closed).
with open("test_no_labels.txt", "r", encoding="utf-8") as test_file:
    test_data = test_file.readlines()

# Preprocess the test data with the same stemming used for training.
# NOTE(review): each raw line is stemmed whole, whereas training used only the
# field at index 4 of each tab-separated record. If this file is also
# tab-separated, the non-plot fields are being vectorized too; confirm format.
x_test_no_labels_processed = [processSentence(x) for x in test_data]

# Transform the test data into TF-IDF vectors using the already-fitted
# vectorizer (transform only, no refit).
X_test_no_labels = tfidf_vectorizer.transform(x_test_no_labels_processed).toarray()

# Predict the genres for the test data
y_pred_test_no_labels = clf.predict(X_test_no_labels)

# Save the predictions to a file, one genre per line in input order
with open("results.txt", "w", encoding="utf-8") as results_file:
    for prediction in y_pred_test_no_labels:
        results_file.write(prediction + "\n")