
## 1. Imports

In [57]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import classification_report

## 2. Dataset load

In [74]:
file = open("train.txt", "r", encoding="utf-8")
data = file.readlines()

data = [i.split("\t") for i in data]

df = [[row[2], row[4]] for row in data]  # Extract 2nd and 4th elements
df = pd.DataFrame(df, columns = ['genre', 'plot'])
print(df.head())


     genre                                               plot
0  romance  Sekhar (Tarun) is a graduate from IIM and work...
1   horror  Kris Fowles (Katie Cassidy) goes to the Spring...
2   horror  Cynthia is traumatized by the death of her bab...
3    crime  Four friends, Gangu (Jackie Shroff), Abdul (Na...
4    drama  Crisis in a middle-class family when the son f...


## 3. Pre-processing

In [75]:
porter_stemmer=PorterStemmer()
def processSentence(s):
    words=re.split("\\s+",s)
    stemmed_words=[porter_stemmer.stem(word=w) for w in words]
    return ' '.join(stemmed_words)

x_processed = [processSentence(x) for x in df["plot"]]

labels = np.unique(df["genre"]).tolist()

print(labels)


['action', 'animation', 'comedy', 'crime', 'drama', 'horror', 'romance', 'sci-fi', 'western']


## 4. Create Vectors

In [86]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3), stop_words='english',min_df=0.001,max_features=2500)
count_vectorizer = CountVectorizer(ngram_range=(1,3), stop_words = 'english' ,min_df=0.001,max_features=2500)

X = tfidf_vectorizer.fit_transform(x_processed).toarray()
Z = count_vectorizer.fit_transform(x_processed).toarray()


## Train/test split


In [93]:
indices = range(len(df))
print(indices)
train_indices, test_test, y_train, y_test = train_test_split(indices, df["genre"], test_size=0.2, random_state=42)

X_train,X_test = X[train_indices],X[test_test]
Z_train,Z_test = Z[train_indices],Z[test_test]

range(0, 8041)


## Train with classic classifiers

In [132]:
from sklearn.multiclass import OneVsRestClassifier


print("==== SVM ====")
clf = OneVsRestClassifier(svm.SVC(kernel='linear')) # kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}
#clf = svm.SVC(kernel='linear') # kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}

print("Resultados com tfidf")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_pred = y_pred, y_true = y_test, labels=labels, zero_division=1.))

print("Resultados com CountVectorizer")




==== SVM ====
Resultados com tfidf
              precision    recall  f1-score   support

      action       0.58      0.59      0.59       202
   animation       0.81      0.68      0.74       115
      comedy       0.49      0.48      0.49       223
       crime       0.59      0.38      0.46       108
       drama       0.51      0.56      0.53       319
      horror       0.73      0.83      0.78       223
     romance       0.59      0.52      0.55       186
      sci-fi       0.50      0.44      0.47        39
     western       0.86      0.93      0.89       194

    accuracy                           0.62      1609
   macro avg       0.63      0.60      0.61      1609
weighted avg       0.62      0.62      0.62      1609

Resultados com CountVectorizer


In [137]:
# Print available classes

print(clf.classes_.shape)
    
for idx, class_names in enumerate(clf.classes_):
    print(f"==== {class_names} ====")
    class_coefficients = clf.estimators_[idx].coef_[0]
    
    feature_importance = list(zip(tfidf_vectorizer.get_feature_names_out(), class_coefficients))
    sorted_features = sorted(feature_importance, key=lambda x: abs(x[1]), reverse=True)
    top_n = 10
    most_important_features = sorted_features[:top_n]
    for feature, importance in most_important_features:
        print(f"{feature}: {importance}")


(9,)
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Utilizador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Utilizador\AppData\Local\Temp\ipykernel_20232\1469972885.py", line 4, in <module>
    print(clf.estimators_[0].coef_[1])
          ~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: index 1 is out of bounds for axis 0 with size 1

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Utilizador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\IPython\core\interactiveshell.py", line 2120, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Utilizador\AppData\Local\P