
## 1. Imports

In [77]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import classification_report

## 2. Dataset load

In [106]:
# Read the tab-separated training file. The context manager closes the
# handle as soon as the lines are loaded instead of leaving it open for
# the lifetime of the kernel.
with open("train.txt", "r", encoding="utf-8") as file:
    data = file.readlines()

# One list of tab-delimited fields per input line.
data = [i.split("\t") for i in data]

# Keep the fields at positions 2 and 4 (i.e. the 3rd and 5th columns),
# which are loaded as the genre label and the plot text respectively.
df = [[row[2], row[4]] for row in data]
df = pd.DataFrame(df, columns = ['genre', 'plot'])
print(df.head())


     genre                                               plot
0  romance  Sekhar (Tarun) is a graduate from IIM and work...
1   horror  Kris Fowles (Katie Cassidy) goes to the Spring...
2   horror  Cynthia is traumatized by the death of her bab...
3    crime  Four friends, Gangu (Jackie Shroff), Abdul (Na...
4    drama  Crisis in a middle-class family when the son f...


## 3. Pre-processing

In [107]:
porter_stemmer=PorterStemmer()
def processSentence(s):
    """Stem every whitespace-separated token of ``s`` with the Porter stemmer.

    Args:
        s: Text to process (a string).

    Returns:
        The stemmed tokens joined by single spaces. Empty or
        whitespace-only input gives an empty string.
    """
    # str.split() with no separator ignores leading/trailing whitespace, so
    # a trailing newline no longer produces an empty token (and therefore a
    # dangling space at the end of the joined result).
    stemmed_words=[porter_stemmer.stem(word=w) for w in s.split()]
    return ' '.join(stemmed_words)

# Stem every plot text up front; the result is a plain list of strings
# in the same order as the rows of df.
x_processed = list(map(processSentence, df["plot"]))

# Sorted, de-duplicated genre names as a regular Python list.
genre_column = df["genre"]
labels = np.unique(genre_column).tolist()

# Quick sanity check: the label set and the first processed plot.
print(labels)
print(x_processed[0])


['action', 'animation', 'comedy', 'crime', 'drama', 'horror', 'romance', 'sci-fi', 'western']
sekhar (tarun) is a graduat from iim and work as a busi manag in a firm in hyderabad. he meet amar varma (siva balaji) in a busi parti who run a compani name varma industri in germany. when the parti is over, sekhar accident kill amar with hi car. sekhar' friend sunil (sunil) save him by eras the evidence, but sekhar feel guilti even after six months. he goe to germani along with sunil to visit amar' family. they find that their busi is in a bad condit and want to help them by reviv it with hi busi skills. priya (shriya), who wa engag to amar earlier is tri her level best to run the business, but helpless to run it smoothly. sekhar introduc himself to priya and offer hi help to reviv the business. the remain stori is about how sekhar help the busi grow and win priya' heart in that process. 


## 4. Create Vectors

In [108]:
# TF-IDF over 1- to 3-grams, with a small hand-picked stop-word list and
# a minimum document-frequency threshold of 0.01.
# NOTE(review): the vectorizer is fitted on every row before the
# train/test split below, so test rows influence the vocabulary and the
# IDF weights -- confirm whether that is intended.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=["the", "is", "and"],
    ngram_range=(1, 3),
    min_df=0.01,
)

# fit_transform's result is converted to a dense array via .toarray().
tfidf_sparse = tfidf_vectorizer.fit_transform(x_processed)
X = tfidf_sparse.toarray()


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## 5. Train/test split


In [109]:
# Split row positions (not X itself) so the same positions can be used to
# slice the feature matrix afterwards; the labels are split alongside.
indices = range(df.shape[0])
train_indices, test_test, y_train, y_test = train_test_split(
    indices,
    df["genre"],
    test_size=0.2,
    random_state=42,
)

# `test_test` holds the row positions of the held-out 20%.
X_train = X[train_indices]
X_test = X[test_test]

## 6. Train with classic classifiers

In [None]:
print("==== SVM ====")
# Support vector classifier with a linear kernel; other kernel options are
# 'poly', 'rbf', 'sigmoid' and 'precomputed'.
clf = svm.SVC(kernel="linear")

print("Resultados com tfidf")
# fit() returns the fitted estimator, so training and prediction chain.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-genre report restricted to `labels`; zero_division=1.0 is passed
# through unchanged.
svm_report = classification_report(y_test, y_pred, labels=labels, zero_division=1.0)
print(svm_report)

==== SVM ====
Resultados com tfidf
