# Classification

## SVM

In [15]:
import pandas as pd

# Read the movie dataset (the first CSV column becomes the index) and drop
# every row that has no plot text, since the plot is what gets vectorized.
df = (
    pd.read_csv('movie_dataset_classification.csv', index_col=0)
    .dropna(subset=['Plot'])
)
# Show the last rows as a quick sanity check of the loaded frame.
df.tail()

Unnamed: 0,Release Year,Title,Director,Cast,Plot,Genre
40117,,Ready Player One,,[],Wade Watts is a teenager who lives in a slum w...,sport
40118,2011.0,From the Rough,Pierre Bagley,"['Taraji P. Henson', 'Tom Felton', 'Michael Cl...",Catana Starks is the coach for the women's swi...,sport
40119,,Brink!,Greg Beeman,"['Erik von Detten', 'Christina Vidal', 'Robin ...","Andy ""Brink"" Brinker and his in-line skating c...",sport
40120,2003.0,Open Water (film),Chris Kentis,"['Blanchard Ryan', 'Daniel Travis']",Daniel Kintner and Susan Watkins are frustrate...,sport
40121,2013.0,All Cheerleaders Die,Lucky McKeeChris Sivertson,"['Caitlin Stasey', 'Sianoa Smit-McPhee', 'Broo...",The film opens with Mäddy Killian (Caitlin Sta...,sport


In [16]:
from sklearn.model_selection import train_test_split

# Plot text is the model input; the genre label is the prediction target.
X = df['Plot'].values
y = df['Genre'].values

# Fixed seed: the split must be identical on every run. The tokenized feature
# matrices are cached to disk and reloaded in later cells; with an unseeded
# split, reloaded features would no longer line up with freshly drawn labels.
RANDOM_STATE = 42

# 80/20 split, stratified so both parts keep the genre distribution of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [17]:
# Sizes of (X_train, y_train, X_test, y_test); inputs and labels of each
# split are expected to have the same length.
tuple(len(part) for part in (X_train, y_train, X_test, y_test))

(31570, 31570, 7893, 7893)

In [18]:
import nltk
import spacy

# Load spaCy's small English pipeline once; spacy_nlp_tokenizer reuses this
# single instance for every document instead of reloading the model.
nlp = spacy.load('en_core_web_sm')

# Number of documents processed so far; drives the progress printout.
doc_counter = 0


def reset_counter():
    """Set the global document counter back to zero."""
    global doc_counter
    doc_counter = 0


def increase_counter():
    """Count one more processed document and report progress.

    The running total is printed each time it reaches a multiple of 100.
    """
    global doc_counter
    doc_counter = doc_counter + 1
    _, remainder = divmod(doc_counter, 100)
    if remainder == 0:
        print(doc_counter)

def spacy_nlp_tokenizer(text):
    """Convert one raw document into a flat list of prefixed feature strings.

    The returned list contains, in this order:
      * 'LEMMA_<lemma>' for every token that is not a stopword,
      * 'BI_<a>_<b>' for every pair of consecutive lemma features,
      * 'TRI_<a>_<b>_<c>' for every triple of consecutive lemma features,
      * 'NER_<type>' for every token that carries an entity type.

    Each call also advances the global progress counter by one document.
    """
    increase_counter()

    # spaCy does the main NLP work (lemmas, stopword flags, entity types).
    parsed = nlp(text)

    lemmas = []
    entity_types = []
    for tok in parsed:
        # Lemmatized tokens, skipping stopwords.
        if not tok.is_stop:
            lemmas.append('LEMMA_' + tok.lemma_)
        # Entity type of the token, only when spaCy assigned one.
        if tok.ent_type_:
            entity_types.append('NER_' + tok.ent_type_)

    # If an entity linker is available, it can be used to put actual entities
    # in as features, e.g. Queen Elizabeth, Elizabeth II, Her Majesty -> KB2912
    # see https://spacy.io/usage/training#entity-linker
    # entities = ['ENT_'+token.ent_kb_id_ for token in doc if token.ent_kb_id_]

    # nltk.ngrams yields tuples of consecutive items; the n-grams are built
    # over the already prefixed lemma features.
    lemma_bigrams = ['BI_' + '_'.join(pair) for pair in nltk.ngrams(lemmas, 2)]
    lemma_trigrams = ['TRI_' + '_'.join(triple) for triple in nltk.ngrams(lemmas, 3)]

    return lemmas + lemma_bigrams + lemma_trigrams + entity_types

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# The callable analyzer replaces CountVectorizer's built-in tokenization with
# the spaCy-based feature extractor; min_df=5 drops features that occur in
# fewer than 5 training documents.
vect = CountVectorizer(analyzer=spacy_nlp_tokenizer, min_df=5)

# Restart the progress counter so the printed numbers count documents of this
# pass only, even when the cell is re-run after an interruption.
reset_counter()
X_train_tok = vect.fit_transform(X_train)

# Same for the test pass: progress starts again from zero. The vocabulary
# learned on the training data is reused (transform, not fit_transform).
reset_counter()
X_test_tok = vect.transform(X_test)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400


KeyboardInterrupt: 

In [None]:
# Shapes of the vectorized training and test matrices.
tuple(matrix.shape for matrix in (X_train_tok, X_test_tok))

In [None]:
#writing features to pickle
import pickle

# Persist both feature matrices, one pickle file each, so the slow
# tokenization step does not have to be repeated.
for pkl_path, matrix in (
    ('x_train_tok.pkl', X_train_tok),
    ('x_test_tok.pkl', X_test_tok),
):
    with open(pkl_path, 'wb') as outfile:
        pickle.dump(matrix, outfile)

In [None]:
#reading from pickle
import pickle

# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# files that were written by this notebook itself.
# The cached matrices only match y_train / y_test if they were produced from
# the same train/test split that is currently in memory.

# Context managers close the files even if unpickling raises.
with open('x_train_tok.pkl', 'rb') as infile:
    X_train_tok = pickle.load(infile)

with open('x_test_tok.pkl', 'rb') as infile:
    X_test_tok = pickle.load(infile)

In [None]:
# Shapes of the feature matrices after reloading them from disk.
tuple(matrix.shape for matrix in (X_train_tok, X_test_tok))

In [None]:
from sklearn.pipeline import Pipeline
# TfidfTransformer is used in the pipeline below; without this import the
# cell fails with a NameError on a fresh kernel.
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: number of features kept by the chi2 selection step
# ('sel') and the regularization strength C of the linear SVM ('learner').
param_grid = [
    {'sel__k': [1000, 2000, 5000, 10000], 'learner__C': [0.01, 0.1, 1, 10]},
]

opt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])

# Number of grid-search fits that run in parallel.
n_jobs = 2

# 5-fold cross-validated search over every (k, C) combination, fitted on the
# vectorized training data only.
opt_search = GridSearchCV(
    opt_pipeline, param_grid, cv=5, n_jobs=n_jobs, verbose=3
).fit(X_train_tok, y_train)

In [None]:
# Predict the test genres with the best pipeline found by the grid search.
opt_predictions = opt_search.best_estimator_.predict(X_test_tok)

# Accuracy: share of test documents whose predicted genre equals the true one.
correct = sum(
    1
    for prediction, true_label in zip(opt_predictions, y_test)
    if prediction == true_label
)
print("Accuracy:\t", correct/len(opt_predictions))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Per-class precision / recall / F1 on the held-out test set.
report_text = classification_report(y_test, opt_predictions)
print('Classification report:', report_text, sep='\n')

# Confusion matrix of true labels (y_test) against predicted labels.
cm = confusion_matrix(y_test, opt_predictions)
print('Confusion matrix:', cm, sep='\n')