In [11]:
# Load data
import csv
import pandas as pd
from scipy.constants import yocto

data = pd.read_csv('data/Movie_Overview_Classification.csv',
                   encoding='utf-8', 
                   delimiter=',',
                   quotechar='|',
                   quoting=csv.QUOTE_MINIMAL)
X = data['overview']
y = data['genre_Drama']

data.head()

Unnamed: 0,id,overview,genre_Drama
0,1,"When Lou, who has become the ""father of the In...",0
1,2,Mia Thermopolis is now a college graduate and ...,1
2,3,"Under the direction of a ruthless instructor, ...",1
3,4,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,1
4,5,Marine Boy is the story of a former national s...,0


In [12]:
# Perform pre-processing, e.g., replacing missing values if existing
data = data.fillna('')
data.head()

Unnamed: 0,id,overview,genre_Drama
0,1,"When Lou, who has become the ""father of the In...",0
1,2,Mia Thermopolis is now a college graduate and ...,1
2,3,"Under the direction of a ruthless instructor, ...",1
3,4,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,1
4,5,Marine Boy is the story of a former national s...,0


### Implement a classifier

In [13]:
# Tokenize text with CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X)
X_train_counts.shape

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [5]:
# Transform the count matrix to a normalized term-frequency or term-frequency times inverse document-frequency representation 
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(3000, 17436)

In [7]:
# 
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tf, y)



In [8]:
# Build a pipeline
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [9]:
# Train the classifier
text_clf = text_clf.fit(X, y)

In [10]:
# Measure accuracy using cross-validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(text_clf, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.66 (+/- 0.02)
