In [38]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, ComplementNB 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords

In [39]:
movie = pd.read_csv('imdb_labelled.txt', delimiter = '\t', sep = ',' )
movie.columns=['review', 'rating']
movie.head()

Unnamed: 0,review,rating
0,Not sure who was more lost - the flat characte...,0
1,Attempting artiness with black & white and cle...,0
2,Very little music or anything to speak of.,0
3,The best scene in the movie was when Gerardo i...,1
4,"The rest of the movie lacks art, charm, meanin...",0


In [40]:
movie.columns

Index(['review', 'rating'], dtype='object')

In [41]:
def get_label(movie):
    if movie['rating'] == 0:
        return 'Negative'
    else:
        return 'Positive'

In [42]:
movie['label'] = movie.apply(get_label, axis =1)
movie.head()

Unnamed: 0,review,rating,label
0,Not sure who was more lost - the flat characte...,0,Negative
1,Attempting artiness with black & white and cle...,0,Negative
2,Very little music or anything to speak of.,0,Negative
3,The best scene in the movie was when Gerardo i...,1,Positive
4,"The rest of the movie lacks art, charm, meanin...",0,Negative


In [43]:
X = movie['review']

In [44]:
y = movie['rating']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 60)

In [46]:
from sklearn.naive_bayes import BernoulliNB
model_creation2 = Pipeline([
                         ('bow', CountVectorizer(stop_words='english')),
                         ('Tfidf', TfidfTransformer()),
                         ('model', BernoulliNB())
                         ])

In [47]:
scores = cross_val_score(model_creation2, X, y, cv = 10)
scores


array([0.77333333, 0.82666667, 0.68      , 0.82666667, 0.72      ,
       0.76      , 0.73333333, 0.64864865, 0.64864865, 0.7972973 ])

In [48]:
scores.mean(), scores.std()

(0.7414594594594596, 0.06366285936372215)

In [49]:
model2 = model_creation2.fit(X_train, y_train)

In [50]:
predict2 = model2.predict(X_test)

In [51]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predict2))

              precision    recall  f1-score   support

           0       0.84      0.43      0.56       143
           1       0.64      0.92      0.75       156

    accuracy                           0.69       299
   macro avg       0.74      0.67      0.66       299
weighted avg       0.73      0.69      0.66       299



In [52]:
from sklearn.naive_bayes import ComplementNB
model_creation1 = Pipeline([
                         ('bow', CountVectorizer(stop_words='english')),
                         ('Tfidf', TfidfTransformer()),
                         ('model', ComplementNB())
                         ])


In [53]:
scores = cross_val_score(model_creation1, X, y, cv = 10)
scores

array([0.78666667, 0.84      , 0.66666667, 0.8       , 0.76      ,
       0.72      , 0.77333333, 0.68918919, 0.68918919, 0.83783784])

In [54]:
scores.mean(), scores.std()

(0.7562882882882882, 0.059286847459052716)

In [55]:
model1 = model_creation1.fit(X_train, y_train)

In [56]:
from sklearn.metrics import classification_report
clf = GridSearchCV(model_creation1, tuned_parameters, cv=10, scoring=model1)

In [57]:
predict1 = model1.predict(X_test)

In [58]:
print(classification_report(y_test, predict1))

              precision    recall  f1-score   support

           0       0.73      0.80      0.76       143
           1       0.80      0.72      0.76       156

    accuracy                           0.76       299
   macro avg       0.76      0.76      0.76       299
weighted avg       0.77      0.76      0.76       299



In [59]:
model_creation = Pipeline([
                      ('bow', CountVectorizer(stop_words='english')),
                      ('Tfidf', TfidfTransformer()),
                      ('model', MultinomialNB())
                      ])
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [60]:
scores = cross_val_score(model_creation, X, y, cv = 10)
scores

array([0.76      , 0.82666667, 0.66666667, 0.81333333, 0.76      ,
       0.74666667, 0.76      , 0.72972973, 0.68918919, 0.86486486])

In [61]:
scores.mean(), scores.std()

(0.7617117117117117, 0.05751137129384389)

In [62]:
model = model_creation.fit(X_train, y_train)  

In [30]:
from sklearn.metrics import classification_report
clf = GridSearchCV(model_creation, tuned_parameters, cv=10, scoring=model)

In [31]:
predict = model.predict(X_test)

In [32]:
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       0.75      0.72      0.73       143
           1       0.75      0.78      0.76       156

    accuracy                           0.75       299
   macro avg       0.75      0.75      0.75       299
weighted avg       0.75      0.75      0.75       299

