In [1]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer

In [3]:
test_csv = pd.read_csv('test_data.csv') 
train_csv = pd.read_csv('train_data.csv')

In [4]:
test_csv.head()

Unnamed: 0,0,1
0,I did not enjoy the film Eraser whatsoever. It...,0
1,Be very afraid of anyone who likes this film. ...,0
2,The 3rd and last big screen spin off from the ...,0
3,Barely three and a half years after just scrap...,1
4,I'm a big fan of the demonic puppets. Looking ...,0


In [8]:
lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
train_X_non = train_csv['0']   # '0' refers to the review text
train_y = train_csv['1']       # '1' corresponds to Label (1 - positive and 0 - negative)
test_X_non = test_csv['0']
test_y = test_csv['1']

train_X=[]
test_X=[]

In [12]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...


True

In [13]:
#text pre processing for training set

for i in range(0, len(train_X_non)):
    review = re.sub('[^a-zA-Z]', ' ', train_X_non[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    train_X.append(review)

In [14]:
#text pre processing for test set

for i in range(0, len(test_X_non)):
    review = re.sub('[^a-zA-Z]', ' ', test_X_non[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    test_X.append(review)

In [15]:
train_X[10]

'eddie murphy delirious far funniest thing ever see life compare movie garuntee decide delirious funniest movie ever movie hr min throughout time barely moment laughing laugh hour replaying punch line head eddie murphy given many funny performance career hr trading place beverly hill cop raw coming america nutty professor shrek etc far hilarious moment seen movie many time funnier every time never loses edge day forward every great stand performance emulated delirious two thumb'

In [16]:
# TF-IDF vectorization timeeee
tf_idf = TfidfVectorizer()
X_train_tf = tf_idf.fit_transform(train_X)

n_samples: 25000, n_features: 65498


In [18]:
X_train_tf = tf_idf.transform(train_X)
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 25000, n_features: 65498


In [19]:
X_test_tf = tf_idf.transform(test_X)
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 25000, n_features: 65498


# Model Creation

In [20]:
#naive bayes classifier

naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)

In [21]:
#predicted y

y_pred = naive_bayes_classifier.predict(X_test_tf)

In [22]:
print(metrics.classification_report(test_y, y_pred,
                                            target_names=['Positive', 'Negative']))

              precision    recall  f1-score   support

    Positive       0.80      0.88      0.84     12500
    Negative       0.87      0.78      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



In [23]:
print("Confusion matrix:")
print(metrics.confusion_matrix(test_y, y_pred))

Confusion matrix:
[[10990  1510]
 [ 2693  9807]]


# Doing a Test Predicton

In [52]:
test=["This is quite possible one of the lengthiest short films I have ever seen, and I don't mean that in a good way. Hefty's initial flaw is its abysmal script, it's clear that the writer is an amateur at best. The movie goes on and on with no seeming end in mind. The ending is boring and a let down. The thing that grinds my gears the most about this movie is that it had no MaGuffin. I was waiting for it the whole time, and it never came. I'm really glad that this movie could not be released because of the copyrighted eagles music,Thank god for Glen Fry because without him the world would be in a pain on parallel to the Holocaust."]

In [53]:
review = re.sub('[^a-zA-Z]', ' ', test[0])
review = review.lower()
review = review.split()
review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
test_processed =[ ' '.join(review)]

In [54]:
test_processed

['quite possible one lengthiest short film ever seen mean good way hefty initial flaw abysmal script clear writer amateur best movie go seeming end mind ending boring let thing grind gear movie maguffin waiting whole time never came really glad movie could released copyrighted eagle music thank god glen fry without world would pain parallel holocaust']

In [55]:
test_input = tf_idf.transform(test_processed)
test_input.shape

(1, 65498)

In [56]:
#0= bad review
#1= good review

res=naive_bayes_classifier.predict(test_input)[0]

if res==1:
    print("Good Review")
    
elif res==0:
    print("Bad Review")

Bad Review


# Using Logistic regression instead of Naive Bayes 

In [57]:
from sklearn.linear_model import LogisticRegression

In [59]:
LR_classifier = LogisticRegression()
LR_classifier.fit(X_train_tf, train_y)

In [61]:
y_pred = LR_classifier.predict(X_test_tf)

In [62]:
print(metrics.classification_report(test_y, y_pred,
                                            target_names=['Positive', 'Negative']))

              precision    recall  f1-score   support

    Positive       0.88      0.88      0.88     12500
    Negative       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [63]:
print("Confusion matrix:")
print(metrics.confusion_matrix(test_y, y_pred))

Confusion matrix:
[[10998  1502]
 [ 1514 10986]]


In [83]:
test=["John Prusak is a great filmmaker, but more importantly, he's a great film _teacher_. His students have gone on to do big stuff in Hollywood (like Doug Chiang, at ILM). This was a side project, and it gave John the chance to showcase a classic piece of Detroit's history. It's an entertaining short film, and talks about what success is really all about."]
review = re.sub('[^a-zA-Z]', ' ', test[0])
review = review.lower()
review = review.split()
review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
test_processed =[ ' '.join(review)]

In [84]:
test_input = tf_idf.transform(test_processed)
test_input.shape

(1, 65498)

In [85]:
#0= bad review
#1= good review

res=LR_classifier.predict(test_input)[0]

if res==1:
    print("Good Review")
    
elif res==0:
    print("Bad Review")

Good Review
