## Importing Libraries

In [448]:
import pandas as pd
import numpy as np
import glob, os, string, re, spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.tree import export_graphviz

## Import Datasets

In [364]:
def read_reviews(paths):
    """Read each review file in ``paths`` and return the cleaned texts.

    Parameters
    ----------
    paths : iterable of path strings
        Files to read, decoded as UTF-8.

    Returns
    -------
    list
        One string per file, in the order given, with every ``<...>`` tag
        replaced by a single space.
    """
    # Compiled once instead of once per file; non-greedy so each tag is
    # matched separately.
    tag_pattern = re.compile('<.*?>')
    reviews = []
    for path in paths:
        # The context manager closes the handle even if reading fails.
        # read() is used rather than readline() so a review that spans
        # several lines is not cut off after its first line.
        with open(path, "r", encoding="utf8") as handle:
            reviews.append(tag_pattern.sub(' ', handle.read()))
    return reviews


# glob returns paths in arbitrary, platform-dependent order; sorting keeps the
# row order (and therefore everything derived from it) reproducible.
train_pos_files = sorted(glob.glob("aclImdb/train/pos/*.txt"))
train_neg_files = sorted(glob.glob("aclImdb/train/neg/*.txt"))

train_pos_ls = read_reviews(train_pos_files)
train_neg_ls = read_reviews(train_neg_files)

In [365]:
# Column order of the combined frame (the original list misspelled 'review'
# and was never used).
labels = ['review', 'label']

# Positive reviews are labelled 1 and negative reviews -1.
df_train_pos = pd.DataFrame({'review': train_pos_ls}).assign(label=1)
df_train_neg = pd.DataFrame({'review': train_neg_ls}).assign(label=-1)

# ignore_index=True renumbers the rows 0..n-1; without it the index of the
# negative frame repeats the index values of the positive frame.
df_train = pd.concat([df_train_pos, df_train_neg], ignore_index=True)[labels]
df_train.head(10)

Unnamed: 0,review,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
5,"This isn't the comedic Robin Williams, nor is ...",1
6,Yes its an art... to successfully make a slow ...,1
7,"In this ""critically acclaimed psychological th...",1
8,THE NIGHT LISTENER (2006) **1/2 Robin Williams...,1
9,"You know, Robin Williams, God bless him, is co...",1


In [366]:
# glob returns paths in arbitrary, platform-dependent order; sorting keeps the
# row order reproducible.
test_pos_files = sorted(glob.glob("aclImdb/test/pos/*.txt"))
test_neg_files = sorted(glob.glob("aclImdb/test/neg/*.txt"))

# Compiled once for both loops; non-greedy so each <...> tag is matched
# separately and replaced by a single space.
html_tag = re.compile('<.*?>')

test_pos_ls = []
for path in test_pos_files:
    # The context manager closes the handle even if reading fails; read()
    # keeps the whole file instead of only its first line.
    with open(path, "r", encoding="utf8") as handle:
        test_pos_ls.append(html_tag.sub(' ', handle.read()))

test_neg_ls = []
for path in test_neg_files:
    with open(path, "r", encoding="utf8") as handle:
        test_neg_ls.append(html_tag.sub(' ', handle.read()))

In [367]:
# Column order of the combined frame (the original list misspelled 'review'
# and was never used).
labels = ['review', 'label']

# Positive reviews are labelled 1 and negative reviews -1.
df_test_pos = pd.DataFrame({'review': test_pos_ls}).assign(label=1)
df_test_neg = pd.DataFrame({'review': test_neg_ls}).assign(label=-1)

# ignore_index=True renumbers the rows 0..n-1; without it the index of the
# negative frame repeats the index values of the positive frame.
df_test = pd.concat([df_test_pos, df_test_neg], ignore_index=True)[labels]
df_test.head(10)

Unnamed: 0,review,label
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
5,"I saw this film on September 1st, 2005 in Indi...",1
6,"Maybe I'm reading into this too much, but I wo...",1
7,I felt this film did have many good qualities....,1
8,This movie is amazing because the fact that th...,1
9,"""Quitting"" may be as much about exiting a pre-...",1


In [368]:
# Define text pre-processing functions
lemma = WordNetLemmatizer()
stops = set(stopwords.words('english'))

# Maps the code point of every character in string.punctuation to None, so a
# single translate() call deletes them all.
punct_table = dict.fromkeys(map(ord, string.punctuation))


def text_prep(text):
    """Normalise one review for vectorisation.

    Steps, in order: delete every character found in ``string.punctuation``,
    lower-case, split on whitespace, drop tokens present in ``stops``, and
    lemmatise each remaining token as a verb (``pos='v'``).

    Parameters
    ----------
    text : string
        Raw review text.

    Returns
    -------
    string
        The surviving lemmas joined by single spaces.
    """
    # NOTE(review): punctuation is stripped before the stop-word check, so a
    # contraction such as "isn't" is compared as "isnt"; the sample output of
    # this notebook shows "isnt" surviving the filter.
    stripped = text.translate(punct_table)
    lemmas = [
        lemma.lemmatize(word, pos='v')
        for word in stripped.lower().split()
        if word not in stops
    ]
    return " ".join(lemmas)

## Data Preprocessing

In [369]:
# preprocess training data
# text_prep is handed to apply() directly, so it runs once per raw review.
df_train['prep_review'] = df_train['review'].apply(text_prep)
df_train.loc[:, ['prep_review', 'label']].head(10)

Unnamed: 0,prep_review,label
0,bromwell high cartoon comedy run time program ...,1
1,homelessness houselessness george carlin state...,1
2,brilliant overact lesley ann warren best drama...,1
3,easily underrate film inn brook cannon sure fl...,1
4,typical mel brook film much less slapstick mov...,1
5,isnt comedic robin williams quirkyinsane robin...,1
6,yes art successfully make slow pace thriller s...,1
7,critically acclaim psychological thriller base...,1
8,night listener 2006 12 robin williams toni col...,1
9,know robin williams god bless constantly shoot...,1


In [370]:
# preprocess testing data
# Same cleaning function as for the training reviews, applied per review.
df_test['prep_review'] = df_test['review'].apply(text_prep)
df_test.loc[:, ['prep_review', 'label']].head(10)

Unnamed: 0,prep_review,label
0,go saw movie last night coax friends mine ill ...,1
1,actor turn director bill paxton follow promise...,1
2,recreational golfer knowledge sport history pl...,1
3,saw film sneak preview delightful cinematograp...,1
4,bill paxton take true story 1913 us golf open ...,1
5,saw film september 1st 2005 indianapolis one j...,1
6,maybe im read much wonder much hand hongsheng ...,1
7,felt film many good qualities cinematography c...,1
8,movie amaze fact real people portray real life...,1
9,quit may much exit preordain identity drug wit...,1


In [461]:
# Vectorizing training data
# Targets come straight from the frame; features are TF-IDF weights whose
# vocabulary is learned from the cleaned training reviews.
y_train = df_train['label']
tfidf = TfidfVectorizer()
# tfidf = TfidfVectorizer(ngram_range = (1,3)) did not improve accuracy
x_train = tfidf.fit_transform(df_train['prep_review'])

In [462]:
# Vectorizing testing data
# transform() (not fit_transform) is used so the test reviews are encoded
# with the vectorizer already fitted on the training reviews.
y_test = df_test['label']
x_test = tfidf.transform(df_test['prep_review'])

## Training Model

### 1. Multinomial Naive Bayes

In [463]:
# Multinomial Naive Bayes classifier; no arguments are passed, so the
# library defaults apply.
mnb = MultinomialNB()

In [464]:
# Fit on the TF-IDF training matrix and the -1/1 sentiment labels.
mnb.fit(x_train, y_train)

MultinomialNB()

In [375]:
# Predicted label for every row of the vectorised test set. The name y_pred
# is reused by each model section below.
y_pred = mnb.predict(x_test)

In [376]:
# Quick look at the predicted labels.
print(y_pred)

[ 1  1  1 ...  1 -1  1]


In [377]:
# Test-set accuracy; float() converts the score to a built-in float.
accuracy = float(accuracy_score(y_test, y_pred))

In [378]:
# '%0.2f' formats the percentage with two decimals; the literal '%' sign is
# concatenated after formatting.
print("Accuracy Percentage of Multinomial Naive Bayes Model : %0.2f" % (accuracy*100) + '%')

Accuracy Percentage of Multinomial Naive Bayes Model : 83.31%


In [379]:
# y_pred holds predictions for x_test, so it must be compared with y_test.
# (The original passed y_train, which is only correct while both splits
# happen to have identical label order.)
cm = confusion_matrix(y_test, y_pred)

In [380]:
print("Confusion Matrix of Multinomial Naive Bayes Model -:")
# cm is 2x2: the first row is unpacked as (tn, fp) and the second as (fn, tp),
# matching the cm[0][0], cm[0][1], cm[1][0], cm[1][1] cells.
(tn, fp), (fn, tp) = cm
print("True Negatives are : %d" % tn)
print("False Positives are : %d" % fp)
print("False Negatives are : %d" % fn)
print("True Positives are : %d" % tp)

Confusion Matrix of Multinomial Naive Bayes Model -:
True Negatives are : 11016
False Positives are : 1484
False Negatives are : 2689
True Positives are : 9811


In [381]:
# Precision of the test predictions; y_pred comes from x_test, so the
# reference labels must be y_test (the original passed y_train).
precision = float(precision_score(y_test, y_pred))

In [382]:
# Precision as a percentage with two decimals, followed by a literal '%'.
print("Precision Percentage of Multinomial Naive Bayes Model : %0.2f" % (precision*100) + '%')

Precision Percentage of Multinomial Naive Bayes Model : 86.86%


In [383]:
# Recall of the test predictions; y_pred comes from x_test, so the reference
# labels must be y_test (the original passed y_train).
rs = recall_score(y_test, y_pred)

In [384]:
# Recall as a percentage with two decimals, followed by a literal '%'.
print("Recall Score Percentage of Multinomial Naive Bayes Model : %0.2f" % (rs*100) + '%')

Recall Score Percentage of Multinomial Naive Bayes Model : 78.49%


In [385]:
# F1 of the test predictions; y_pred comes from x_test, so the reference
# labels must be y_test (the original passed y_train).
fs = f1_score(y_test, y_pred)

In [386]:
# F1 is printed as a raw score (not multiplied by 100), two decimals.
print("F1 Score of Multinomial Naive Bayes Model : %0.2f" % (fs))

F1 Score of Multinomial Naive Bayes Model : 0.82


### 2. Random Forest Classifier

In [387]:
# Random forest of 100 trees; random_state pins the seed so runs are
# repeatable, and n_jobs=-1 requests all available processors.
rfc = RandomForestClassifier(n_estimators=100, random_state = 42, n_jobs = -1)

In [388]:
# Fit on the TF-IDF training matrix and the -1/1 sentiment labels.
rfc.fit(x_train, y_train)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [389]:
# Overwrites y_pred with the random forest's predictions for the test set.
y_pred = rfc.predict(x_test)

In [390]:
# Accuracy on the training data itself (not the held-out test set); useful
# only as an over-fitting indicator when compared with the test accuracy.
rfc.score(x_train, y_train)

1.0

In [391]:
# Quick look at the predicted labels.
print(y_pred)

[ 1  1  1 ... -1  1  1]


In [392]:
# Test-set accuracy; float() converts the score to a built-in float.
accuracy = float(accuracy_score(y_test, y_pred))

In [393]:
# Accuracy as a percentage with two decimals, followed by a literal '%'.
print("Accuracy Percentage of RandomForest Classifier Model : %0.2f" % (accuracy*100) + '%')

Accuracy Percentage of RandomForest Classifier Model : 85.16%


In [394]:
# y_pred holds predictions for x_test, so it must be compared with y_test.
# (The original passed y_train, which is only correct while both splits
# happen to have identical label order.)
cm = confusion_matrix(y_test, y_pred)

In [395]:
print("Confusion Matrix of RandomForest Classifier Model -:")
# cm is 2x2: the first row is unpacked as (tn, fp) and the second as (fn, tp),
# matching the cm[0][0], cm[0][1], cm[1][0], cm[1][1] cells.
(tn, fp), (fn, tp) = cm
print("True Negatives are : %d" % tn)
print("False Positives are : %d" % fp)
print("False Negatives are : %d" % fn)
print("True Positives are : %d" % tp)

Confusion Matrix of RandomForest Classifier Model -:
True Negatives are : 10796
False Positives are : 1704
False Negatives are : 2006
True Positives are : 10494


In [396]:
# Precision of the test predictions; y_pred comes from x_test, so the
# reference labels must be y_test (the original passed y_train).
precision = float(precision_score(y_test, y_pred))

In [397]:
# Precision as a percentage with two decimals, followed by a literal '%'.
print("Precision Percentage of RandomForest Classifier Model : %0.2f" % (precision*100) + '%')

Precision Percentage of RandomForest Classifier Model : 86.03%


In [398]:
# Recall of the test predictions; y_pred comes from x_test, so the reference
# labels must be y_test (the original passed y_train).
rs = recall_score(y_test, y_pred)

In [399]:
# Recall as a percentage with two decimals, followed by a literal '%'.
print("Recall Score Percentage of RandomForest Classifier Model : %0.2f" % (rs*100) + '%')

Recall Score Percentage of RandomForest Classifier Model : 83.95%


In [400]:
# F1 of the test predictions; y_pred comes from x_test, so the reference
# labels must be y_test (the original passed y_train).
fs = f1_score(y_test, y_pred)

In [401]:
# F1 is printed as a raw score (not multiplied by 100), two decimals.
print("F1 Score of RandomForest Classifier Model : %0.2f" % (fs))

F1 Score of RandomForest Classifier Model : 0.85


### 3. Logistic Regression

In [402]:
# Logistic regression with the 'lbfgs' solver; n_jobs=-1 requests all
# available processors.
lr = LogisticRegression(solver = 'lbfgs', n_jobs = -1)

In [403]:
# Fit on the TF-IDF training matrix and the -1/1 sentiment labels.
lr.fit(x_train, y_train)

LogisticRegression(n_jobs=-1)

In [404]:
# Overwrites y_pred with the logistic regression predictions for the test set.
y_pred = lr.predict(x_test)

In [473]:
# Quick look at the predicted labels.
print(y_pred)

[ 1 -1  1 ... -1  1  1]


In [475]:
# Accuracy on the training data itself (not the held-out test set); useful
# only as an over-fitting indicator when compared with the test accuracy.
lr.score(x_train, y_train)

0.93528

In [476]:
# Test-set accuracy; float() converts the score to a built-in float.
accuracy = float(accuracy_score(y_test, y_pred))

In [408]:
# Accuracy as a percentage with two decimals, followed by a literal '%'.
print("Accuracy Percentage of LogisticRegression Model : %0.2f" % (accuracy*100) + '%')

Accuracy Percentage of LogisticRegression Model : 88.34%


In [409]:
# y_pred holds predictions for x_test, so it must be compared with y_test.
# (The original passed y_train, which is only correct while both splits
# happen to have identical label order.)
cm = confusion_matrix(y_test, y_pred)

In [410]:
print("Confusion Matrix of LogisticRegression Model -:")
# cm is 2x2: the first row is unpacked as (tn, fp) and the second as (fn, tp),
# matching the cm[0][0], cm[0][1], cm[1][0], cm[1][1] cells.
(tn, fp), (fn, tp) = cm
print("True Negatives are : %d" % tn)
print("False Positives are : %d" % fp)
print("False Negatives are : %d" % fn)
print("True Positives are : %d" % tp)

Confusion Matrix of LogisticRegression Model -:
True Negatives are : 11008
False Positives are : 1492
False Negatives are : 1424
True Positives are : 11076


In [411]:
# Precision of the test predictions; y_pred comes from x_test, so the
# reference labels must be y_test (the original passed y_train).
precision = float(precision_score(y_test, y_pred))

In [412]:
# Precision as a percentage with two decimals, followed by a literal '%'.
print("Precision Percentage of LogisticRegression Model : %0.2f" % (precision*100) + '%')

Precision Percentage of LogisticRegression Model : 88.13%


In [413]:
# Recall of the test predictions; y_pred comes from x_test, so the reference
# labels must be y_test (the original passed y_train).
rs = recall_score(y_test, y_pred)

In [414]:
# Recall as a percentage with two decimals, followed by a literal '%'.
print("Recall Score Percentage of LogisticRegression Model : %0.2f" % (rs*100) + '%')

Recall Score Percentage of LogisticRegression Model : 88.61%


In [415]:
# F1 of the test predictions; y_pred comes from x_test, so the reference
# labels must be y_test (the original passed y_train).
fs = f1_score(y_test, y_pred)

In [416]:
# F1 is printed as a raw score (not multiplied by 100), two decimals.
print("F1 Score of LogisticRegression Model : %0.2f" % (fs))

F1 Score of LogisticRegression Model : 0.88


### 4. Linear Support Vector Classifier

In [417]:
# Linear support vector classifier; no arguments are passed, so the library
# defaults apply.
lsvm = LinearSVC()

In [418]:
# Fit on the TF-IDF training matrix and the -1/1 sentiment labels.
lsvm.fit(x_train, y_train)

LinearSVC()

In [419]:
# Overwrites y_pred with the linear SVC predictions for the test set.
y_pred = lsvm.predict(x_test)

In [420]:
# Quick look at the predicted labels.
print(y_pred)

[ 1  1  1 ... -1 -1  1]


In [421]:
# Accuracy on the training data itself (not the held-out test set); useful
# only as an over-fitting indicator when compared with the test accuracy.
lsvm.score(x_train, y_train)

0.99128

In [422]:
# Test-set accuracy; float() converts the score to a built-in float.
accuracy = float(accuracy_score(y_test, y_pred))

In [423]:
# Accuracy as a percentage with two decimals, followed by a literal '%'.
print("Accuracy Percentage of Linear Support Vector Classifier Model : %0.2f" % (accuracy*100) + '%')

Accuracy Percentage of Linear Support Vector Classifier Model : 87.26%


In [424]:
# y_pred holds predictions for x_test, so it must be compared with y_test.
# (The original passed y_train, which is only correct while both splits
# happen to have identical label order.)
cm = confusion_matrix(y_test, y_pred)

In [425]:
print("Confusion Matrix of Linear Support Vector Classifier Model -:")
# cm is 2x2: the first row is unpacked as (tn, fp) and the second as (fn, tp),
# matching the cm[0][0], cm[0][1], cm[1][0], cm[1][1] cells.
(tn, fp), (fn, tp) = cm
print("True Negatives are : %d" % tn)
print("False Positives are : %d" % fp)
print("False Negatives are : %d" % fn)
print("True Positives are : %d" % tp)

Confusion Matrix of Linear Support Vector Classifier Model -:
True Negatives are : 11048
False Positives are : 1452
False Negatives are : 1732
True Positives are : 10768


In [426]:
# Precision of the test predictions; y_pred comes from x_test, so the
# reference labels must be y_test (the original passed y_train).
precision = float(precision_score(y_test, y_pred))

In [427]:
# Precision as a percentage with two decimals, followed by a literal '%'.
print("Precision Percentage of Linear Support Vector Classifier Model : %0.2f" % (precision*100) + '%')

Precision Percentage of Linear Support Vector Classifier Model : 88.12%


In [428]:
# Recall of the test predictions; y_pred comes from x_test, so the reference
# labels must be y_test (the original passed y_train).
rs = recall_score(y_test, y_pred)

In [429]:
# Recall as a percentage with two decimals, followed by a literal '%'.
print("Recall Score Percentage of Linear Support Vector Classifier Model : %0.2f" % (rs*100) + '%')

Recall Score Percentage of Linear Support Vector Classifier Model : 86.14%


In [430]:
# F1 of the test predictions; y_pred comes from x_test, so the reference
# labels must be y_test (the original passed y_train).
fs = f1_score(y_test, y_pred)

In [431]:
# F1 is printed as a raw score (not multiplied by 100), two decimals.
print("F1 Score of Linear Support Vector Classifier Model : %0.2f" % (fs))

F1 Score of Linear Support Vector Classifier Model : 0.87


### 5. Decision Tree Classifier

In [432]:
# Single decision tree; random_state pins the seed so runs are repeatable.
dtc = DecisionTreeClassifier(random_state=0)

In [433]:
# Fit on the TF-IDF training matrix and the -1/1 sentiment labels.
dtc.fit(x_train,y_train)

DecisionTreeClassifier(random_state=0)

In [434]:
# Overwrites y_pred with the decision tree predictions for the test set.
y_pred = dtc.predict(x_test)

In [435]:
# Quick look at the predicted labels.
print(y_pred)

[ 1 -1  1 ... -1  1  1]


In [436]:
# Accuracy on the training data itself (not the held-out test set); useful
# only as an over-fitting indicator when compared with the test accuracy.
dtc.score(x_train, y_train)

1.0

In [437]:
# Test-set accuracy; float() converts the score to a built-in float.
accuracy = float(accuracy_score(y_test, y_pred))

In [438]:
# Accuracy as a percentage with two decimals, followed by a literal '%'.
print("Accuracy Percentage of Decision Tree Classifier Model : %0.2f" % (accuracy*100) + '%')

Accuracy Percentage of Decision Tree Classifier Model : 71.04%


In [439]:
# y_pred holds predictions for x_test, so it must be compared with y_test.
# (The original passed y_train, which is only correct while both splits
# happen to have identical label order.)
cm = confusion_matrix(y_test, y_pred)

In [440]:
print("Confusion Matrix of Decision Tree Classifier Model -:")
# cm is 2x2: the first row is unpacked as (tn, fp) and the second as (fn, tp),
# matching the cm[0][0], cm[0][1], cm[1][0], cm[1][1] cells.
(tn, fp), (fn, tp) = cm
print("True Negatives are : %d" % tn)
print("False Positives are : %d" % fp)
print("False Negatives are : %d" % fn)
print("True Positives are : %d" % tp)

Confusion Matrix of Decision Tree Classifier Model -:
True Negatives are : 8955
False Positives are : 3545
False Negatives are : 3695
True Positives are : 8805


In [441]:
# Precision of the test predictions; y_pred comes from x_test, so the
# reference labels must be y_test (the original passed y_train).
precision = float(precision_score(y_test, y_pred))

In [442]:
# Precision as a percentage with two decimals, followed by a literal '%'.
print("Precision Percentage of Decision Tree Classifier Model : %0.2f" % (precision*100) + '%')

Precision Percentage of Decision Tree Classifier Model : 71.30%


In [443]:
# Recall of the test predictions; y_pred comes from x_test, so the reference
# labels must be y_test (the original passed y_train).
rs = recall_score(y_test, y_pred)

In [444]:
# Recall as a percentage with two decimals, followed by a literal '%'.
print("Recall Score Percentage of Decision Tree Classifier Model : %0.2f" % (rs*100) + '%')

Recall Score Percentage of Decision Tree Classifier Model : 70.44%


In [445]:
# F1 of the test predictions; y_pred comes from x_test, so the reference
# labels must be y_test (the original passed y_train).
fs = f1_score(y_test, y_pred)

In [446]:
# F1 is printed as a raw score (not multiplied by 100), two decimals.
print("F1 Score of Decision Tree Classifier Model : %0.2f" % (fs))

F1 Score of Decision Tree Classifier Model : 0.71


## Test Machine Learning Model

In [490]:
# Ask which fitted model to try (menu numbers 1-5). int() raises ValueError
# if the reply is not a whole number; the range itself is not checked here.
choice = int(input("Which Model You want to test by your Own Review?....\n1.Multinomial Naive Bayes\n2.Random Forest Classifier\n3.Logistic Regression\n4.Linear Support Vector Classifier\n5.Decision Tree Classifier\nEnter Choice..."))

Which Model You want to test by your Own Review?....
1.Multinomial Naive Bayes
2.Random Forest Classifier
3.Logistic Regression
4.Linear Support Vector Classifier
5.Decision Tree Classifier
Enter Choice...5


In [491]:
# Menu number -> fitted classifier, replacing five copies of the same branch.
models = {1: mnb, 2: rfc, 3: lr, 4: lsvm, 5: dtc}

# Fail with a clear message before prompting; previously an out-of-range
# choice left `predictions` undefined (or holding a value from an earlier run).
if choice not in models:
    raise ValueError("Invalid choice %r: expected a number from 1 to 5" % (choice,))

# The vectorizer was fitted on text_prep()-cleaned reviews, so the user's
# review must go through the same cleaning before it is transformed.
review = text_prep(input("Enter Your Own Review :"))
predictions = models[choice].predict(tfidf.transform([review]))[0]

# Labels follow the training data: -1 is negative, 1 is positive.
if predictions == -1:
    print("Negative Review!(-)")
else:
    print("Positive Review!(+)")

Enter Your Own Review :Great
Positive Review!(+)
