In [15]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score,accuracy_score
import pickle

In [2]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kishan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
dataset = pd.read_csv('reviews.txt',sep = '\t', names =['Reviews','Comments'])

In [4]:
dataset

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


In [5]:
stopset = set(stopwords.words('english'))

In [6]:
vectorizer = TfidfVectorizer(use_idf = True,lowercase = True, strip_accents='ascii',stop_words=stopset)

In [16]:
X = vectorizer.fit_transform(dataset.Comments)
y = dataset.Reviews
pickle.dump(vectorizer, open('tranform.pkl', 'wb'))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [18]:
clf = naive_bayes.MultinomialNB()
clf.fit(X_train,y_train)

MultinomialNB()

In [19]:
accuracy_score(y_test,clf.predict(X_test))*100

97.47109826589595

In [20]:
clf = naive_bayes.MultinomialNB()
clf.fit(X,y)

MultinomialNB()

In [21]:
accuracy_score(y_test,clf.predict(X_test))*100

98.77167630057804

In [22]:
filename = 'nlp_model.pkl'
pickle.dump(clf, open(filename, 'wb'))

In [14]:
import pandas as pd

dataset = pd.read_csv("../datasets/main_data.csv")  # change name if needed
print(dataset.head())
print(dataset.columns)


     director_name     actor_1_name    actor_2_name    actor_3_name  \
0    John Lasseter        Tom Hanks       Tim Allen     Don Rickles   
1     Joe Johnston   Robin Williams   Jonathan Hyde   Kirsten Dunst   
2    Howard Deutch   Walter Matthau     Jack Lemmon     Ann-Margret   
3  Forest Whitaker  Whitney Houston  Angela Bassett  Loretta Devine   
4    Charles Shyer     Steve Martin    Diane Keaton    Martin Short   

                     genres                  movie_title  \
0   Animation Comedy Family                    toy story   
1  Adventure Fantasy Family                      jumanji   
2            Romance Comedy             grumpier old men   
3      Comedy Drama Romance            waiting to exhale   
4                    Comedy  father of the bride part ii   

                                                comb  
0  Tom Hanks Tim Allen Don Rickles John Lasseter ...  
1  Robin Williams Jonathan Hyde Kirsten Dunst Joe...  
2  Walter Matthau Jack Lemmon Ann-Margret Howar

In [16]:
with open("../datasets/reviews.txt", "r", encoding="utf-8", errors="ignore") as f:
    for i in range(5):
        print(f.readline())


1	The Da Vinci Code book is just awesome.

1	this was the first clive cussler i've ever read, but even books like Relic, and Da Vinci code were more plausible than this.

1	i liked the Da Vinci Code a lot.

1	i liked the Da Vinci Code a lot.

1	I liked the Da Vinci Code but it ultimatly didn't seem to hold it's own.



In [17]:
import pandas as pd

data = []

with open("../datasets/reviews.txt", "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        parts = line.strip().split("\t", 1)
        if len(parts) == 2:
            label, text = parts
            data.append([int(label), text])

dataset = pd.DataFrame(data, columns=["label", "review"])

print(dataset.head())
print(dataset["label"].value_counts())


   label                                             review
0      1            The Da Vinci Code book is just awesome.
1      1  this was the first clive cussler i've ever rea...
2      1                   i liked the Da Vinci Code a lot.
3      1                   i liked the Da Vinci Code a lot.
4      1  I liked the Da Vinci Code but it ultimatly did...
label
1    3995
0    3091
Name: count, dtype: int64


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    lowercase=True,
    strip_accents='ascii',
    stop_words='english',
    max_features=5000
)

X = vectorizer.fit_transform(dataset["review"])
y = dataset["label"]

print("Vectorizer fitted:", hasattr(vectorizer, "vocabulary_"))
print("X shape:", X.shape)


Vectorizer fitted: True
X shape: (7086, 1920)


In [19]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)

print("Model trained!")


Model trained!


In [20]:
import joblib

joblib.dump(vectorizer, "../transform.pkl")
joblib.dump(clf, "../nlp_model.pkl")

print("Saved transform.pkl and nlp_model.pkl")



Saved transform.pkl and nlp_model.pkl


In [21]:
import os
print(os.listdir(".."))



['$PROFILE.txt', '.git', '.ipynb_checkpoints', 'datasets', 'main.py', 'main_data.csv', 'main_reviewsRecieved.py', 'nlp_model.pkl', 'Procfile', 'README.md', 'requirements-1.txt', 'requirements.txt', 'static', 'templates', 'tranform.pkl', 'transform.pkl', 'venv']


In [22]:
test_texts = [
    "This movie was absolutely amazing",
    "Worst film I have ever seen"
]

vec = vectorizer.transform(test_texts)
preds = clf.predict(vec)

for t, p in zip(test_texts, preds):
    print(t, "=>", "Positive" if p == 1 else "Negative")


This movie was absolutely amazing => Positive
Worst film I have ever seen => Negative
