<a href="https://colab.research.google.com/github/Fakhre-Alam-Hub/Movie-Recommendation-System/blob/main/sentiment_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Creating Sentiment Model From IMDB Movie Review Dataset**

In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [78]:
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
import string
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
nltk.download("all")

In [80]:
data = pd.read_csv("/content/drive/My Drive/dataset/imdb/IMDB Dataset.csv")

In [81]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [82]:
wn = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

def text_preprocess(text):
    soup = BeautifulSoup(text, 'html.parser').text
    no_punctuation = "".join([c for c in soup if c not in string.punctuation]).lower()
    tokens = word_tokenize(no_punctuation)
    clean_text = [word for word in tokens if word not in stopwords]
    lemma = [wn.lemmatize(word) for word in clean_text]
    lemma = ' '.join(lemma)
    return lemma

In [83]:
data['clean_text'] = data['review'].apply(lambda x: text_preprocess(x))

In [84]:
data.head()

Unnamed: 0,review,sentiment,clean_text
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode y...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


In [85]:
from collections import Counter

def counter_word(text):
  count = Counter()
  for i in text.values:
    for word in i.split():
      count[word] += 1
  return count

In [86]:
text = data.clean_text

counter = counter_word(text)

In [87]:
num_words = len(counter)
max_length = 500

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score,accuracy_score
import pickle

In [89]:
vectorizer = TfidfVectorizer(use_idf = True,lowercase = True, strip_accents='ascii',stop_words=stopwords)

In [90]:
X = vectorizer.fit_transform(data.clean_text)
y = pd.get_dummies(data,columns=['sentiment'],drop_first=True)['sentiment_positive']

In [None]:
pickle.dump(vectorizer, open('tranform.pkl', 'wb'))

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [92]:
clf = naive_bayes.MultinomialNB()
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [93]:
from sklearn.model_selection import cross_val_score, KFold

kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(clf, X_train, y_train, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

K-fold CV average score: 0.86


In [94]:
accuracy_score(y_test,clf.predict(X_test))*100

86.76

In [95]:
clf = naive_bayes.MultinomialNB()
clf.fit(X,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [96]:
accuracy_score(y_test,clf.predict(X_test))*100

91.58

In [56]:
filename = 'sentiment_classifier.pkl'
pickle.dump(clf, open(filename, 'wb'))

In [None]:
with open(r"C:\dataset\MovieRecommendation\tranform.pkl", 'rb') as f:
    vector = pickle.load(f)

with open(r"C:\dataset\MovieRecommendation\sentiment_classifier.pkl", 'rb') as f:
    model = pickle.load(f)

In [101]:
text = text_preprocess("This is the worlds boring movie do not watch this.")

In [104]:
text

'world boring movie watch'

In [105]:
clf.predict(vectorizer.transform([text]))[0]

0