# a.

## Dataset Loading 

In [6]:
import pandas as pd 

In [7]:
data = pd.read_csv('IMDB Dataset.csv')

In [8]:
data.sample(n= 10)

Unnamed: 0,review,sentiment
36842,There's something going on in this film direct...,negative
2948,"First, I would like to admit that Chokher Bali...",negative
4205,I imagine when Hitchcock scholars and experts ...,negative
8906,Strained and humorless (especially in light of...,negative
48852,Saw this in the theater in '86 and fell out of...,positive
15825,This movie is a quite fair adaptation of the P...,positive
287,I saw this movie last night and thought it was...,positive
7245,"The movie looked like a walk-through for ""Immo...",positive
19867,One-note comedy that probably sets modern day ...,negative
2138,*review may contain spoilers*<br /><br />predi...,negative


# b.

## Preprocessing Steps 

In [9]:
data['sentiment'] =  [1 if each == "positive" else 0 for each in data.sentiment]
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


### loadin necessary libraries

In [4]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

### Tokenization

In [10]:
data['review'] = data['review'].apply(word_tokenize)

### Stop Words Removal

In [11]:
stop_words = set(stopwords.words('english'))
def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

data['review'] = data['review'].apply(remove_stopwords)

### lemmatization

In [12]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

data['review'] = data['review'].apply(lemmatize_words)

### Stemming

In [13]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stem_words(tokens):
    return [stemmer.stem(word) for word in tokens]

data['review'] = data['review'].apply(stem_words)

In [15]:
data['review'] = data['review'].apply(lambda x: ' '.join(x))

In [16]:
data

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod 'll hook ...,1
1,wonder littl product . < br / > < br / > film ...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic 's famili littl boy ( jake ) think 's zo...,0
4,petter mattei 's `` love time money '' visual ...,1
...,...,...
49995,thought movi right good job . n't creativ orig...,1
49996,"bad plot , bad dialogu , bad act , idiot direc...",0
49997,"cathol taught parochi elementari school nun , ...",0
49998,'m go disagre previou comment side maltin one ...,0


# c.

## CountVectorizer and TfidfVectorizer

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(data['review'])

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data['review'])


# d.

## splitting dataset

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


X_train1, X_test1, y_train1, y_test1 = train_test_split(X_tfidf[0:49999], data['sentiment'][0:49999], test_size=0.2, random_state=42)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_count[0:49999], data['sentiment'][0:49999], test_size=0.2, random_state=42)

# scaler = StandardScaler()
# X_train_scaled1 = scaler.fit_transform(X_train1)
# X_test_scaled1 = scaler.transform(X_test1)


In [19]:
# scaler = StandardScaler()
# X_train_scaled2 = scaler.fit_transform(X_train2)
# X_test_scaled2 = scaler.transform(X_test2)


# e.

## Naive Bayes

### Naive Bayes with tf_idf model

### training the model

In [27]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib

nb_classifier_tfidf = MultinomialNB()
nb_classifier_tfidf.fit(X_train1, y_train1)

y_pred = nb_classifier_tfidf.predict(X_test1)


### Evaluation 

In [30]:

accuracy = accuracy_score(y_test1, y_pred)
print("Accuracy for TF_IDF vectorizer method:", accuracy)
print(classification_report(y_test1, y_pred))

joblib.dump(nb_classifier_tfidf, 'naive_bayes_model1.pkl')

Accuracy for TF_IDF vectorizer method: 0.8642
              precision    recall  f1-score   support

           0       0.85      0.88      0.87      4969
           1       0.88      0.85      0.86      5031

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



['naive_bayes_model1.pkl']

### Naive Bayes with CountVectorizer model

### training the model

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib

nb_classifier_count = MultinomialNB()
nb_classifier_count.fit(X_train2, y_train2)

y_pred = nb_classifier_count.predict(X_test2)

### Evaluation 

In [32]:

accuracy = accuracy_score(y_test2, y_pred)
print("Accuracy for Count vectorizer method:", accuracy)
print(classification_report(y_test2, y_pred))

joblib.dump(nb_classifier_count, 'naive_bayes_model2.pkl')

Accuracy for Count vectorizer method: 0.8567
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      4969
           1       0.87      0.84      0.85      5031

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



['naive_bayes_model2.pkl']

# f.

### best models's prediction new review from the IMDB dataset

In [58]:
saved_model1 = joblib.load('naive_bayes_model1.pkl')
saved_model2 = joblib.load('naive_bayes_model2.pkl')


new_reivew_tfidf = X_tfidf[49999]
new_reivew_count = X_count[49999]

predicted_sentiment_tfidf = saved_model1.predict(new_reivew_tfidf)
predicted_sentiment_count = saved_model2.predict(new_reivew_count)

print("Predicted sentiment for tf_idf vectorizer model :", predicted_sentiment_tfidf)
print("Predicted sentiment for count vectorzer model:", predicted_sentiment_count)


Predicted sentiment for tf_idf vectorizer model : [0]
Predicted sentiment for count vectorzer model: [0]


In [62]:
data.iloc[49999]

review       one expect star trek movi high art , fan expec...
sentiment                                                    0
Name: 49999, dtype: object

### prediction  is correct for both tf_idf and CountVectorize methods 