# Movie Reviews

In [3]:
import pandas as pd

data = pd.read_csv("reviews.csv")
df = data.copy()
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lower case the text.

In [17]:
import string 
def punct(text):
    table = text.maketrans("","", string.punctuation)
    return text.translate(table)
def clean(column):
    column = column.apply(lambda x: punct(x))
    column = column.apply(lambda x: x.lower())
    return column

In [12]:
df['reviews'] = clean(df['reviews'])
df['reviews']

0       plot  two teen couples go to a church party  d...
1       the happy bastards quick movie review \ndamn t...
2       it is movies like these that make a jaded movi...
3         quest for camelot  is warner bros   first fe...
4       synopsis  a mentally unstable man undergoing p...
                              ...                        
1995    wow  what a movie  \nits everything a movie ca...
1996    richard gere can be a commanding actor  but he...
1997    glorystarring matthew broderick  denzel washin...
1998    steven spielbergs second epic film on world wa...
1999    truman   trueman   burbank is the perfect name...
Name: reviews, Length: 2000, dtype: object

## Bag-of-Words modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a Bag-of-Word representation of the texts.

In [24]:
from nltk.stem import WordNetLemmatizer

def lematization(text):
    lemmatizer = WordNetLemmatizer()
    row = text.apply(lambda x: lemmatizer.lemmatize(x))
    return row
df['reviews'] = lematization(df['reviews'])

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
count_vector=cv.fit_transform(df['reviews'])

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

#cross validation
x=count_vector.toarray()
y=df.target
X_train, X_test, y_train, y_test= train_test_split(x, y, test_size= .2, random_state = 42)
#implement MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
#predictions
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
#accuracy score 
train_pred_score = accuracy_score(y_train, y_train_pred)
test_pred_score = accuracy_score(y_test, y_test_pred)
print('Training Set Accuracy Score: ', (100 * train_pred_score))
print('Testing Set Accuracy Score: ', (100 * test_pred_score))

Training Set Accuracy Score:  98.0
Testing Set Accuracy Score:  81.75


## N-gram modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a 2-gram Bag-of-Word representation of the texts.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vocabulary = ['hi ', 'bye', 'run away']
v = CountVectorizer(ngram_range=(2, 2))
print(v.fit(["an apple a day keeps the doctor away"]).vocabulary_) 

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(2,2))
count_vector=cv.fit_transform(df['reviews'])

In [49]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
X=count_vector.toarray()
y=df.target
clf = MultinomialNB()
#clf.fit(X, y)
#y_pred = clf.predict(X)

result = cross_validate(clf, X, y, cv=5)
sorted(result.keys())
result['test_score']

array([0.745 , 0.765 , 0.745 , 0.7875, 0.745 ])

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(2, 2))

vect = vectorizer.fit(df["reviews"])
vector = vect.transform(df.reviews)


#cross validation
x=vector.toarray()
y=data.target
X_train, X_test, y_train, y_test= train_test_split(x, y, test_size= .2, random_state = 42)
#implement MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
#predictions
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
#accuracy score 
train_pred_score = accuracy_score(y_train, y_train_pred)
test_pred_score = accuracy_score(y_test, y_test_pred)
print('Training Set Accuracy Score: ', (100 * train_pred_score))
print('Testing Set Accuracy Score: ', (100 * test_pred_score))

Training Set Accuracy Score:  100.0
Testing Set Accuracy Score:  76.5


⚠️ Please push the exercise once you are done 🙃

## 🏁 