# Movie Reviews and Bag-of-Words Modelling

## Imports

In [1]:
import string

import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

In [2]:
# Source of the labelled movie-review corpus, kept in one named place.
REVIEWS_CSV_URL = "https://wagon-public-datasets.s3.amazonaws.com/05-Machine-Learning/10-Natural-Language-Processing/movie_reviews.csv"

# Read the CSV straight from the URL and preview the first rows.
data = pd.read_csv(REVIEWS_CSV_URL)
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [3]:
# Sanity-check the size of the loaded frame as (rows, columns).
data.shape

(2000, 2)

## 1. Preprocessing

In [4]:
def preprocessing(sentence: str) -> str:
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order: trim surrounding whitespace, lowercase, delete every
    digit and every ``string.punctuation`` character, tokenize with NLTK's
    ``word_tokenize``, lemmatize each token with ``WordNetLemmatizer``
    (called without a part-of-speech argument), and join the lemmas with
    single spaces.

    Args:
        sentence: Raw review text.

    Returns:
        The cleaned review as a single string; an empty string if nothing
        survives the cleaning.

    Note:
        Punctuation is deleted rather than replaced by a space, so words
        joined only by punctuation are merged (``"glory--starring"`` becomes
        ``"glorystarring"``).
    """
    # str methods return new strings, so the result has to be re-assigned;
    # calling them as bare statements leaves the text unchanged.
    sentence = sentence.strip().lower()

    # Single pass over the characters: drop digits and punctuation together.
    sentence = "".join(
        char
        for char in sentence
        if not char.isdigit() and char not in string.punctuation
    )

    tokens = word_tokenize(sentence)

    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

In [5]:
# Run every raw review through `preprocessing` and store the result
# alongside the original text in a new column.
cleaned_reviews = data["reviews"].apply(preprocessing)
data["clean_reviews"] = cleaned_reviews
data

Unnamed: 0,target,reviews,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couple go to a church party drin...
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastard quick movie review damn that...
2,neg,it is movies like these that make a jaded movi...,it is movie like these that make a jaded movie...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest for camelot is warner bros first feature...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis a mentally unstable man undergoing ps...
...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,wow what a movie it everything a movie can be ...
1996,pos,"richard gere can be a commanding actor , but h...",richard gere can be a commanding actor but he ...
1997,pos,"glory--starring matthew broderick , denzel was...",glorystarring matthew broderick denzel washing...
1998,pos,steven spielberg's second epic film on world w...,steven spielberg second epic film on world war...


In [6]:
# List the distinct class labels present in the target column.
data["target"].unique()

array(['neg', 'pos'], dtype=object)

In [7]:
# Convert the string class labels into integer codes for the classifier.
# The fitted encoder is kept in a variable so codes can be mapped back to labels.
label_encoder = LabelEncoder()
data["target_encoded"] = label_encoder.fit_transform(data["target"])

In [8]:
# Preview the frame to confirm the new `target_encoded` column.
data.head()

Unnamed: 0,target,reviews,clean_reviews,target_encoded
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couple go to a church party drin...,0
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastard quick movie review damn that...,0
2,neg,it is movies like these that make a jaded movi...,it is movie like these that make a jaded movie...,0
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest for camelot is warner bros first feature...,0
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis a mentally unstable man undergoing ps...,0


## 2. Bag-of-Words Modelling

In [9]:
# Unigram bag-of-words baseline: token counts fed to a multinomial Naive Bayes.
vectorizer = CountVectorizer()
model = MultinomialNB()

# Chaining both steps in a pipeline makes cross-validation refit the
# vectorizer on each training split, so the vocabulary is never learned from
# the documents of the fold being scored.
bow_pipeline = make_pipeline(vectorizer, model)

X = data["clean_reviews"]  # raw cleaned text; vectorized inside the pipeline
y = data["target_encoded"]

score = cross_val_score(
    bow_pipeline,
    X,
    y,
    cv=5,
    scoring="accuracy"
).mean()
np.round(score, 2)

0.82

## 3. N-gram Modelling

In [10]:
# Bigram-only features (ngram_range=(2, 2)): counts of adjacent word pairs,
# scored with the same multinomial Naive Bayes classifier.
vectorizer = CountVectorizer(ngram_range=(2, 2))
naivebayes = MultinomialNB()

# The pipeline is refit on each training split, so the bigram vocabulary is
# built without seeing the documents of the fold being scored.
ngram_pipeline = make_pipeline(vectorizer, naivebayes)

cv_nb = cross_validate(
    ngram_pipeline,
    data.clean_reviews,
    data.target_encoded,
    cv=5,  # explicit, so the fold count does not depend on the library default
    scoring="accuracy",
)
round(cv_nb["test_score"].mean(), 2)

0.84