# Vectorizer + NaiveBayes Tuning

## Imports

In [1]:
import pandas as pd
from sklearn import preprocessing
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import set_config; set_config("diagram")
import numpy as np
from sklearn.model_selection import cross_val_score

In [2]:
data = pd.read_csv("movie_reviews.csv")
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [3]:
le = preprocessing.LabelEncoder()
data["target_encoded"] =  le.fit_transform(data.target)

In [4]:
data.head()

Unnamed: 0,target,reviews,target_encoded
0,neg,"plot : two teen couples go to a church party ,...",0
1,neg,the happy bastard's quick movie review \ndamn ...,0
2,neg,it is movies like these that make a jaded movi...,0
3,neg,""" quest for camelot "" is warner bros . ' firs...",0
4,neg,synopsis : a mentally unstable man undergoing ...,0


## Preprocessing

In [5]:
def preprocessing(sentence : str):
    sentence = sentence.strip()
    sentence = sentence.lower()
    sentence = "".join(char for char in sentence if not char.isdigit())
    sentence = "".join([char for char in sentence if char not in string.punctuation])
    tokenized = word_tokenize(sentence) 
    stop_words = set(stopwords.words("english"))
    tokens_cleaned = [word for word in tokenized if not word in stop_words]
    lemmatized = [WordNetLemmatizer().lemmatize(token, pos="v") for token in tokens_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(token, pos="s") for token in tokens_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(token, pos="n") for token in tokens_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(token, pos="a") for token in tokens_cleaned]
    lemmatized = [WordNetLemmatizer().lemmatize(token, pos="r") for token in tokens_cleaned]
    
    cleaned_reviews = " ".join(word for word in lemmatized)
    return cleaned_reviews

In [6]:
# Clean reviews
data["cleaned"] = data["reviews"].apply(preprocessing)
data

Unnamed: 0,target,reviews,target_encoded,cleaned
0,neg,"plot : two teen couples go to a church party ,...",0,plot two teen couples go church party drink dr...
1,neg,the happy bastard's quick movie review \ndamn ...,0,happy bastards quick movie review damn yk bug ...
2,neg,it is movies like these that make a jaded movi...,0,movies like make jaded movie viewer thankful i...
3,neg,""" quest for camelot "" is warner bros . ' firs...",0,quest camelot warner bros first featurelength ...
4,neg,synopsis : a mentally unstable man undergoing ...,0,synopsis mentally unstable man undergoing psyc...
...,...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,1,wow movie everything movie funny dramatic inte...
1996,pos,"richard gere can be a commanding actor , but h...",1,richard gere commanding actor hes always great...
1997,pos,"glory--starring matthew broderick , denzel was...",1,glorystarring matthew broderick denzel washing...
1998,pos,steven spielberg's second epic film on world w...,1,steven spielbergs second epic film world war i...


## Tuning

In [7]:
X = data["cleaned"]
y = data["target_encoded"]
# Create Pipeline
pipe = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB()

)
result = cross_val_score(
    pipe,
    X,
    y,
    cv=10,
    scoring="accuracy"

).mean()
np.round(result, 2)

0.82

In [8]:
# Set parameters to search
parameters = {
    "tfidfvectorizer__ngram_range": ((1, 1), (2, 2)),
    "multinomialnb__alpha": (0.1, 1)
}

# Perform grid search on pipeline
search = GridSearchCV(
    pipe,
    parameters,
    scoring="accuracy",
    cv = 10,
    n_jobs=-1,
    verbose=1
).fit(data["cleaned"], data["target_encoded"])
np.round(search.best_score_, 2)
# Best score
print(f"Best Score = {np.round(search.best_score_, 2)}")

# Best params
print(f"Best params = {search.best_params_}")

Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best Score = 0.82
Best params = {'multinomialnb__alpha': 1, 'tfidfvectorizer__ngram_range': (1, 1)}


In [9]:
search.best_estimator_