# Vectorizer Tuning

In [1]:
import pandas as pd

import nltk_utils

# Load the labelled movie reviews from disk and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="reviews.csv")

data.head(n=5)

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [2]:
# Replace every character that is not an ASCII letter (punctuation, digits,
# whitespace, accented letters) with a space, then lower-case the result.
# `regex=True` must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the punctuation in place.
data["clean_reviews"] = data.reviews.str.replace(r"[^A-Za-z]", " ", regex=True).str.lower()

  data["clean_reviews"] = data.reviews.str.replace(r"[^A-Za-z]", " ").str.lower()


## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
# Untuned baseline: TF-IDF features with default settings, scored by the
# project helper `nltk_utils.multinomial_naive_bayes`.
# NOTE(review): the helper's return value is not visible from this notebook —
# presumably a cross-validated score; confirm in nltk_utils.
vector = TfidfVectorizer()
X_bow = vector.fit_transform(data.clean_reviews)
score = nltk_utils.multinomial_naive_bayes(X_bow, data.target)
# Show the baseline so the tuned result can be compared against it.
score

In [49]:
# Chain the text vectorizer and the classifier into one estimator so that the
# hyper-parameters of both steps can be searched together.
# The step names ("vect", "clf") are the prefixes used in the parameter grid.
pipe = Pipeline(
    steps=[
        ("vect", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

In [136]:
# Hyper-parameter grid for the search. Keys use scikit-learn's
# "<step name>__<parameter>" convention.
parameters = dict(
    # Candidate upper bounds on document frequency for the vectorizer.
    vect__max_df=(0.5, 0.75, 1.0),
    # Unigrams only, or unigrams plus bigrams.
    vect__ngram_range=((1, 1), (1, 2)),
    # Smoothing strength of the classifier.
    clf__alpha=(0.00001, 0.000001),
)
# Candidates that were considered but are not part of this search:
#   vect__max_features: (None, 5000, 10000, 50000)
#   tfidf__use_idf: (True, False)
#   tfidf__norm: ('l1', 'l2')
#   clf__fit_prior: ("True", "False")
#   clf__max_iter: (10, 50, 80)
# NOTE(review): the `tfidf__*` keys only work with a pipeline step named
# "tfidf", and the `clf__*` entries above should be checked against the
# classifier's actual parameters before being re-enabled.

In [139]:
# Exhaustive search over every combination in `parameters`, ranking the
# candidates by accuracy. Cross-validation settings are left at the
# GridSearchCV defaults.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring="accuracy",
)

In [140]:
# Run the search on the cleaned review text against the sentiment labels,
# then display the score achieved by the best parameter combination.
grid_search.fit(data["clean_reviews"], data["target"])
grid_search.best_score_

0.7985

⚠️ Please push the exercise once you are done 🙃

## 🏁 