# Vectorizer Tuning

In [36]:
import pandas as pd

# Load the movie-review dataset from the CSV that sits next to the notebook.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# row index that was written into the file; confirm, and consider index_col=0.
data = pd.read_csv("reviews.csv")

# Preview the first rows to check the columns (target label + raw review text).
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [37]:
def TextNormalizer(text):
    """Return a lower-cased, punctuation-free copy of every string in ``text``.

    Each element is stripped of surrounding whitespace, lower-cased, reduced to
    ASCII letters, digits, spaces and apostrophes, and stripped once more
    (dropping leading/trailing punctuation can expose new edge whitespace).

    Parameters
    ----------
    text : pandas.Series or iterable of str
        The raw documents. The input is never modified.

    Returns
    -------
    pandas.Series or list of str
        For objects exposing ``.map`` (e.g. a Series) the result of mapping the
        cleaner over the elements, so the original index labels are kept
        whatever they are; otherwise a new list in the same order.

    Raises
    ------
    AttributeError
        If an element is not a string (for example a missing value).
    """
    # Filtering happens after .lower(), so upper-case letters never need to be
    # whitelisted. The apostrophe is kept on purpose (contractions like "don't").
    allowed = frozenset("'abcdefghijklmnopqrstuvwxyz0123456789 ")

    def _normalize(raw):
        lowered = raw.strip().lower()
        return ''.join(ch for ch in lowered if ch in allowed).strip()

    # Element-wise mapping avoids label-based ``text[i]`` lookups, which break
    # on any index that is not exactly 0..n-1, and leaves the caller's data intact.
    mapper = getattr(text, "map", None)
    if callable(mapper):
        return mapper(_normalize)

    return [_normalize(raw) for raw in text]

In [38]:
# Replace the raw review text with its normalized form (the raw text is not kept).
# Re-running this cell is harmless: normalizing already-normalized text is a no-op.
data['reviews']=TextNormalizer(data.reviews)

## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [None]:
# Create Pipeline

# Set parameters to search (model and vectorizer)

# Perform grid search on pipeline

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Raw token/character counts -> TF-IDF re-weighting -> multinomial Naive Bayes.
model = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])

# Hyper-parameters tuned jointly for the vectorizer, the TF-IDF step and the
# classifier: 3 * 2 * 2 * 2 * 5 = 120 combinations, each scored with 5-fold CV.
param_grid = {
    'vectorizer__analyzer': ['word', 'char', 'char_wb'],
    'vectorizer__ngram_range': [(1,1), (2,2)],
    'vectorizer__binary': (True, False),
    'tfidf__use_idf': (True, False),
    'bayes__alpha': [ 0.1, 0.3, 0.5 ,0.7, 0.9],
}

search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

In [44]:
# Keep 20% of the reviews aside for the final check; the grid search (and its
# internal cross-validation) only ever sees the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    data.reviews,
    data.target,
    test_size=0.2,
    random_state=42,
)
search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('bayes', MultinomialNB())]),
             param_grid={'bayes__alpha': [0.1, 0.3, 0.5, 0.7, 0.9],
                         'tfidf__use_idf': (True, False),
                         'vectorizer__analyzer': ['word', 'char', 'char_wb'],
                         'vectorizer__binary': (True, False),
                         'vectorizer__ngram_range': [(1, 1), (2, 2)]},
             scoring='accuracy')

In [45]:
# Mean 5-fold cross-validated accuracy of the best parameter combination found.
search.best_score_

0.8512500000000001

In [46]:
# GridSearchCV refits the winning pipeline on all of X_train by default
# (refit=True), so best_estimator_ is already trained; calling .fit() on it
# again with the same data would only repeat that work.
# NOTE: the name CART is kept because the evaluation cell refers to it, even
# though the estimator is a Naive Bayes pipeline rather than a decision tree.
CART = search.best_estimator_
CART

Pipeline(steps=[('vectorizer',
                 CountVectorizer(binary=True, ngram_range=(2, 2))),
                ('tfidf', TfidfTransformer()),
                ('bayes', MultinomialNB(alpha=0.1))])

In [47]:
# Score the tuned pipeline on the held-out reviews it has never seen.
y_pred = CART.predict(X_test)
test_pred_score = accuracy_score(y_true=y_test, y_pred=y_pred)
test_pred_score

0.86

⚠️ Please push the exercise once you are done 🙃

## 🏁 