# Vectorizer Tuning

In [1]:
import pandas as pd

data = pd.read_csv("reviews.csv")

data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

üëá Remove punctuation and lower case the text.

In [2]:
import string

def preprocessing(text):
    for punctuation in string.punctuation: 
        text = text.replace(punctuation, ' ') 
    return text
    
data['clean_reviews'] = data.reviews.apply(preprocessing)

data.head()

Unnamed: 0,target,reviews,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couples go to a church party ...
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastard s quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest for camelot is warner bros firs...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis a mentally unstable man undergoing ...


In [None]:
# import string 

# def punctuation(text):
    
#     for punctuation in string.punctuation:
#         text = text.replace(punctuation, '')
#     return text

# def lower_case(text):
    
#     return text.lower()
      
# data['reviews'] = data.reviews.apply(punctuation)
# # data['reviews'] = data.reviews.apply(lower_case)

## Tuning

üëá Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [None]:
# Create Pipeline

# Set parameters to search (model and vectorizer)

# Perform grid search on pipeline

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'tfidf__min_df': (0.05,0.1),
    'tfidf__max_df': (0.75,1),
    'nb__alpha': (0.01,0.1,1,10),}

# Perform grid search on pipeline
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(data.clean_reviews,data.target)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


 0.7755 0.6925    nan    nan    nan    nan 0.8055 0.731  0.774  0.693
    nan    nan    nan    nan 0.789  0.735  0.766  0.6975    nan    nan
    nan    nan]


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.01, 0.1, 1, 10),
                         'tfidf__max_df': (0.75, 1),
                         'tfidf__min_df': (0.05, 0.1),
                         'tfidf__ngram_range': ((1, 1), (2, 2))},
             scoring='accuracy', verbose=1)

In [8]:
# Create Pipeline
pipeline2 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search (model and vectorizer)
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'tfidf__min_df': (0.05,0.1),
    'tfidf__max_df': (0.5,0.75,0.9),
    'nb__alpha': (0.01,0.1,1,10),}

# Perform grid search on pipeline
grid_search2 = GridSearchCV(pipeline2, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search2.fit(data['reviews'],data['target'])

Fitting 5 folds for each of 48 candidates, totalling 240 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.01, 0.1, 1, 10),
                         'tfidf__max_df': (0.5, 0.75, 0.9),
                         'tfidf__min_df': (0.05, 0.1),
                         'tfidf__ngram_range': ((1, 1), (2, 2))},
             scoring='accuracy', verbose=1)

In [9]:
grid_search2.best_params_

{'nb__alpha': 0.1,
 'tfidf__max_df': 0.5,
 'tfidf__min_df': 0.05,
 'tfidf__ngram_range': (1, 1)}

In [10]:
grid_search2.best_score_

0.8095000000000001

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Create Pipeline
pipeline = Pipeline([
    ('count', CountVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search (model and vectorizer)
parameters_CV = {
    'count__ngram_range': ((1,1), (2,2)),
    'nb__alpha': (0.1,1)}

# Perform grid search on pipeline
grid_search = GridSearchCV(pipeline, parameters_CV, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(data['reviews'],data['target'])

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('count', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'count__ngram_range': ((2, 2), (3, 3)),
                         'nb__alpha': (0.1, 1)},
             scoring='accuracy', verbose=1)

In [19]:
grid_search.best_params_

{'count__ngram_range': (2, 2), 'nb__alpha': 0.1}

In [13]:
grid_search.best_params_

{'count__ngram_range': (2, 2), 'nb__alpha': 0.1}

In [20]:
grid_search.best_score_

0.8320000000000001

‚ö†Ô∏è Please push the exercise once you are done üôÉ

## üèÅ 