# Vectorizer Tuning

In [4]:
import pandas as pd

# The first CSV column has no header and only repeats the row number; read as
# data it shows up as a meaningless "Unnamed: 0" column, so use it as the index.
data = pd.read_csv("reviews.csv", index_col=0)

# Preview the first rows (columns: target, reviews).
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [5]:
# Work on an independent copy so the frame loaded into `data` stays untouched.
df = data.copy(deep=True)

The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [6]:
import string 
def punct(text):
    """Return `text` with every character of `string.punctuation` removed.

    Args:
        text: The string to strip.

    Returns:
        A copy of `text` without those punctuation characters; everything
        else (letters, digits, whitespace, newlines) is kept as is.
    """
    # A translation table whose third argument lists the characters to delete.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return text.translate(strip_punctuation)
def clean(column):
    """Strip punctuation from, then lowercase, every entry of a pandas Series.

    Args:
        column: Series whose entries are strings.

    Returns:
        A new Series where each entry has been passed through `punct` and
        then lowercased; the input Series is not modified.
    """
    return column.apply(lambda entry: punct(entry).lower())

In [7]:
# Overwrite the review text with its punctuation-free, lowercased version.
df["reviews"] = clean(df["reviews"])

# Preview the cleaned column.
df["reviews"]

0       plot  two teen couples go to a church party  d...
1       the happy bastards quick movie review \ndamn t...
2       it is movies like these that make a jaded movi...
3         quest for camelot  is warner bros   first fe...
4       synopsis  a mentally unstable man undergoing p...
                              ...                        
1995    wow  what a movie  \nits everything a movie ca...
1996    richard gere can be a commanding actor  but he...
1997    glorystarring matthew broderick  denzel washin...
1998    steven spielbergs second epic film on world wa...
1999    truman   trueman   burbank is the perfect name...
Name: reviews, Length: 2000, dtype: object

## Tuning

👇 Tune a vectorizer of your choice (`CountVectorizer` or `TfidfVectorizer` — or try both!) and a MultinomialNB model simultaneously.

In [None]:
# Create Pipeline

#Use idf

# Set parameters to search (model and vectorizer)

# Perform grid search on pipeline

In [7]:
#Stop word
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def stop_words(text, stopword_set=None):
    """Remove stop words from every entry of a pandas Series of strings.

    Each entry is split on whitespace, tokens that appear in the stop-word
    collection are dropped, and the remaining tokens are re-joined with single
    spaces (so runs of whitespace, including newlines, collapse to one space).
    Matching is exact and case-sensitive.

    Args:
        text: Series whose entries are strings.
        stopword_set: Optional iterable of words to drop. When omitted, the
            list returned by ``stopwords.words('english')`` is used.

    Returns:
        A new Series, indexed like `text`, holding the filtered strings.
    """
    if stopword_set is None:
        # Fetch the list once per call instead of once per token.
        stopword_set = stopwords.words('english')
    # A set gives constant-time membership tests inside the per-token filter.
    stopword_set = frozenset(stopword_set)
    row = text.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stopword_set)
    )
    return row
#data['clean_text'] = stop_words(data['clean_text'])

In [8]:
#Lemmatization
from nltk.stem import WordNetLemmatizer

def lematization(text, lemmatizer=None):
    """Lemmatize every word of every entry in a pandas Series of strings.

    The lemmatizer works on single words, so each entry is split on
    whitespace, every token is lemmatized on its own, and the results are
    re-joined with single spaces. (Handing the whole document to
    ``lemmatize`` treats it as one unknown word and leaves it unchanged.)

    Args:
        text: Series whose entries are strings.
        lemmatizer: Optional object exposing ``lemmatize(word)``. When
            omitted, a fresh ``WordNetLemmatizer`` is created.

    Returns:
        A new Series, indexed like `text`, holding the lemmatized strings.
        Runs of whitespace (including newlines) collapse to one space.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    row = text.apply(
        lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split())
    )
    return row
#data['clean_text'] = lematization(data['clean_text'])

In [None]:
#Vectorization
from sklearn.feature_extraction.text import CountVectorizer
def vectorisation(text, ngram_range=(2, 2)):
    """Fit a `CountVectorizer` on `text` and return the resulting count matrix.

    Args:
        text: Iterable of raw documents (e.g. the cleaned review column).
        ngram_range: ``(min_n, max_n)`` n-gram sizes to extract. Defaults to
            ``(2, 2)``, i.e. bigrams only.

    Returns:
        Whatever ``CountVectorizer.fit_transform`` returns for `text`
        (the document-term count matrix).

    Note:
        The fitted vectorizer is local to this function and is discarded, so
        the learned vocabulary cannot be reused on new documents afterwards.
    """
    cv = CountVectorizer(ngram_range=ngram_range)
    return cv.fit_transform(text)

In [None]:
#Tfidf vector
from sklearn.feature_extraction.text import TfidfVectorizer
# Stand-alone TF-IDF vectorizer with default settings. In the cells visible here
# it is only referenced by the disabled lines below; the pipeline cell further
# down builds its own CountVectorizer + TfidfTransformer steps instead.
tfidf_vect = TfidfVectorizer()
# Disabled experiment: fit on the review column, then transform it.
#X = tfidf_vect.fit(df['reviews'])
#X = tfidf_vect.transform(df['reviews'])

In [None]:
#MultinomialNB
from sklearn.naive_bayes import MultinomialNB
# Stand-alone Multinomial Naive Bayes classifier with default settings; the
# pipeline cell further down creates its own MultinomialNB step rather than
# reusing `clf`.
clf = MultinomialNB()

In [8]:
# Create Pipeline

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Three-stage text classifier: token counts -> TF-IDF weighting -> Naive Bayes.
model = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])

# Set parameters to search (model and vectorizer).
# Keys follow the '<step name>__<parameter>' convention so each list of
# candidate values is routed to the matching pipeline step.
# NOTE: per the scikit-learn docs, `stop_words` only applies when
# analyzer == 'word', so the 'char' / 'char_wb' candidates are evaluated once
# per stop_words value with otherwise identical settings.
param = {
    'vect__stop_words': ['english', None],
    'vect__analyzer': ['word', 'char', 'char_wb'],
    'vect__ngram_range': [(1, 1), (2, 2)],
    'tfidf__norm': ['l1', 'l2'],
    'bayes__alpha': [0.1, 1.0],
}


In [9]:
# Perform grid search on pipeline

from sklearn.model_selection import GridSearchCV

# Try every combination listed in `param` on the `model` pipeline.
search = GridSearchCV(estimator=model, param_grid=param)
search.fit(df["reviews"], df["target"])

# Parameter combination selected by the search.
search.best_params_

{'bayes__alpha': 0.1,
 'tfidf__norm': 'l2',
 'vect__analyzer': 'word',
 'vect__ngram_range': (2, 2),
 'vect__stop_words': None}

from sklearn.model_selection import GridSearchCV

search = GridSearchCV(model, param_grid={
    # Keys are '<pipeline step>__<parameter>'. `model` names its steps
    # 'vect', 'tfidf' and 'bayes', so vectorizer options go under 'vect__';
    # a prefix that matches no step is rejected as an invalid parameter.
    'vect__analyzer': ['word', 'char', 'char_wb'],
    'vect__ngram_range': [(1,1), (1,2), (1,3), (1,4), (1,5), (2,3)],
    # Disabled: `model` has no 'onehot' step to receive a threshold.
    # 'onehot__threshold': [0.0, 1.0, 2.0, 3.0],
    'bayes__alpha': [0.0, 1.0],
})

⚠️ Please push the exercise once you are done 🙃

## 🏁 