# Vectorizer Tuning

In [1]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


# Load the labelled movie reviews and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column next to `target` and
# `reviews`; it looks like a row index saved into the CSV — confirm, and
# consider reading with index_col=0 if it is not needed.
data = pd.read_csv("reviews.csv")
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abize\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [2]:
def preproccessing(series: pd.Series) -> pd.Series:
    """Return a cleaned, stemmed copy of a Series of review texts.

    For each text: every character that is not an ASCII letter is replaced
    by a space, the text is lower-cased and split on whitespace, English stop
    words (except "not") are dropped, and the remaining tokens are
    Porter-stemmed and re-joined with single spaces.

    The input Series is not modified; a new Series with the same index and
    name is returned, so the raw text stays available to the caller.

    Parameters
    ----------
    series : pd.Series
        Review texts (strings). Any index is accepted.

    Returns
    -------
    pd.Series
        The cleaned texts, aligned with ``series``.
    """
    stemmer = PorterStemmer()
    # Build the stop-word set once rather than once per token. "not" stays in
    # the text, presumably because negation matters for sentiment.
    stop_words = set(stopwords.words('english'))
    stop_words.discard("not")

    def _clean(text: str) -> str:
        # Digits, punctuation and newlines all become token separators.
        tokens = re.sub('[^a-zA-Z]', " ", text).lower().split()
        return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

    # map() builds a new Series, leaving the caller's data untouched.
    return series.map(_clean)
#preproccessing(data['clean_reviews'] )

In [3]:
# Store the cleaned text in its own column, then preview it.
data['clean_reviews'] = preproccessing(data['reviews'])
data['clean_reviews']

0       plot two teen coupl go church parti drink driv...
1       happi bastard quick movi review damn k bug got...
2       movi like make jade movi viewer thank invent t...
3       quest camelot warner bro first featur length f...
4       synopsi mental unstabl man undergo psychothera...
                              ...                        
1995    wow movi everyth movi funni dramat interest we...
1996    richard gere command actor not alway great fil...
1997    glori star matthew broderick denzel washington...
1998    steven spielberg second epic film world war ii...
1999    truman true man burbank perfect name jim carre...
Name: clean_reviews, Length: 2000, dtype: object

In [6]:
import string
# Reference only: the ASCII punctuation characters listed by the standard library.
# preproccessing does not use this constant; it replaces every non-letter
# character via the regex '[^a-zA-Z]'.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [27]:
# Create Pipeline
# Classifier: multinomial naive Bayes.
Multinomial = MultinomialNB()
# Encoder: lower-cases the text and turns it into a token-count matrix.
countVecto = CountVectorizer()
# Scaler: re-weights the raw counts by how informative each term is (tf-idf).
TfidfVecto = TfidfTransformer()

# Features stay as raw text and labels as the plain label column. The
# vectorizer is a pipeline step, so it is fitted on the training part of each
# cross-validation split; fitting it here (or on the labels) would only build
# throw-away matrices and refit the shared vectorizer on the wrong data.
X = data['clean_reviews']
y = data.target

# Step names are the prefixes used in the search grid ("<step>__<parameter>").
pipe = Pipeline([('vect', countVecto), ('tfidf', TfidfVecto), ('clf', Multinomial)])

In [33]:
# Search space for the grid search. Each key is "<pipeline step>__<parameter>";
# each value lists the candidates to try (3 x 2 x 2 = 12 combinations).
parameters = dict(
    # Ignore terms that occur in more than this fraction of the documents.
    vect__max_df=(0.5, 0.75, 1.0),
    # Unigrams only, or unigrams plus bigrams.
    vect__ngram_range=((1, 1), (1, 2)),
    # Additive smoothing strength of the naive Bayes classifier.
    clf__alpha=(1e-05, 1e-06),
)
#grid.best_params_

In [34]:
# Exhaustive search over `parameters`, tuning the vectorizer and the model
# together as one pipeline; single process, with progress messages.
grid = GridSearchCV(estimator=pipe, param_grid=parameters, verbose=1, n_jobs=1)

In [35]:
# Cross-validate every candidate on the cleaned text and its labels, then
# show the mean cross-validated score of the best candidate.
grid.fit(data['clean_reviews'], data['target'])
grid.best_score_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


0.7705

⚠️ Please push the exercise once you are done 🙃

## 🏁 