In [1]:
import numpy as np
import pandas as pd

# Data loading and cleaning

In [2]:
# Load the cleaned training features and labels from raw_data
X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../raw_data/clean_sentiment_data/X_train.csv',
        '../../raw_data/clean_sentiment_data/y_train.csv',
    )
)
# The test split is intentionally left unloaded in this notebook:
# X_test = pd.read_csv('../../raw_data/clean_sentiment_data/X_test.csv')
# y_test = pd.read_csv('../../raw_data/clean_sentiment_data/y_test.csv')

In [3]:
# Turn y into a pd.Series by keeping only the sentiment column
y_train = y_train.loc[:, 'Answer.sentiment']

# Binary classification

## Data cleaning

In [4]:
# Binarise the sentiment score: 0 when the score is <= 0, otherwise 1.
# A vectorised comparison replaces the per-row Python lambda. `~(score <= 0)` is
# used rather than `score > 0` so that a missing score still maps to 1, exactly
# as the original lambda did.
y_train_bin = (~y_train.le(0)).astype('int64')

In [5]:
# Preview only: the result is not assigned, so X_train itself keeps every column
# (a later cell still drops 'clip' and 'ID' from a copy of the validation split).
X_train.drop(['Segment_ID', 'clip', 'ID'], axis='columns')

Unnamed: 0,Phrase
0,But (uhh) so yeah
1,The movie ruins it for itself
2,"(umm) The acting's subpar, the screen the scr..."
3,"And now, because of that, Machinima got intimi..."
4,I thought something would have to descend on m...
...,...
10837,"When we're talking about a verb, a verb can ha..."
10838,"If you are continuing at OSU, I hope you have ..."
10839,"I come in and I say ""Okay, hi everybody I'm go..."
10840,"This time, in the birthplace of modern innovat..."


## Basic preprocessing

### Lowercase

In [6]:
def lower(x: str) -> str:
    """Return a lowercase copy of the text ``x``."""
    lowered = x.lower()
    return lowered

In [7]:
# Start the cleaned-text column from a lowercased copy of each phrase
X_train['clean_text'] = X_train['Phrase'].map(lower)

### Remove punctuation

In [8]:
import string

# Built once: maps every character of string.punctuation to None (i.e. deletes it).
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def punct(x):
    """Return ``x`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``.
    A single ``str.translate`` pass replaces the previous loop of one
    ``str.replace`` call per punctuation character; the output is the same.
    """
    return x.translate(_PUNCT_TABLE)

In [9]:
# Strip punctuation from the lowercased text
X_train['clean_text'] = X_train['clean_text'].map(punct)

### Remove numbers

In [10]:
def del_num(x):
    """Return ``x`` without the characters for which ``str.isdigit()`` is true."""
    return ''.join(char for char in x if not char.isdigit())

In [11]:
# Drop digits from the punctuation-free text
X_train['clean_text'] = X_train['clean_text'].map(del_num)

## First model with bag of words CountVectorizer & MultinomialNB

In [12]:
# Hold out 20% of X_train as a validation set.
# random_state is fixed so that the split -- and with it the validation scores and
# the exported misclassification file -- is the same after every kernel restart.
from sklearn.model_selection import train_test_split

X_t, X_val, y_t, y_val = train_test_split(
    X_train, y_train_bin, test_size=0.2, random_state=42
)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: learn the vocabulary on the training split and encode it in one call
vectorizer = CountVectorizer()
X_t_vect1 = vectorizer.fit_transform(X_t['clean_text'])

In [14]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so creation and training are chained;
# the bare name on the last line displays the fitted model, as the cell did before.
nb_model = MultinomialNB().fit(X_t_vect1, y_t)
nb_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# Encode the validation texts with the vectorizer already fitted on the training split
X_val_vect1 = vectorizer.transform(X_val.loc[:, 'clean_text'])

In [21]:
# Mean accuracy of the Naive Bayes model on the validation set, rounded to 2 decimals
round(nb_model.score(X=X_val_vect1, y=y_val), ndigits=2)

0.7

## Exploring the predictions of model 1

In [22]:
# Predicted class for every validation phrase
y_val_pred = nb_model.predict(X=X_val_vect1)

In [25]:
# Work on an independent copy so the validation frame itself is left untouched
results = X_val.copy(deep=True)

In [26]:
# Attach the true labels (aligned on the index)
results = results.assign(y_val=y_val)

In [29]:
# Attach the model predictions (positional: one prediction per validation row)
results = results.assign(y_pred=y_val_pred)

In [39]:
# Remove the columns that are not needed to inspect the predictions
results = results.drop(['clip', 'ID', 'Phrase'], axis='columns')

In [40]:
# 1 when the prediction matches the true label, 0 otherwise (both are 0/1 values)
label_gap = (results['y_val'] - results['y_pred']).abs()
results['results'] = 1 - label_gap

In [41]:
# (rows, columns) of the validation results table
results.shape

(2169, 5)

In [43]:
# Keep only the misclassified rows (flag == 0)
tete_faux = results.loc[results['results'].eq(0)]

In [47]:
# Export the misclassified rows for manual inspection; the index is not written
tete_faux.to_csv(path_or_buf='False_nlp_ml_m1.csv', index=False)

## TF-IDF vectorizer

In [58]:
# Build a TF-IDF + Naive Bayes pipeline and a grid search over the vectorizer parameters

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ('vect', TfidfVectorizer()),
    ('model', MultinomialNB())
])

# min_df is given as integer document counts. scikit-learn reads a float min_df as a
# proportion of documents, so values such as 0.8 / 0.9 demand that a term occurs in
# at least 80-90% of the phrases: that either contradicts max_df (ValueError) or
# leaves an empty vocabulary, and those candidates only produce NaN scores.
# NOTE(review): max_features is capped at 60 terms here -- confirm this small
# vocabulary is intended.
grid_params = {'vect__max_df': [0.7, 0.8, 0.9],
              'vect__max_features': [30, 40, 50, 60],
              'vect__ngram_range': [(1,1), (2,2)],
              'vect__min_df': [1, 2, 5]}

search = GridSearchCV(pipe, grid_params, n_jobs=-1, verbose=1, scoring='accuracy',cv=5)

In [51]:
# List every parameter of the pipeline and of its steps
pipe.get_params()

{'memory': None,
 'steps': [('vect',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=1.0, max_features=None,
                   min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
 'verbose': False,
 'vect': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=None,
                 min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words=None

In [59]:
# Run the 5-fold grid search on the cleaned training texts
search.fit(X=X_t['clean_text'], y=y_t)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   49.6s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [60]:
# Parameter combination selected by the grid search (scored on cross-validated accuracy)
search.best_params_

{'vect__max_df': 0.7,
 'vect__max_features': 50,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1)}

In [61]:
# Build the final pipeline from the grid-search winner instead of copying the values
# by hand, so this cell cannot drift from `search.best_params_` when the search is
# re-run. set_params() returns the (still unfitted) pipeline itself.
pipe_good = Pipeline([
    ('vect', TfidfVectorizer()),
    ('model', MultinomialNB())
]).set_params(**search.best_params_)

In [65]:
# Train the selected pipeline on the training split
pipe_good.fit(X=X_t['clean_text'], y=y_t)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.7, max_features=50,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [66]:
# Mean accuracy of the TF-IDF pipeline on the validation set, rounded to 2 decimals
round(pipe_good.score(X=X_val['clean_text'], y=y_val), ndigits=2)

0.66