In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.model_selection import train_test_split

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dariarousset/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dariarousset/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dariarousset/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
os.listdir('../raw_data/')

['.DS_Store',
 'french_tweets.csv',
 'test.csv',
 'train_allo.csv',
 '.keep',
 'train.csv']

In [5]:
french = pd.read_csv('../raw_data/train_allo.csv').sample(20000).reset_index(drop=True)
french.polarity.value_counts()

0    10039
1     9961
Name: polarity, dtype: int64

In [6]:
french.head()

Unnamed: 0.1,Unnamed: 0,film-url,review,polarity
0,3970,http://www.allocine.fr/film/fichefilm-51316/cr...,"Un film très fort, à l'ambiance unique . Remet...",1
1,91720,http://www.allocine.fr/film/fichefilm-178820/c...,"donc, avoir 20 partenaires dans sa vie, c'est ...",0
2,34206,http://www.allocine.fr/film/fichefilm-261277/c...,Les acteurs sont relativement bons. .... Le re...,0
3,109119,http://www.allocine.fr/film/fichefilm-145730/c...,Un des meilleurs films que j'aie vu ces dernie...,1
4,36611,http://www.allocine.fr/film/fichefilm-192857/c...,"A la fois très ennuyeux et prétentieux, d'une ...",0


In [8]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import TweetTokenizer
import spacy
stop = set(stopwords.words('french'))
nlp=spacy.load("fr_core_news_md")


def clean_text(text):
    #emojis=demoji.findall(text)
    #if emojis != {}:
        #for key,value in emojis.items(): 
            #if key in text:
                #try:
                    #translated_text=ts.translate_html(value, translator=ts.google, to_language='fr', n_jobs=-1)
                    #text=text.replace(key,translated_text)
                #except TypeError:
                    #pass
    # lower text
    text = text.lower()
    # remove puncutation
    for punctuation in string.punctuation.replace('#',''):
        text = text.replace(punctuation, ' ')
    # remove words that contain numbers
    text = ''.join(letter for letter in text if not letter.isdigit())
    #tokenization + remove stop words
    doc=nlp(text)
    lemmatized= [token.lemma_ for token in doc]
    # join all
    text = " ".join(lemmatized)
    return text

In [9]:
french["clean_text"] = french["review"].apply(clean_text)

In [10]:
french.head()

Unnamed: 0.1,Unnamed: 0,film-url,review,polarity,clean_text
0,3970,http://www.allocine.fr/film/fichefilm-51316/cr...,"Un film très fort, à l'ambiance unique . Remet...",1,un film très fort à l ambiance unique rem...
1,91720,http://www.allocine.fr/film/fichefilm-178820/c...,"donc, avoir 20 partenaires dans sa vie, c'est ...",0,donc avoir partenaire dans son vie c êtr...
2,34206,http://www.allocine.fr/film/fichefilm-261277/c...,Les acteurs sont relativement bons. .... Le re...,0,le acteur être relativement bon le rest...
3,109119,http://www.allocine.fr/film/fichefilm-145730/c...,Un des meilleurs films que j'aie vu ces dernie...,1,un de meilleur film que j avoir voir ce dernie...
4,36611,http://www.allocine.fr/film/fichefilm-192857/c...,"A la fois très ennuyeux et prétentieux, d'une ...",0,avoir le fois très ennuyeux et prétentieux d...


In [11]:
# 2nd iteration
# Fine tuning the model
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

X= french['clean_text']
y= french['polarity']

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2), (1,2), (2,3), (3,3)),
    'nb__alpha': (0.1, 0.5, 1, 5, 10),
    'tfidf__max_df': [0.60, 0.65, 0.70, 0.75, 0.85, 1],
}

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "f1", 
                           refit=True, cv=10)

grid_search.fit(X,y)

Fitting 10 folds for each of 150 candidates, totalling 1500 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 0.5, 1, 5, 10),
                         'tfidf__max_df': [0.6, 0.65, 0.7, 0.75, 0.85, 1],
                         'tfidf__ngram_range': ((1, 1), (2, 2), (1, 2), (2, 3),
                                                (3, 3))},
             scoring='f1', verbose=1)

In [12]:
grid_search.best_params_

{'nb__alpha': 0.5, 'tfidf__max_df': 0.65, 'tfidf__ngram_range': (1, 2)}

In [13]:
grid_search.best_score_

0.9108041148622968

In [14]:
french.to_csv("allo_clean.csv")

In [15]:
import joblib 
joblib.dump(grid_search,'ml_model_nb_allo.pkl')

['ml_model_nb_allo.pkl']

In [16]:
joblib.load("ml_model_nb.pkl")

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 0.5, 1, 5, 10),
                         'tfidf__lowercase': (True, False),
                         'tfidf__max_df': [0.6, 0.65, 0.7, 0.75, 0.85, 1],
                         'tfidf__ngram_range': ((1, 1), (2, 2), (1, 2), (2, 3),
                                                (3, 3))},
             scoring='f1', verbose=1)

In [22]:
predict = pd.read_csv("tweet_psg.csv")

In [23]:
predict["clean_text"] = predict["tweet"].apply(clean_text)

In [24]:
X_test = predict['clean_text']

In [25]:
prediction = pd.DataFrame(grid_search.best_estimator_.predict(X_test))

In [26]:
real_twits_prediction = pd.concat([predict,prediction], axis=1)

In [27]:
real_twits_prediction.to_csv("prediction_allo.csv")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ce82c844-fa70-433d-b480-148c6a0f0f8b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>