In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.model_selection import train_test_split

In [2]:
import nltk

# Fetch the NLTK packages in the same order as before.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dariarousset/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dariarousset/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dariarousset/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Show what is available in the raw-data directory (relative to the notebook's working directory).
os.listdir('../raw_data/')

['.DS_Store', 'french_tweets.csv', 'test.csv', '.keep', 'train.csv']

In [10]:
# Load the labelled French tweets and keep a random 50,000-row subset.
# The sample is seeded: without `random_state` every kernel restart draws a
# different subset, so the class counts below, the grid-search score and the
# persisted model could never be reproduced.
french = (
    pd.read_csv('../raw_data/french_tweets.csv')
    .sample(50000, random_state=42)
    .reset_index(drop=True)
)
# Check how balanced the two label classes are in the sample.
french.label.value_counts()

0    25435
1    24565
Name: label, dtype: int64

In [38]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import TweetTokenizer
import spacy
# NLTK's French stop-word list as a set.
# NOTE(review): `stop` is not referenced anywhere else in the visible notebook.
stop = set(stopwords.words('french'))
# spaCy's `fr_core_news_md` French pipeline; clean_text calls it to lemmatise each tweet.
nlp=spacy.load("fr_core_news_md")


# Maps every ASCII punctuation mark except '#' to a space ('#' is left untouched).
_PUNCT_TO_SPACE = str.maketrans(
    {mark: ' ' for mark in string.punctuation if mark != '#'}
)


def clean_text(text):
    """Normalise one raw tweet into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text;
      2. replace ASCII punctuation (except '#') with spaces;
      3. drop digit characters (the surrounding letters of a word are kept);
      4. run the module-level spaCy pipeline ``nlp`` and keep each token's lemma.

    Stop words are NOT removed here.

    Parameters
    ----------
    text : str
        Raw tweet. Any non-string value (e.g. the NaN pandas uses for a
        missing CSV cell) is treated as an empty tweet instead of raising
        ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        Lemmas joined by single spaces; ``""`` for empty or non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # One C-level pass instead of one str.replace call per punctuation mark.
    text = text.translate(_PUNCT_TO_SPACE)
    text = ''.join(char for char in text if not char.isdigit())

    doc = nlp(text)
    # Replacing punctuation with spaces creates runs of blanks, which spaCy
    # emits as whitespace-only tokens; skip them so they do not end up in the
    # output as extra spaces.
    lemmas = [token.lemma_ for token in doc if not token.is_space]
    return " ".join(lemmas)

In [12]:
# Clean every tweet in the sample; this runs clean_text row by row and stores the result in a new column.
french["clean_text"] = french["text"].apply(clean_text)

In [15]:
# Eyeball a few rows to compare the raw text with its cleaned version.
french.head()

Unnamed: 0,label,text,clean_text
0,0,La peinture me rend achy,le peinture me rendre achy
1,0,Les fans de nk ne sont-ils pas les meilleurs? ...,le fan de nk ne être il pas le meilleur je d...
2,1,Une autre belle journée. Coincé dans le bureau...,un autre bel journée coincer dans le bureau ...
3,0,": Dee, vous avez acheté une autre mini jupe? E...",dee vous avoir acheter un autre mini jup ...
4,1,Mon domestique à Hollywood a été éliminé! Trav...,mon domestique à hollywood avoir être éliminer...


In [18]:
# 2nd iteration
# Fine tuning the model
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

X = french['clean_text']
y = french['label']

# Create Pipeline: TF-IDF features feeding a multinomial naive Bayes classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1, 1), (2, 2), (1, 2), (2, 3), (3, 3)),
    'nb__alpha': (0.1, 0.5, 1, 5, 10),
    # max_df values must be floats to be read as a proportion of documents.
    # The integer 1 would mean "drop every term that appears in more than ONE
    # document", not "no upper limit", so the last entry is written as 1.0.
    'tfidf__max_df': [0.60, 0.65, 0.70, 0.75, 0.85, 1.0],
}
# NOTE(review): this grid has 5 * 5 * 6 = 150 candidates. The output saved with
# this cell reports 300 candidates and a `tfidf__lowercase` option, so it comes
# from an earlier version of the grid; re-run the cell to refresh it.

# Perform grid search: 10-fold cross-validation scored on F1, using all cores,
# then refit the best combination on the full data.
grid_search = GridSearchCV(
    pipeline,
    parameters,
    scoring="f1",
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X, y)

Fitting 10 folds for each of 300 candidates, totalling 3000 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 0.5, 1, 5, 10),
                         'tfidf__lowercase': (True, False),
                         'tfidf__max_df': [0.6, 0.65, 0.7, 0.75, 0.85, 1],
                         'tfidf__ngram_range': ((1, 1), (2, 2), (1, 2), (2, 3),
                                                (3, 3))},
             scoring='f1', verbose=1)

In [19]:
# Hyper-parameter combination that obtained the best mean cross-validated score.
grid_search.best_params_

{'nb__alpha': 0.5,
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.6,
 'tfidf__ngram_range': (1, 2)}

In [20]:
# Mean cross-validated score (the search uses scoring="f1") of that best combination.
grid_search.best_score_

0.723020073081222

In [22]:
# Persist the sampled tweets together with their cleaned text.
# The DataFrame index is written as well (pandas default), so the file starts with an unnamed column.
french.to_csv("french_clean.csv")

In [23]:
import joblib 
# Serialise the whole fitted GridSearchCV object (not only its best_estimator_) to disk.
joblib.dump(grid_search,'ml_model_nb.pkl')

['ml_model_nb.pkl']

In [None]:
# Reload the persisted search object from disk.
# NOTE(review): the return value is not bound to a name, so the cells below keep
# using the in-memory `grid_search`; only load pickles from a trusted source.
joblib.load("ml_model_nb.pkl")

In [46]:
# Load the tweets to score; the next cell reads their text from the `tweet` column.
predict = pd.read_csv("tweet_psg.csv")

In [47]:
# Apply the same cleaning function that was used on the training tweets.
predict["clean_text"] = predict["tweet"].apply(clean_text)

In [48]:
# Cleaned text that is fed to the fitted pipeline for prediction.
X_test = predict['clean_text']

In [49]:
# Predict a label for each tweet with the refitted best pipeline.
# The result is wrapped in a one-column DataFrame whose column keeps the default name 0.
prediction = pd.DataFrame(grid_search.best_estimator_.predict(X_test))

In [50]:
# Put the predictions next to the input rows (column-wise concat, aligned on the index).
real_twits_prediction = pd.concat([predict,prediction], axis=1)

In [51]:
# Write the tweets with their predicted labels to disk (the index is included by default).
real_twits_prediction.to_csv("prediction.csv")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ce82c844-fa70-433d-b480-148c6a0f0f8b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>