In [1]:
import pandas as pd
import spacy
import matplotlib.pyplot as plt
from urllib.parse import urlparse
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
fn = "df_st_s.pk" # df stem SnowballStemmer
df = pd.read_pickle(fn)
df.sample(5)

Unnamed: 0,labels,text,text_stem
717,1,I am the first Latina to run for governor of T...,"[i, am, the, first, latina, to, run, for, gove..."
4799,1,Study after study have shown that sanctuary ci...,"[studi, after, studi, have, shown, that, sanct..."
1449,1,"Says state Sen. Randy Hopper, R-Fond du Lac, u...","[say, state, sen., randi, hopper,, r-fond, du,..."
633,1,"What we have now is the most generous, in my o...","[what, we, have, now, is, the, most, generous,..."
6949,0,Says Madison Mayor Paul Soglins stated intent ...,"[say, madison, mayor, paul, soglin, state, int..."


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB

Y = df["labels"]
X = df["text_stem"].map(lambda l: " ".join(l))

pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer()), 
        ("mnb", MultinomialNB(class_prior=None))
    ]
)

param_grid = {
    
    
    "tfidf__ngram_range": [(1,1),(1,2)],
    "tfidf__max_df": [0.95,0.90,0.85,0.80,0.75,0.70,0.65,0.60,0.55,0.50, 0.45, 0.40],
    "tfidf__min_df": [0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005],
    "tfidf__binary": [True,False],
    "tfidf__smooth_idf": [True, False],
    "tfidf__sublinear_tf": [True, False],
    "mnb__alpha": [1, 4, 16, 64, 128, 256, 512],
    "mnb__fit_prior": [False]
    
    
    
}

SCORING = "f1"


grid = GridSearchCV(
    pipeline,
    n_jobs=-1,
    param_grid=param_grid,
    verbose=5,
    # réentrainer le meilleur modèle sur toutes les données
    refit=True,
    scoring=SCORING,
)





grid.fit(X, Y)

Fitting 5 folds for each of 8064 candidates, totalling 40320 fits
[CV 3/5] END mnb__alpha=1, mnb__fit_prior=False, tfidf__binary=True, tfidf__max_df=0.95, tfidf__min_df=0.0001, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=True;, score=0.721 total time=   1.1s
[CV 4/5] END mnb__alpha=1, mnb__fit_prior=False, tfidf__binary=True, tfidf__max_df=0.95, tfidf__min_df=0.0001, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=True;, score=0.723 total time=   0.9s
[CV 2/5] END mnb__alpha=1, mnb__fit_prior=False, tfidf__binary=True, tfidf__max_df=0.95, tfidf__min_df=0.0001, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=False;, score=0.722 total time=   0.9s
[CV 3/5] END mnb__alpha=1, mnb__fit_prior=False, tfidf__binary=True, tfidf__max_df=0.95, tfidf__min_df=0.0001, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=False;, score=0.721 total time=   0.9s
[CV 1/5] END mnb__alpha=1, mnb__fit_prior=False, tfi

In [6]:
grid.best_estimator_

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(binary=True, max_df=0.4, min_df=0.0001,
                                 ngram_range=(1, 2), smooth_idf=False,
                                 sublinear_tf=True)),
                ('mnb', MultinomialNB(alpha=4, fit_prior=False))])

In [7]:
grid.best_score_

0.7605851982771472

In [9]:
len(grid.best_estimator_['tfidf'].vocabulary_)

93404

In [10]:
import joblib
joblib.dump(grid, '4_ml_nb_sts_tfidf_best.pkl')

['4_ml_nb_sts_tfidf_best.pkl']