In [1]:
import pandas as pd
import spacy
import matplotlib.pyplot as plt
from urllib.parse import urlparse
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
fn = "df_st_ps.pk" # df stem porterstemmer
df = pd.read_pickle(fn)
df.sample(5)

Unnamed: 0,labels,text,text_stem
6173,1,When you were casting your vote for Republican...,"[when, you, were, cast, your, vote, for, repub..."
7593,1,Ben Unger supported a 15 percent property tax ...,"[ben, unger, support, a, 15, percent, properti..."
1143,0,"Fifty-one percent -- that is, a majority of Am...","[fifty-on, percent, --, that, is,, a, major, o..."
9331,0,Richard Nixon released tax returns when he was...,"[richard, nixon, releas, tax, return, when, he..."
7793,1,A proposed revenue smart cap gives Floridians ...,"[a, propos, revenu, smart, cap, give, floridia..."


In [3]:
#import joblib
#### joblib.dump(grid, '4_ml_nb_st_ps_tfidf_best.pkl') 

In [4]:
#grid = joblib.load('4_ml_nb_st_ps_tfidf_best.pkl')

In [5]:
#len(grid.best_estimator_['tfidf'].vocabulary_)

In [6]:
#grid.best_score_
#grid.cv_results_['mean_test_score']

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB

Y = df["labels"]
X = df["text_stem"].map(lambda l: " ".join(l))

pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer()), 
        ("mnb", MultinomialNB(class_prior=None))
    ]
)

param_grid = {
    
    "tfidf__ngram_range": [(1,1),(1,2)],
    "tfidf__max_df": [0.95,0.90,0.85,0.80,0.75,0.70,0.65,0.60,0.55,0.50, 0.45, 0.40],
    "tfidf__min_df": [0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005],
    "tfidf__binary": [True,False],
    "tfidf__smooth_idf": [True, False],
    "tfidf__sublinear_tf": [True, False],
    
    "mnb__alpha": [1, 4, 16, 64, 128, 256, 512],
    "mnb__fit_prior": [False]
}

SCORING = "f1"


grid = GridSearchCV(
    pipeline,
    n_jobs=-1,
    param_grid=param_grid,
    verbose=5,
    # réentrainer le meilleur modèle sur toutes les données
    refit=True,
    scoring=SCORING,
)


grid.fit(X, Y)

Fitting 5 folds for each of 8064 candidates, totalling 40320 fits
[CV 1/5] END mnb__alpha=1, mnb__fit_prior=False, tfidf__binary=True, tfidf__max_df=0.95, tfidf__min_df=0.0001, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=True;, score=0.718 total time=   0.4s
[CV 4/5] END mnb__alpha=1, mnb__fit_prior=False, tfidf__binary=True, tfidf__max_df=0.95, tfidf__min_df=0.0001, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=True;, score=0.723 total time=   0.3s
[CV 2/5] END mnb__alpha=1, mnb__fit_prior=False, tfidf__binary=True, tfidf__max_df=0.95, tfidf__min_df=0.0001, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=False;, score=0.722 total time=   0.3s
[CV 5/5] END mnb__alpha=1, mnb__fit_prior=False, tfidf__binary=True, tfidf__max_df=0.95, tfidf__min_df=0.0001, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=False;, score=0.711 total time=   0.3s
[CV 3/5] END mnb__alpha=1, mnb__fit_prior=False, tfi

GridSearchCV(estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('mnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'mnb__alpha': [1, 4, 16, 64, 128, 256, 512],
                         'mnb__fit_prior': [False],
                         'tfidf__binary': [True, False],
                         'tfidf__max_df': [0.95, 0.9, 0.85, 0.8, 0.75, 0.7,
                                           0.65, 0.6, 0.55, 0.5, 0.45, 0.4],
                         'tfidf__min_df': [0.0001, 0.00025, 0.0005, 0.001,
                                           0.0025, 0.005],
                         'tfidf__ngram_range': [(1, 1), (1, 2)],
                         'tfidf__smooth_idf': [True, False],
                         'tfidf__sublinear_tf': [True, False]},
             scoring='f1', verbose=5)

In [8]:
grid.best_estimator_

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(binary=True, max_df=0.4, min_df=0.0001,
                                 ngram_range=(1, 2), smooth_idf=False,
                                 sublinear_tf=True)),
                ('mnb', MultinomialNB(alpha=4, fit_prior=False))])

In [9]:
grid.best_score_

0.7605532730945174

In [10]:
len(grid.best_estimator_['tfidf'].vocabulary_)

93577

In [11]:
grid.cv_results_['mean_test_score']

array([0.71850249, 0.71850249, 0.71650461, ..., 0.75842233, 0.75789367,
       0.75835774])

[CV 1/5] END mnb__alpha=512, mnb__fit_prior=False, tfidf__binary=False, tfidf__max_df=0.4, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=True;, score=0.758 total time=   2.5s
[CV 2/5] END mnb__alpha=512, mnb__fit_prior=False, tfidf__binary=False, tfidf__max_df=0.4, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=True;, score=0.762 total time=   1.6s
[CV 5/5] END mnb__alpha=512, mnb__fit_prior=False, tfidf__binary=False, tfidf__max_df=0.4, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=True;, score=0.750 total time=   2.2s
[CV 3/5] END mnb__alpha=512, mnb__fit_prior=False, tfidf__binary=False, tfidf__max_df=0.4, tfidf__min_df=0.005, tfidf__ngram_range=(1, 1), tfidf__smooth_idf=True, tfidf__sublinear_tf=False;, score=0.755 total time=   2.3s
[CV 1/5] END mnb__alpha=512, mnb__fit_prior=False, tfidf__binary=False, tfidf__max_df=0.4, tfidf__min_df=0.005, tfi

In [12]:
import joblib
joblib.dump(grid, '4_ml_nb_st_ps_tfidf_best.pkl')

['4_ml_nb_st_ps_tfidf_best.pkl']