In [3]:
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from switcher_class import ClfSwitcher


In [17]:
df = pd.read_csv("comments_train.csv")
X = df['comment']
y = df['sentiment'].map({"Positive":1,"Negative":0})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', ClfSwitcher()),
    ])

parameters = [
    {
        'clf__estimator': [RandomForestClassifier()],
        'tfidf__stop_words': ['french', None],
        'tfidf__ngram_range' : [(1,1),(1,2)],
        'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        'clf__estimator__n_estimators': [100,300],
        'clf__estimator__max_depth': [4,6],
        'clf__estimator__min_samples_leaf': [3,5,10],
    },
    {
        'clf__estimator': [LogisticRegression()],
        'tfidf__stop_words': ['french', None],
        'tfidf__ngram_range' : [(1,1),(1,2)],
        'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        'clf__estimator__penalty': ['l1','l2'],
        'clf__estimator__C': [1,10,100],
    },
    {
        'clf__estimator': [GaussianNB()],
        'tfidf__stop_words': ['french', None],
        'tfidf__ngram_range' : [(1,1),(1,2)],
        'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        'clf__estimator__var_smoothing': (1e-5, 1e-3, 1e-1),
    },

]


In [18]:
gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, scoring = 'roc_auc',verbose=1)
gscv.fit(X_train,y_train)

Fitting 5 folds for each of 336 candidates, totalling 1680 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   48.3s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1680 out of 1680 | elapsed:  2.2min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf', ClfSwitcher())]),
             n_jobs=-1,
             param_grid=[{'clf__estimator': [RandomForestClassifier()],
                          'clf__estimator__max_depth': [4, 6],
                          'clf__estimator__min_samples_leaf': [3, 5, 10],
                          'clf__estimator__n_estimators': [100, 300],
                          'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
                          'tfidf__ngram_range': [(1, 1), (1, 2...
                          'clf__estimator__penalty': ['l1', 'l2'],
                          'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
                          'tfidf__ngram_range': [(1, 1), (1, 2)],
                          'tfidf__stop_words': ['french', None]},
                         {'clf__estimator': [GaussianNB()],
                          'clf__estimator__var_smoothing': (1e-05, 0.001, 0.1),
     

In [25]:
model = gscv.best_estimator_
print(model)
preds = model.predict(X_test)
sklearn.metrics.roc_auc_score(preds,y_test)

0.8410287081339713