In [53]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV

In [72]:
train_df = pd.read_csv('sentiment_df.csv')
train_df.reset_index(drop=True)
train_df = train_df.drop_duplicates(subset=['review'], keep=False)
'''Датасет все еще немного несбалансированный, поэтому сбалансируем ручками'''
neg_df = train_df.loc[train_df['mark'] == 0]
pos_df = train_df.loc[train_df['mark'] == 1].sample(n=1500, random_state=42)  
balanced_train_df = pd.concat((pos_df, neg_df))
X_train, y_train = balanced_train_df['review'], balanced_train_df['mark']

In [73]:
pipe_count_bigrams = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
                            ('lr', LogisticRegression(n_jobs=-1))])
cross_score_bigrams = cross_val_score(
    pipe_count_bigrams, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1)
print('(Count Bigram + Unigram Vectorizer + LR) mean score: ', cross_score_bigrams.mean())

(Count Bigram + Unigram Vectorizer + LR) mean score:  0.776962426105895


In [None]:
parameters = {'vectorizer__min_df': [0, 0.05, 0.1, 0.2],
              'vectorizer__max_df': [1, 0.95, 0.9],
              'lr__penalty': ['l1', 'l2'],
              'lr__C': [1, 0.9, 0.5, 0.05]
              }
grid = GridSearchCV(pipe_count_bigrams, parameters, cv=5, verbose=10)
grid.fit(X_train, y_train)

In [75]:
cross_val_score(
            grid.best_estimator_, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1).mean()

0.776962426105895

In [76]:
import pickle

best_pipe = grid.best_estimator_
best_pipe.fit(X_train, y_train)

with open("model.pkl", "wb") as f:
    pickle.dump(best_pipe, f)  # Сохраним модель, чтобы в дальнейшем просто вызывать predict