In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [2]:
# Both CSV files are parsed with the same options; the first column becomes the index.
csv_options = dict(encoding='utf', engine='python', index_col=0)
train = pd.read_csv('train_data-2.csv', **csv_options)
test = pd.read_csv('test_data-2.csv', **csv_options)

In [5]:
# --- Features -------------------------------------------------------------
# Word-level TF-IDF over 1- to 4-grams with NLTK's Russian stop words removed.
# max_df=0.8 is a document proportion, min_df=10 is an absolute document count.
tf_idf = TfidfVectorizer(ngram_range=(1, 4), stop_words=stopwords.words('russian'),
                         analyzer='word',
                         max_df=0.8,
                         min_df=10,
                        )

# fit_transform learns the vocabulary/IDF and vectorizes the training texts in
# a single pass. fit() returns the vectorizer itself, so tf_idf_model is kept
# as an alias of tf_idf for any code that refers to it.
train_tf_idf_vec = tf_idf.fit_transform(train['text'])
tf_idf_model = tf_idf
test_tf_idf_vec = tf_idf_model.transform(test['text'])

# Integer-encode the target once; codes follow the order in which the distinct
# values first appear in train.score (code 0 = first value encountered).
train_labels = pd.factorize(train.score)[0]

# --- Model and hyper-parameter search -------------------------------------
# The solver is set explicitly because the grid below includes penalty='l1':
# liblinear supports both 'l1' and 'l2', whereas the default solver of recent
# scikit-learn releases (lbfgs) rejects 'l1' and would make half of the
# candidates fail.
lm = LogisticRegression(
                        penalty='l2',
                        solver='liblinear',
                        random_state=42,
                        C=10,
                        max_iter=100000)

# penalty and C set on the estimator above are overridden by the grid.
lm_params = {'penalty':['l1', 'l2'],
             'C':np.arange(0.1,5,0.1),
}

# One unshuffled 10-fold stratified splitter shared by the search and by the
# re-scoring below (an integer cv=10 resolves to the same splitter for a
# classifier).
cv_folds = StratifiedKFold(10)

lm_search = GridSearchCV(estimator=lm,
                         param_grid=lm_params,
                         scoring ='roc_auc',
                         cv=cv_folds,
                         n_jobs=-1,
                         verbose=1)

lm_search_fitted = lm_search.fit(X=train_tf_idf_vec, y=train_labels)

# Per-fold ROC AUC of the selected configuration.
# NOTE(review): the vectorizer was fitted on all training rows before
# splitting and the hyper-parameters were chosen on these same folds, so these
# scores are optimistic estimates.
pred_scores = cross_val_score(estimator=lm_search_fitted.best_estimator_,
                              X=train_tf_idf_vec,
                              y=train_labels,
                              scoring='roc_auc',
                              cv=cv_folds,
                              n_jobs=-1)

# --- Submission -----------------------------------------------------------
# Column 0 of predict_proba is the probability of label code 0, i.e. of the
# first distinct value of train.score in row order.
# NOTE(review): confirm this is the class the submission is expected to score.
predicts = lm_search_fitted.best_estimator_.predict_proba(test_tf_idf_vec)[:, 0]

sub = pd.DataFrame({'index': range(0, len(predicts)),
                    'score':predicts})
sub.to_csv('rang.csv', index=False)

Fitting 10 folds for each of 98 candidates, totalling 980 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   40.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 980 out of 980 | elapsed:  1.9min finished
