In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import random
random.seed(1228)
from sklearn.feature_extraction.text import *
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
import re
from pymystem3 import Mystem
import numpy as np
import itertools
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from nltk.tokenize import toktok
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

import gensim
import string

from nltk.stem.snowball import RussianStemmer
import seaborn as sns

In [185]:
clean_data = pd.read_csv("clean_data.csv")
clean_data.head()

Unnamed: 0.1,Unnamed: 0,id,rank,relevant,text
0,0,1,1,1,я очень расстроена! пошли в межсерверную битв...
1,1,2,5,1,"уважаемые разработчики, благодарю за ваш отве..."
2,2,3,5,1,играю с ноября до сих пор не надоедает доба...
3,3,4,2,1,"у меня клара виснит,она бывает прозрачной или..."
4,4,5,5,0,лучшая игра!


In [None]:
#lemmatisation, delete stop words

In [None]:
#lemmatization

In [175]:
from nltk.corpus import stopwords

In [186]:
%%time 
from pymystem3 import Mystem

m = Mystem()
def lemmatize(text, mystem=m):
    try:
        return "".join(m.lemmatize(text)).strip()  
    except:
        return "ошибка"



CPU times: user 50 µs, sys: 28 µs, total: 78 µs
Wall time: 77 µs


In [187]:
clean_data.text = clean_data.text.apply(lemmatize)
clean_data.head()

Unnamed: 0.1,Unnamed: 0,id,rank,relevant,text
0,0,1,1,1,я очень расстраивать! пойти в межсерверный бит...
1,1,2,5,1,"уважаемый разработчик, благодарить за ваш отве..."
2,2,3,5,1,играть с ноябрь до сей пора не надоедать доб...
3,3,4,2,1,"у я клара виснять,она бывать прозрачный или вк..."
4,4,5,5,0,хороший игра!


In [188]:
cnt = Counter()
n_types = []
n_tokens = []
tokens = []
for index, row in tqdm(clean_data.iterrows(), total = len(clean_data)):
    tokens = row['text'].split()
    cnt.update(tokens)
    n_types.append(len(cnt))
    n_tokens.append(sum(cnt.values()))
for i in cnt.most_common(10):
    print(i)

HBox(children=(IntProgress(value=0, max=214), HTML(value='')))


('не', 178)
('и', 154)
('игра', 145)
('я', 119)
('в', 102)
('что', 74)
('а', 61)
('на', 59)
('быть', 51)
('но', 43)


In [189]:
mystopwords = stopwords.words('russian') + ['это', 'наш' , 'тыс', 'млн', 'млрд', 'также',  'т', 'д', 'хотя',
                                            'мой', 'очень', 'день', ',', 'просто', 'этот', 'вообще','что',
                                            'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на', '...',
                                           'несколько','один', '2', '3', '1', '5', 'е', 'либо', '!', 'a',
                                           'нифиг', '2500', 'короче', '1800', 'игра', 'весь', 'который']
def  remove_stopwords(text, mystopwords = mystopwords):
    try:
        return " ".join([token for token in text.split() if not token in mystopwords])
    except:
        return "ошибка"

In [190]:
clean_data.text = clean_data.text.apply(remove_stopwords) 
clean_data.head()

Unnamed: 0.1,Unnamed: 0,id,rank,relevant,text
0,0,1,1,1,"расстраивать! пойти межсерверный битва, случат..."
1,1,2,5,1,"уважаемый разработчик, благодарить ваш ответ в..."
2,2,3,5,1,играть ноябрь сей пора надоедать добавлять инт...
3,3,4,2,1,"клара виснять,она бывать прозрачный вклеточка,..."
4,4,5,5,0,хороший игра!


In [193]:
from collections import Counter
from tqdm import tqdm_notebook as tqdm
cnt = Counter()
n_types = []
n_tokens = []
tokens = []
for index, row in tqdm(clean_data.iterrows(), total = len(clean_data)):
    tokens = row['text'].split()
    cnt.update(tokens)
    n_types.append(len(cnt))
    n_tokens.append(sum(cnt.values()))
for i in cnt.most_common(10):
    print(i)

HBox(children=(IntProgress(value=0, max=214), HTML(value='')))


('уровень', 36)
('играть', 28)
('время', 22)
('хотеть', 19)
('сделать', 17)
('проходить', 17)
('обновление', 14)
('реклама', 13)
('спасибо', 12)
('новый', 12)


In [29]:
#train CV 

In [111]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def cv(data):
    count_vectorizer = CountVectorizer(analyzer='word', tokenizer=word_tokenize, lowercase=False)
#analyzer='word', decode_error='replace', lowercase=False 
    emb = count_vectorizer.fit_transform(data)

    return emb, count_vectorizer

list_corpus = clean_data["text"].tolist()
list_labels = clean_data["relevant"].tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.3, 
                                                                                random_state=40)

X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [112]:
len(count_vectorizer.get_feature_names())

1074

In [18]:
#feature_names = count_vectorizer.get_feature_names()
#pd.DataFrame(X_train_counts.toarray(), columns = feature_names)

In [113]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection, naive_bayes, svm
from sklearn.ensemble import RandomForestClassifier

In [None]:
#lr

In [114]:
clf_lr = LogisticRegression()

In [115]:
param_dist_lr = { "penalty" : ['l2'],
                "C" : [0.001, 0.01, 0.1, 0.25, 0.5, 1.0, 3.4, 5.5, 7.5, 10, 15, 20, 30, 35],
                "solver" : ['newton-cg', 'lbfgs', 'sag', 'saga' ] }

In [116]:
grid_lr = GridSearchCV(clf_lr, param_dist_lr, scoring='f1_weighted', cv=10)
grid_lr.fit(X_train_counts, y_train)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)














GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 0.25, 0.5, 1.0, 3.4, 5.5, 7.5, 10, 15, 20, 30, 35], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=0)

In [117]:
print(grid_lr.best_params_)
print(grid_lr.best_score_)
print(grid_lr.best_estimator_)

{'C': 0.25, 'penalty': 'l2', 'solver': 'newton-cg'}
0.7623422110196686
LogisticRegression(C=0.25, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)


In [118]:
y_pred_lr = grid_lr.predict(X_test_counts)

In [119]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):  
    # true positives / (true positives+false positives)
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                    average='weighted')             
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, pos_label=None,
                              average='weighted')
    
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    
    # true positives + true negatives/ total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_lr)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.831, precision = 0.835, recall = 0.831, f1 = 0.831


In [None]:
#train TF IDF 

In [120]:
def tfidf(data):
    tfidf_vectorizer = TfidfVectorizer(
    analyzer="word", tokenizer=word_tokenize,
    preprocessor=None, max_features=None)

    emb = tfidf_vectorizer.fit_transform(data)

    return emb, tfidf_vectorizer

list_corpus2 = clean_data["text"].tolist()
list_labels2 = clean_data["relevant"].tolist()

X_train2, X_test2, y_train2, y_test2 = train_test_split(list_corpus2, list_labels2, test_size=0.3, 
                                                                                random_state=40)

X_train_counts2, tfidf_vectorizer = tfidf(X_train2)
X_test_counts2 = tfidf_vectorizer.transform(X_test2)

In [121]:
len(tfidf_vectorizer.get_feature_names())

1074

In [166]:
#tfidf_vc = dict(zip(tfidf_vectorizer.get_feature_names(), tfidf_vectorizer.idf_))

In [28]:
#feature_names = tfidf_vectorizer.get_feature_names()
#pd.DataFrame(X_train_counts2.toarray(), columns = feature_names)

In [122]:
clf_lr2 = LogisticRegression()

In [123]:
param_dist_lr = { "penalty" : ['l2'],
                "C" : [0.001, 0.01, 0.1, 0.25, 0.5, 1.0, 3.4, 5.5, 7.5, 10, 15, 20, 30, 35],
                "solver" : ['newton-cg', 'lbfgs', 'sag', 'saga' ] }

In [124]:
grid_lr2 = GridSearchCV(clf_lr2, param_dist_lr, scoring='f1_weighted', cv=10)
grid_lr2.fit(X_train_counts2, y_train2)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)




GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 0.25, 0.5, 1.0, 3.4, 5.5, 7.5, 10, 15, 20, 30, 35], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=0)

In [125]:
print(grid_lr2.best_params_)
print(grid_lr2.best_score_)
print(grid_lr2.best_estimator_)

{'C': 20, 'penalty': 'l2', 'solver': 'saga'}
0.7069781801855275
LogisticRegression(C=20, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)


In [126]:
y_pred_lr2 = grid_lr2.predict(X_test_counts2)

In [127]:
accuracy, precision, recall, f1 = get_metrics(y_test2, y_pred_lr2)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.754, precision = 0.766, recall = 0.754, f1 = 0.748


In [None]:
#naive_bayes

In [128]:
clf_naive = naive_bayes.MultinomialNB()

In [129]:
param_dist_naive = {
             'alpha':[1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
}

In [130]:
grid_naive = GridSearchCV(clf_naive, param_dist_naive, scoring='f1_weighted', cv=10, n_jobs=-1)
grid_naive.fit(X_train_counts, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=0)

In [131]:
print(grid_naive.best_params_)
print(grid_naive.best_score_)
print(grid_naive.best_estimator_)

{'alpha': 0.0001}
0.7263715462315001
MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)


In [132]:
y_pred_naive = grid_naive.predict(X_test_counts)

In [133]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_naive)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.754, precision = 0.806, recall = 0.754, f1 = 0.737


In [134]:
clf_naive2 = naive_bayes.MultinomialNB()

In [135]:
grid_naive2 = GridSearchCV(clf_naive2, param_dist_naive, scoring='f1_weighted', cv=10, n_jobs=-1)
grid_naive2.fit(X_train_counts2, y_train2)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=0)

In [136]:
print(grid_naive2.best_params_)
print(grid_naive2.best_score_)
print(grid_naive2.best_estimator_)

{'alpha': 0.001}
0.7047054380941804
MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)


In [137]:
y_pred_naive2 = grid_naive2.predict(X_test_counts2)

In [138]:
accuracy, precision, recall, f1 = get_metrics(y_test2, y_pred_naive2)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.708, precision = 0.778, recall = 0.708, f1 = 0.678


In [None]:
#RandomForest

In [139]:
clf_rf = RandomForestClassifier()

In [140]:
param_dist_rf = {
    'n_estimators': [5, 10, 20, 40],
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 3, 5, 7, 9, None],
    'min_samples_leaf': [1, 2, 4, 8, 16]
}

In [141]:
grid_forest = GridSearchCV(clf_rf, param_dist_rf, scoring='f1_weighted', cv=10, n_jobs=-1)
grid_forest.fit(X_train_counts, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [5, 10, 20, 40], 'criterion': ['gini', 'entropy'], 'max_depth': [1, 3, 5, 7, 9, None], 'min_samples_leaf': [1, 2, 4, 8, 16]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=0)

In [142]:
print(grid_forest.best_params_)
print(grid_forest.best_score_)
print(grid_forest.best_estimator_)

{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 2, 'n_estimators': 10}
0.8218238692967016
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [143]:
y_pred_rf = grid_forest.predict(X_test_counts)

In [144]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_rf)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.785, precision = 0.798, recall = 0.785, f1 = 0.784


In [145]:
clf_rf2 = RandomForestClassifier()

In [146]:
grid_forest2 = GridSearchCV(clf_rf2, param_dist_rf, scoring='f1_weighted', cv=10, n_jobs=-1)
grid_forest2.fit(X_train_counts2, y_train2)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [5, 10, 20, 40], 'criterion': ['gini', 'entropy'], 'max_depth': [1, 3, 5, 7, 9, None], 'min_samples_leaf': [1, 2, 4, 8, 16]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=0)

In [147]:
print(grid_forest2.best_params_)
print(grid_forest2.best_score_)
print(grid_forest2.best_estimator_)

{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 2, 'n_estimators': 40}
0.8239327476096857
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [148]:
y_pred_rf2 = grid_forest2.predict(X_test_counts2)

In [149]:
accuracy, precision, recall, f1 = get_metrics(y_test2, y_pred_rf2)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.831, precision = 0.831, recall = 0.831, f1 = 0.831


In [None]:
#svc

In [150]:
clf_svc = svm.SVC()

In [151]:
param_dist_svc = {
             'C':[1,10,100,1000],
             'gamma':[1,0.1,0.001,0.0001], 
             'kernel':['linear','rbf','sigmoid']
}

In [152]:
grid_svc = GridSearchCV(clf_svc, param_dist_svc, scoring='f1_weighted', cv=10, n_jobs=-1)
grid_svc.fit(X_train_counts, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear', 'rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=0)

In [153]:
print(grid_svc.best_params_)
print(grid_svc.best_score_)
print(grid_svc.best_estimator_)

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.806972528929019
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [154]:
y_pred_svc = grid_svc.predict(X_test_counts)

In [155]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_svc)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.815, precision = 0.815, recall = 0.815, f1 = 0.815


In [156]:
clf_svc2 = svm.SVC()

In [157]:
grid_svc2 = GridSearchCV(clf_svc2, param_dist_svc, scoring='f1_weighted', cv=10, n_jobs=-1)
grid_svc2.fit(X_train_counts2, y_train2)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear', 'rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=0)

In [158]:
print(grid_svc2.best_params_)
print(grid_svc2.best_score_)
print(grid_svc2.best_estimator_)

{'C': 1, 'gamma': 1, 'kernel': 'sigmoid'}
0.7103975432522819
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [159]:
y_pred_svc2 = grid_svc2.predict(X_test_counts2)

In [160]:
accuracy, precision, recall, f1 = get_metrics(y_test2, y_pred_svc2)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.815, precision = 0.815, recall = 0.815, f1 = 0.815


In [None]:
#TM

In [76]:
from gensim.corpora import *
texts = [clean_data.text.iloc[i].split() for i in range(len(clean_data))]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [77]:
from gensim import corpora
clean_data["tok_text"] = clean_data["text"].map(lambda x: x.split(' '))
dictionary = corpora.Dictionary(clean_data["tok_text"])

In [78]:
clean_data["bow"] = clean_data["tok_text"].map(dictionary.doc2bow)

In [79]:
clean_data.head()

Unnamed: 0.1,Unnamed: 0,id,rank,relevant,text,tok_text,bow
0,0,1,1,1,"расстраивать! пойти межсерверный битва, случат...","[расстраивать!, пойти, межсерверный, битва,, с...","[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 2..."
1,1,2,5,1,"уважаемый разработчик, благодарить ваш ответ в...","[уважаемый, разработчик,, благодарить, ваш, от...","[(4, 1), (12, 2), (13, 1), (21, 1), (22, 1), (..."
2,2,3,5,1,играть ноябрь сей пора надоедать добавлять инт...,"[играть, ноябрь, сей, пора, надоедать, добавля...","[(14, 1), (41, 1), (45, 1), (50, 1), (67, 2), ..."
3,3,4,2,1,"клара виснять,она бывать прозрачный вклеточка,...","[клара, виснять,она, бывать, прозрачный, вклет...","[(93, 1), (94, 1), (95, 3), (96, 1), (97, 1), ..."
4,4,5,5,0,хороший игра!,"[хороший, игра!]","[(116, 1), (117, 1)]"


In [80]:
print("Found {} words.".format(len(dictionary.values())))

Found 1565 words.


In [82]:
%%time
from gensim.models import  *
from gensim import similarities

tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

index = similarities.MatrixSimilarity(corpus_tfidf)
sims = index[corpus_tfidf]

CPU times: user 74.8 ms, sys: 4.69 ms, total: 79.5 ms
Wall time: 81 ms


In [83]:
%%time
lda = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=25,
                        alpha='auto', eta='auto', iterations = 20, passes = 5)

CPU times: user 454 ms, sys: 3.5 ms, total: 457 ms
Wall time: 469 ms


In [84]:
def document_to_lda_features(lda_model, document):
    """ Transforms a bag of words document to features.
    It returns the proportion of how much each topic was
    present in the document.
    """
    topic_importances = lda_model.get_document_topics(document, minimum_probability=0)
    topic_importances = np.array(topic_importances)
    return topic_importances[:,1]

clean_data['lda_features'] = list(map(lambda doc:
                                      document_to_lda_features(lda, doc),
                                      clean_data.bow))

In [86]:
clean_data['lda_features'].head()

0    [0.0008767048129811883, 0.0009181999485008419,...
1    [0.0007744525210000575, 0.0008111080387607217,...
2    [0.0010417590383440256, 0.0010910663986578584,...
3    [0.001191277988255024, 0.0012476621195673943, ...
4    [0.7085118889808655, 0.01211701799184084, 0.01...
Name: lda_features, dtype: object

In [91]:
#id_tuples = lda.get_topic_terms(1, topn=5)
#word_ids = np.array(id_tuples)[:,0]
#words = map(lambda id_: lda.id2word[id_], word_ids)

In [92]:
#print("For topic {}, the top words are: {}.".format(x, ", ".join(words)))

In [93]:
cleansed_words_df = pd.DataFrame.from_dict(dictionary.token2id, orient='index')
cleansed_words_df.rename(columns={0: 'id'}, inplace=True)

cleansed_words_df['count'] = list(map(lambda id_: dictionary.dfs.get(id_), cleansed_words_df.id))
del cleansed_words_df['id']

In [94]:
cleansed_words_df.sort_values('count', ascending=False, inplace=True)


In [95]:
lda.show_topics(5)

[(11,
  '0.020*"который" + 0.014*"писать" + 0.013*"почему" + 0.013*"загружаться" + 0.013*"интернет" + 0.008*"весь" + 0.008*"хотеть" + 0.007*"время" + 0.007*"столько" + 0.007*"ответ"'),
 (13,
  '0.027*"понравиться" + 0.011*"который" + 0.011*"игрок" + 0.011*"разработчик" + 0.011*"хороший" + 0.010*"новый" + 0.010*"интересный," + 0.010*"альянс" + 0.010*"стоять" + 0.010*"бывать"'),
 (5,
  '0.014*"играть" + 0.010*"время" + 0.010*"новый" + 0.010*"обновление" + 0.009*"альянс" + 0.009*"нравиться" + 0.009*"кухня" + 0.009*"решать" + 0.005*"весь" + 0.005*"скачать"'),
 (6,
  '0.020*"время" + 0.015*"проблема" + 0.010*"пока" + 0.010*"маленький" + 0.010*"пожалуйста!" + 0.010*"дворцовый" + 0.010*""" + 0.006*"весь" + 0.006*"уровень" + 0.006*"играть"'),
 (7,
  '0.019*"проходить" + 0.015*"уровень," + 0.013*"уровень" + 0.013*"жизнь" + 0.010*"играть" + 0.010*"новый" + 0.009*"последний" + 0.009*"вылетать" + 0.009*"бывать" + 0.009*"ход"')]

In [96]:
topic_id = 0
top_words = lda.get_topic_terms(topic_id, topn=5)

In [97]:
print(top_words)

[(117, 0.016084332), (222, 0.015855992), (116, 0.015817475), (656, 0.01575234), (132, 0.009545251)]


In [None]:
#word2vec

In [98]:
sentences = []
for sentence_group in clean_data.tok_text:
    sentences.extend(sentence_group)

print("Number of sentences: {}.".format(len(sentences)))
print("Number of texts: {}.".format(len(clean_data)))

Number of sentences: 2797.
Number of texts: 214.


In [99]:
from gensim.models import Word2Vec

In [100]:
model_w2v = Word2Vec(clean_data['tok_text'], size=100, window=5, min_count=1, workers=4)

In [101]:
model_w2v.wv.most_similar('реклама')

[('перезапускать', 0.39656537771224976),
 ('тупой', 0.32587701082229614),
 ('пожалеть', 0.3029012680053711),
 ('ток,', 0.2969647943973541),
 ('никакой', 0.2949686348438263),
 ('доберешся', 0.2875513732433319),
 ('ранг', 0.27291351556777954),
 ('банкет', 0.2728500962257385),
 ('целый', 0.2644536793231964),
 ('заботно', 0.2563919425010681)]