In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
texts = pd.read_csv('ml_data.csv', encoding = 'utf-8') #пока наилучший вариант был с третьей итерацией

In [3]:
texts = texts.drop(['Unnamed: 0'], axis=1)

In [4]:
texts = texts[texts.in_out.isin(['0', '1'])]

In [5]:
texts['id'] = texts['id'].factorize()[0]

In [6]:
train, test = train_test_split(texts, test_size=0.3)

In [7]:
y_train = train['in_out'] 
y_test = test['in_out']

In [8]:
import re
from pyphen import Pyphen 
import string
exclude = list(string.punctuation)

def sentence_splitter(text):
    sent_list = re.split(' *[\.\?!][\'"\)\]]* ', text)
    return sent_list
 
def text_len_sent(text):
    TL_sent = len(sentence_splitter(text))
    return TL_sent
    
def text_len_words(text):
    TL_words = len(text.split())
    return TL_words
 
def avg_sentence_length(text):
    ASL = float(text_len_words(text)/text_len_sent(text))
    return round(ASL, 2)
    
    
def avg_sent_per_word(text):
    ASPW = float(text_len_sent(text)/text_len_words(text))
    return round(ASPW, 2)
    
    
def char_count(text, ignore_spaces=True):
    if ignore_spaces:
        text_chars = text.replace(" ", "")
    return len(text_chars) 

    
def avg_letter_per_word(text):
    ALPW = float(float(char_count(text))/float(len(text.split())))
    return round(ALPW, 2)
    

def avg_letter_per_sent(text):
    ALPS = float(float(char_count(text))/float(len(sentence_splitter(text))))
    return round(ALPS, 2)
    

def syllable_count(text): 
    text = text.lower()
    text = "".join(x for x in text if x not in exclude)
    dic = Pyphen(lang='ru_RU')
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
    
    
def avg_syllab_per_word(text):
    ASYPW = float(float(syllable_count(text))/float(len(text.split())))
    return round(ASYPW, 2)
    

def avg_syllab_per_sent(text):
    ASYPS = float(float(syllable_count(text))/float(len(sentence_splitter(text))))
    return round(ASYPS, 2)    
    
def diffsyll(text):
    count = 0
    for word in text.split():
        wrds = syllable_count(word)
        #if wrds >= 3:
        if wrds >= 4:
            count += 1
    return count

def percent_syll(text):
    perc_diff = float(float(diffsyll(text)))/float(len(text.split()))*100
    return round(perc_diff, 2) 
    
      
def get_simple_metrics(text):
    SL = len(sentence_splitter(text))
    WC = len(text.split())
    ASL = avg_sentence_length(text)
    TC = char_count(text)
    ALPW = avg_letter_per_word(text)
    ALPS = avg_letter_per_sent(text)
    SYC = syllable_count(text)
    ASYPW = avg_syllab_per_word(text)
    ASYPS = avg_syllab_per_sent(text)
    DW = diffsyll(text)
    ADF = percent_syll(text)
    return [SL, WC, ASL, TC, ALPW, ALPS, SYC, ASYPW, ASYPS, DW, ADF] 
    

    
def print_simple_metrics(text):
    print('Количество предложений в тексте:', len(sentence_splitter(text)))
    print('Количество слов в тексте:', len(text.split()))
    print('Средняя длина предложений:', avg_sentence_length(text))
    print('Количество символов в тексте:', char_count(text))
    print('Средняя длина слова:', avg_letter_per_word(text))
    print('Средняя длина предложений в символах:', avg_letter_per_sent(text))
    print('Количество слогов в тексте:', syllable_count(text))
    print('Среднее количество слогов в слове:', avg_syllab_per_word(text))
    print('Среднее количеcтво слогов в предложении:', avg_syllab_per_sent(text))
    print('Количество сложных слов в тексте:', diffsyll(text))
    print('Процент сложных слов в тексте', percent_syll(text))
    

In [9]:
import re
import os
import string
exclude = list(string.punctuation)


def flesch_RE(text):
    ASL = avg_sentence_length(text)
    ASW = avg_syllab_per_word(text)
    FRE = 206.835 - float(1.3 * ASL) - float(60.6 * ASW)
    return round(FRE, 2)

def flesch_kincaid_grade(text):
    ASL = avg_sentence_length(text)
    ASW = avg_syllab_per_word(text)
    #английский язык!
    #FKRA = float(0.39 * ASL) + float(11.8 * ASW) - 15.59
    #русский
    #FKRA = float(0.49 * ASL) + float(7.3 * ASW) - 16.59
    #Оборнева
    FKRA = float(0.5 * ASL) + float(8.4 * ASW) - 15.59
    return round(FKRA, 2)
    
def smog_index(text): 
    if len(sentence_splitter(text)) >= 3:
        SMOG = (1.043 * (30*(diffsyll(text)/len(sentence_splitter(text))))**.5) + 3.1291
        return round(SMOG, 2)
    else:
        return 0
        
        
def coleman_liau_index(text):
    L = round(avg_letter_per_word(text)*100, 2)
    S = round(avg_sent_per_word(text)*100, 2)
    CLI = float((0.058 * L) - (0.296 * S) - 15.8)
    return round(CLI, 2)


def dale_chall_score(text): #т.к. делаем сложные слова как 4 слога, все ок 
    word_count = len(text.split())
    count = word_count - diffsyll(text)
    per = float(count)/float(word_count)*100
    difficult_words = 100-per
    if difficult_words > 5: #дальше идет адаптация: вместо 0,0496 0,062
        score = (0.1579 * difficult_words) + (0.062 * avg_sentence_length(text)) + 3.6365
    else:
        score = (0.1579 * difficult_words) + (0.062 * avg_sentence_length(text))
    return round(score, 2)
    
    
def gunning_fog(text):
    grade = 0.4*(avg_sentence_length(text) + percent_syll(text))
    return round(grade,2)
       
def print_statistics(text):
    print('Russian Flesh reading Ease =', flesch_RE(text))
    print('Russian Flesh-Kincaid Grade =', flesch_kincaid_grade(text))
    print('Russian SMOG =', smog_index(text))
    print('Russian CLI =', coleman_liau_index(text))
    print('Russian DCH =', dale_chall_score(text))
    print('Russian Gunning Fog =', gunning_fog(text))
    
def statist_vectors(text):
    FRE = flesch_RE(text)
    FKG = flesch_kincaid_grade(text)
    #SMOG = smog_index(text)
    CLI = coleman_liau_index(text)
    DCH = dale_chall_score(text)
    GF = gunning_fog(text)
    return [FRE, FKG, SMOG, CLI, DCH, GF]   
    
    
def statist_sum(text):
    average = (flesch_kincaid_grade(text)+smog_index(text)+coleman_liau_index(text)+dale_chall_score(text)+gunning_fog(text))/5
    return round(average,2)

def simple_classifire(text):
    level = statist_sum(text)
    if level > 0 and level <13 :
        return 1
    if level >= 13 and level < 17:
        return 2
    return 3
    

In [10]:
def statist_vectors(text):
    FRE = flesch_RE(text)
    FKG = flesch_kincaid_grade(text)
    SMOG = smog_index(text)
    CLI = coleman_liau_index(text)
    DCH = dale_chall_score(text)
    GF = gunning_fog(text)
    return [FRE, FKG, SMOG,CLI,DCH,GF]

Посчитаем средние метрики для всех наших текстов

In [11]:
df_in = texts.loc[texts['in_out'] == '1']
df_out = texts.loc[texts['in_out'] == '0']
df_hold = texts.loc[texts['in_out'] == 'hold'] 

In [15]:
def avg(lst): 
    return sum(lst) / len(lst) 

In [16]:
FREs_in = []
FKGs_in = []
SMOGs_in = []
CLIs_in = []
DCHs_in = []
GFs_in = []
for text in df_in.text:
    FRE = flesch_RE(text)
    FKG = flesch_kincaid_grade(text)
    SMOG = smog_index(text)
    CLI = coleman_liau_index(text)
    DCH = dale_chall_score(text)
    GF = gunning_fog(text)
    FREs_in.append(FRE)
    FKGs_in.append(FKG)
    SMOGs_in.append(SMOG)
    CLIs_in.append(CLI)
    DCHs_in.append(DCH)
    GFs_in.append(GF)

print('FRE:')
print(avg(FREs_in))
print(min(FREs_in))
print(max(FREs_in))
print('FKG:')
print(avg(FKGs_in))
print(min(FKGs_in))
print(max(FKGs_in))
print('SMOG:')
print(avg(SMOGs_in))
print(min(SMOGs_in))
print(max(SMOGs_in))
print('CLI:')
print(avg(CLIs_in))
print(min(CLIs_in))
print(max(CLIs_in))
print('DCH:')
print(avg(DCHs_in))
print(min(DCHs_in))
print(max(DCHs_in))
print('GF:')
print(avg(GFs_in))
print(min(GFs_in))
print(max(GFs_in))

FRE:
48.37086857142859
-163.74
122.84
FKG:
12.157177142857142
-0.99
86.06
SMOG:
10.746830476190475
0
29.1
CLI:
15.807653333333382
-1.45
34.26
DCH:
7.050384761904762
0.19
16.91
GF:
13.212502857142871
1.2
71.29


In [17]:
FREs_out = []
FKGs_out = []
SMOGs_out = []
CLIs_out = []
DCHs_out = []
GFs_out = []
for text in df_out.text:
    FRE = flesch_RE(text)
    FKG = flesch_kincaid_grade(text)
    SMOG = smog_index(text)
    CLI = coleman_liau_index(text)
    DCH = dale_chall_score(text)
    GF = gunning_fog(text)
    FREs_out.append(FRE)
    FKGs_out.append(FKG)
    SMOGs_out.append(SMOG)
    CLIs_out.append(CLI)
    DCHs_out.append(DCH)
    GFs_out.append(GF)

print('FRE:')
print(avg(FREs_out))
print(min(FREs_out))
print(max(FREs_out))
print('FKG:')
print(avg(FKGs_out))
print(min(FKGs_out))
print(max(FKGs_out))
print('SMOG:')
print(avg(SMOGs_out))
print(min(SMOGs_out))
print(max(SMOGs_out))
print('CLI:')
print(avg(CLIs_out))
print(min(CLIs_out))
print(max(CLIs_out))
print('DCH:')
print(avg(DCHs_out))
print(min(DCHs_out))
print(max(DCHs_out))
print('GF:')
print(avg(GFs_out))
print(min(GFs_out))
print(max(GFs_out))

FRE:
48.21727826675702
-3734.76
144.94
FKG:
11.330974949221378
-6.69
531.41
SMOG:
6.323656059580239
0
22.08
CLI:
15.714939065673653
-33.8
363.8
DCH:
6.6729823967501805
0.06
26.03
GF:
12.347748815165867
0.4
129.6


In [25]:
#try_me = statist_vectors(train['text'][1])

In [21]:
#try_me

In [18]:
def count_me_all(texts):
    return [statist_vectors(text) for text in texts]

In [19]:
x_train = count_me_all(train['text'])
x_test = count_me_all(test['text'])

In [20]:
from sklearn.preprocessing import StandardScaler

In [24]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(x_transform, y_train)
# display the relative importance of each attribute
print(model.feature_importances_)

[0.15287531 0.15497527 0.22373921 0.16671135 0.15099996 0.1506989 ]


In [28]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 3)
rfe = rfe.fit(x_transform, y_train)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[ True  True  True False False False]
[1 1 1 4 3 2]


In [98]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.001,
          verbose=0, warm_start=False)
model.fit(x_transform, y_train)
print(model)
# make predictions
expected = y_test
predicted = model.predict(x_transform_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.001,
          verbose=0, warm_start=False)
             precision    recall  f1-score   support

          0       0.73      0.58      0.65       877
          1       0.63      0.77      0.69       797

avg / total       0.68      0.67      0.67      1674

[[511 366]
 [186 611]]


In [21]:
scaler = StandardScaler()
x_fit = scaler.fit(x_train)
x_transform = scaler.transform(x_train)
x_fit_test = scaler.fit(x_test)
x_transform_test = scaler.transform(x_test)

In [22]:
x_transform

array([[-4.74303272e+00,  5.29088121e+00,  3.67587396e+00,
         2.51674692e+00,  2.86970401e+00,  4.82426986e+00],
       [-7.71525017e-01,  1.06469004e+00, -1.47304268e+00,
        -4.61482025e-02,  1.05981271e+00,  1.42129302e+00],
       [-1.71950239e-01, -2.52765647e-01,  3.84813841e-01,
         5.85389575e-01, -1.82099710e-02, -3.95333719e-01],
       ...,
       [ 1.56555625e+00, -1.19306735e+00, -2.04392083e-01,
        -1.53198436e+00, -2.69348645e+00, -1.23237099e+00],
       [-1.99498166e+00,  1.27307191e+00, -1.47304268e+00,
         2.28480032e+00,  1.50981301e+00,  1.48708634e+00],
       [-1.62048390e-02, -3.54373340e-01, -1.47304268e+00,
        -2.18182298e-04,  2.73548462e-01, -1.35815614e-01]])

In [45]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x_transform, y_train)
print(model)
# make predictions
expected = y_test
predicted = model.predict(x_transform_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

GaussianNB(priors=None)
             precision    recall  f1-score   support

          0       0.81      0.47      0.60       877
          1       0.60      0.87      0.71       797

avg / total       0.71      0.66      0.65      1674

[[415 462]
 [100 697]]


In [44]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# fit a k-nearest neighbor model to the data
model = KNeighborsClassifier()
model.fit(x_transform, y_train)
print(model)
# make predictions
expected = y_test
predicted = model.predict(x_transform_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
             precision    recall  f1-score   support

          0       0.65      0.69      0.67       877
          1       0.63      0.58      0.61       797

avg / total       0.64      0.64      0.64      1674

[[606 271]
 [333 464]]


In [43]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(x_transform, y_train)
print(model)
# make predictions
expected = y_test
predicted = model.predict(x_transform_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
             precision    recall  f1-score   support

          0       0.60      0.70      0.65       877
          1       0.60      0.48      0.53       797

avg / total       0.60      0.60      0.59      1674

[[617 260]
 [415 382]]


In [83]:
from sklearn import metrics
from sklearn.svm import SVC
# fit a SVM model to the data
model = SVC(C=3.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
model.fit(x_transform, y_train)
print(model)
# make predictions
expected = y_test
predicted = model.predict(x_transform_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

SVC(C=3.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.76      0.59      0.66       877
          1       0.64      0.80      0.71       797

avg / total       0.70      0.69      0.69      1674

[[514 363]
 [159 638]]


In [55]:
model.get_params().keys()

dict_keys(['verbose', 'class_weight', 'tol', 'C', 'decision_function_shape', 'max_iter', 'cache_size', 'kernel', 'degree', 'probability', 'shrinking', 'coef0', 'random_state', 'gamma'])

In [68]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
# prepare a range of alpha values to test
alphas = np.array([])
# create and fit a ridge regression model, testing each alpha
model = SVC(C=5.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
grid = GridSearchCV(estimator=model, param_grid=dict(C=alphas))
grid.fit(x_transform, y_train)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.C)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([5. , 4.9, 4.8, 4.7, 4.6, 4.5, 4.4, 4.3, 4.2, 4.1, 4. ])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
0.7032010243277849
5.0


In [71]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.grid_search import RandomizedSearchCV
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'cache_size': sp_rand(1000)}
# create and fit a ridge regression model, testing random alpha values
model = SVC(C=5.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(x_transform, y_train)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.cache_size)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=5.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params={}, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'cache_size': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001AB4CD36550>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=0)
0.7032010243277849
1000.0904705496463


In [82]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

In [89]:
from sklearn.svm import SVC

model_svc = SVC(C=3.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
model_svc.fit(x_transform, y_train)
print(model_svc)

expected_svc = y_test
predicted_svc = model_svc.predict(x_transform_test)

print(metrics.classification_report(expected_svc, predicted_svc))
print(metrics.confusion_matrix(expected_svc, predicted_svc))

SVC(C=3.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.72      0.67      0.69       885
          1       0.66      0.71      0.68       789

avg / total       0.69      0.69      0.69      1674

[[591 294]
 [225 564]]


Без SMOG

SVC(C=3.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.62      0.77      0.69       885
          1       0.65      0.48      0.55       789

avg / total       0.64      0.63      0.63      1674

[[683 202]
 [410 379]]

Без SMOG и CLI

SVC(C=3.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.62      0.74      0.67       885
          1       0.63      0.50      0.56       789

avg / total       0.62      0.62      0.62      1674

[[651 234]
 [396 393]]

Без SMOG CLI DCH GF

In [34]:
model = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=90, p=2,
           weights='uniform')
model.fit(x_transform, y_train)
print(model)

expected = y_test
predicted = model.predict(x_transform_test)

print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=90, p=2,
           weights='uniform')
             precision    recall  f1-score   support

          0       0.63      0.73      0.68       885
          1       0.63      0.52      0.57       789

avg / total       0.63      0.63      0.63      1674

[[645 240]
 [375 414]]


In [35]:
model = KNeighborsClassifier()
model.fit(x_transform, y_train)
print(model)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


А что если попробовать стандартное преобразование текстов? Традиционно делаем предобработку

In [36]:
def set_clean(s):
    try:
        # print(type(s))
        clean_line = re.sub('[\W\d_-]+', ' ', s.lower().strip())
        return re.sub(' +', ' ', clean_line)
    except AttributeError:
        ''#print ("this was a series")

In [37]:
import re
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [38]:
train_clean = train.applymap(set_clean) #применяем еще раз нашу предобработку
test_clean = test.applymap(set_clean)

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_vec(voc=None):
    if(voc):
        vectorizer = TfidfVectorizer(ngram_range=(1, 1),
                                     stop_words='russian',
                                     min_df=5,   
                                     max_df=0.9)  
        tr = vectorizer.fit_transform(train_clean["text"]) 
        te = vectorizer.fit_transform(test_clean["text"]) 
        return (tr, te)
    else:
        vectorizer = TfidfVectorizer() 
        tr = vectorizer.fit_transform(train_clean["text"]) 
        voc = vectorizer.get_feature_names()
        vectorizer = CountVectorizer(vocabulary=voc) 
        te = vectorizer.fit_transform(test_clean["text"]) 
        return (tr, te)
train_counts, test_counts = tfidf_vec() 

tfidf_transformer = TfidfTransformer()

x_train = tfidf_transformer.fit_transform(train_counts)
x_test = tfidf_transformer.fit_transform(test_counts)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [40]:
clf = MultinomialNB().fit(x_train, y_train)
predicted = clf.predict(x_test)
len(predicted)

1674

In [41]:
print(classification_report(y_pred=predicted, y_true=y_test))

             precision    recall  f1-score   support

          0       0.85      0.32      0.47       885
          1       0.55      0.93      0.69       789

avg / total       0.71      0.61      0.57      1674



In [42]:
model = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=100, p=2,
           weights='uniform')
model.fit(x_train, y_train)
print(model)

expected = y_test
predicted = model.predict(x_test)

print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=100, p=2,
           weights='uniform')
             precision    recall  f1-score   support

          0       0.70      0.62      0.66       885
          1       0.62      0.70      0.66       789

avg / total       0.66      0.66      0.66      1674

[[548 337]
 [234 555]]


In [43]:
from sklearn.svm import SVC

model_svc = SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
model_svc.fit(x_train, y_train)
print(model_svc)

expected_svc = y_test
predicted_svc = model_svc.predict(x_test)

print(metrics.classification_report(expected_svc, predicted_svc))
print(metrics.confusion_matrix(expected_svc, predicted_svc))

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.53      1.00      0.69       885
          1       0.00      0.00      0.00       789

avg / total       0.28      0.53      0.37      1674

[[885   0]
 [789   0]]


  'precision', 'predicted', average, warn_for)
