In [None]:
!pip install hyperopt
!pip install catboost



In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from google.colab import drive
from hyperopt import hp, fmin, tpe
from hyperopt import space_eval
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Mount Google Drive at /content/drive; the dataset is read from there below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import time

In [None]:
def metrics_classification(y_test, predicted):
    """Print precision, recall, F1 and accuracy for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        predicted: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The four scores are written to stdout, one per line, in the
        order precision, recall, F1, accuracy.
    """
    # (line template, scoring function) pairs, listed in display order.
    report_rows = (
        ("Precision : {0:6.5f}", precision_score),
        ("Recall: {0:6.5f}", recall_score),
        ("F1-measure: {0:6.5f}", f1_score),
        ("Accuracy: {0:6.5f}", accuracy_score),
    )
    for template, scorer in report_rows:
        print(template.format(scorer(y_test, predicted)))

In [None]:
# Load the prepared dataset from the mounted Drive and binarise the target:
# a score of 7 becomes class 0, scores 8-10 become class 1.
# NOTE(review): any other score value is left unchanged — presumably the file
# only contains scores 7-10; confirm.
df = pd.read_csv('/content/drive/My Drive/Финальный_датафрейм.csv').replace(
    {'score': {7: 0, 8: 1, 9: 1, 10: 1}}
)

In [None]:
# Hold out 25% of the rows as the test set. The split is shuffled with a fixed
# seed and stratified on the binarised score.
X_train, X_test, y_train, y_test = train_test_split(df['prepared_text'], df['score'], test_size=0.25, random_state=42, stratify = df['score'], shuffle=True)

In [None]:
# Carve a validation set (20% of the remaining training rows) for the
# hyperparameter searches. Stratify on the labels, like the train/test split,
# so the validation class ratio matches the training one.
# NOTE: this cell rebinds X_train / y_train; re-running it without re-running
# the train/test split first shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
    shuffle=True,
)

In [None]:
# TF-IDF features with the vocabulary capped at 2048 terms. The vectorizer is
# fitted on the training texts only; validation and test texts are only
# transformed with it.
# NOTE(review): X_train is densified with .toarray() while X_val / X_test are
# kept as returned by transform() (sparse by default) — confirm the mix of
# dense and sparse inputs is intended.
tfidf_vectorizer = TfidfVectorizer(max_features=2048)
X_train = tfidf_vectorizer.fit_transform(X_train).toarray()
X_val = tfidf_vectorizer.transform(X_val)
X_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Randomly oversample the training split (seeded for reproducibility).
# Only X_train / y_train are resampled; validation and test keep their
# original class distribution.
ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [None]:
def objective(params):
    """Hyperopt loss for a random forest: negative F1 on the validation split.

    Args:
        params: Keyword arguments for ``RandomForestClassifier`` sampled from
            ``space``.

    Returns:
        ``-f1`` so that minimising the loss maximises validation F1. A
        validation F1 above 0.99 is mapped to the loss 1.0 instead (worse than
        any ``-f1``) — NOTE(review): apparently meant to reject suspiciously
        perfect fits; confirm intent.
    """
    # Fixed seed: the same params must always produce the same loss, otherwise
    # the search is optimising noise.
    clf = RandomForestClassifier(random_state=0, **params)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    return -f1 if f1 <= 0.99 else 1.00

# Discrete search space; every parameter is an hp.choice over a fixed list.
space = {
        'n_estimators': hp.choice('n_estimators', [3,5,10,15,20,30,35,40,50]),
        'criterion':  hp.choice('criterion', ['gini', 'entropy', 'log_loss']),
        'max_depth':  hp.choice('max_depth', [3,5,10,20,30,40,50,60,70]),
        'min_samples_leaf': hp.choice('min_samples_leaf', [1,2,3,4,5,6,7,8,9,10])
    }

# fmin() reports hp.choice parameters as *indices* into their option lists;
# space_eval() maps them back to the actual hyperparameter values so that
# best_params can be read (and passed to the estimator) directly.
best_trial = fmin(
    fn=objective, space=space, algo=tpe.suggest, max_evals=50)
best_params = space_eval(space, best_trial)

100%|██████████| 50/50 [01:16<00:00,  1.54s/trial, best loss: -0.9090909090909091]


In [None]:
# Display the result of the random-forest hyperparameter search.
best_params

{'criterion': 0, 'max_depth': 4, 'min_samples_leaf': 6, 'n_estimators': 7}

In [None]:
clf = RandomForestClassifier(n_estimators=40,criterion='gini', max_depth=30, min_samples_leaf=7,class_weight={0:2,1:8})
%time clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
metrics_classification(y_test, y_pred)

CPU times: user 3.22 s, sys: 9.11 ms, total: 3.23 s
Wall time: 3.23 s
Precision : 0.81427
Recall: 0.98493
F1-measure: 0.89151
Accuracy: 0.80684


In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Fit the scaler on the training features ONLY and reuse those statistics for
# validation and test. Refitting on the validation split would leak its
# statistics and leave the test split scaled differently from the training
# data the models were fitted on.
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [None]:
def objective(params):
    """Hyperopt loss for logistic regression: negative F1 on the validation split.

    Args:
        params: Keyword arguments for ``LogisticRegression`` sampled from
            ``space`` (``C``, ``penalty``, ``solver``).

    Returns:
        ``-f1`` so that minimising the loss maximises validation F1. A
        validation F1 above 0.99 is mapped to the loss 1.0 instead —
        NOTE(review): apparently meant to reject suspiciously perfect fits;
        confirm intent.
    """
    # Pass the fixed settings as keywords instead of mutating the dict that
    # hyperopt handed in. random_state makes the shuffling solvers
    # (liblinear / sag / saga) repeatable.
    log_reg = LogisticRegression(max_iter=1000, random_state=0, **params)
    log_reg.fit(X_train_scaled, y_train)

    y_pred = log_reg.predict(X_val_scaled)
    f1 = f1_score(y_val, y_pred)
    return -f1 if f1 <= 0.99 else 1.00

# C is sampled log-uniformly from [exp(-10), exp(1)]; penalty and solver are
# categorical choices.
space = {
        'C': hp.loguniform('C', -10, 1),
        'penalty': hp.choice('penalty', ['l2']),
        'solver': hp.choice('solver',['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'])
    }

# fmin() reports hp.choice parameters as *indices* into their option lists;
# space_eval() maps them back to the actual hyperparameter values.
best_trial = fmin(
    fn=objective, space=space, algo=tpe.suggest, max_evals=50)
best_params = space_eval(space, best_trial)

100%|██████████| 50/50 [02:45<00:00,  3.31s/trial, best loss: -0.8267898383371824]


In [None]:
# Display the result of the logistic-regression hyperparameter search.
best_params

{'C': 1.938326928938861, 'penalty': 0, 'solver': 4}

In [None]:
# Final logistic regression on the scaled features, evaluated on the held-out
# test split. C and solver are hard-coded from a previous search run.
# NOTE(review): class_weight was not part of the search space — confirm the
# tuned C/solver are still appropriate with it.
clf = LogisticRegression(C=1.938326928938861, random_state=0, solver='saga', max_iter=1000,class_weight={0:2,1:8})
%time clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
metrics_classification(y_test, y_pred)

CPU times: user 16.5 s, sys: 17.6 ms, total: 16.6 s
Wall time: 16.7 s
Precision : 0.84046
Recall: 0.80822
F1-measure: 0.82402
Accuracy: 0.72185


In [None]:
def objective(params):
    """Hyperopt loss for k-nearest neighbours: negative F1 on the validation split.

    Args:
        params: Keyword arguments for ``KNeighborsClassifier`` sampled from
            ``space`` (``n_neighbors``, ``weights``).

    Returns:
        ``-f1`` so that minimising the loss maximises validation F1. A
        validation F1 above 0.99 is mapped to the loss 1.0 instead —
        NOTE(review): apparently meant to reject suspiciously perfect fits;
        confirm intent.
    """
    clf = KNeighborsClassifier(**params)
    clf.fit(X_train_scaled, y_train)

    y_pred = clf.predict(X_val_scaled)
    f1 = f1_score(y_val, y_pred)
    return -f1 if f1 <= 0.99 else 1.00

# Small discrete grid: neighbourhood size and vote weighting scheme.
space = {
        'n_neighbors': hp.choice('n_neighbors', [2,3,4,5,6]),
        'weights': hp.choice('weights', ['uniform', 'distance'])
    }

# fmin() reports hp.choice parameters as *indices* into their option lists;
# space_eval() maps them back to the actual hyperparameter values.
best_trial = fmin(
    fn=objective, space=space, algo=tpe.suggest, max_evals=50)
best_params = space_eval(space, best_trial)

100%|██████████| 50/50 [01:57<00:00,  2.34s/trial, best loss: -0.8147295742232451]


In [None]:
# Display the result of the k-nearest-neighbours hyperparameter search.
best_params

{'n_neighbors': 0, 'weights': 1}

In [None]:
# Final k-nearest-neighbours model on the scaled features, evaluated on the
# held-out test split. n_neighbors and weights are hard-coded from a previous
# search run.
clf = KNeighborsClassifier(n_neighbors = 2, weights = 'distance')
%time clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
metrics_classification(y_test, y_pred)

CPU times: user 10 ms, sys: 1e+03 ns, total: 10 ms
Wall time: 11.8 ms
Precision : 0.83008
Recall: 0.81644
F1-measure: 0.82320
Accuracy: 0.71744


In [None]:
def objective(params):
    """Hyperopt loss for CatBoost: negative F1 on the validation split.

    Args:
        params: Keyword arguments for ``CatBoostClassifier`` sampled from
            ``space`` (``iterations``, ``learning_rate``, ``depth``).

    Returns:
        ``-f1`` so that minimising the loss maximises validation F1. A
        validation F1 above 0.99 is mapped to the loss 1.0 instead —
        NOTE(review): apparently meant to reject suspiciously perfect fits;
        confirm intent.
    """
    # verbose=False: per-iteration training logs from every trial otherwise
    # flood the cell output (it was being truncated at 5000 lines) and bury
    # the PARAMS line and hyperopt's progress bar.
    clf = CatBoostClassifier(verbose=False, **params)
    clf.fit(X_train, y_train)
    print(f'PARAMS = {params}')
    y_pred = clf.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    return -f1 if f1 <= 0.99 else 1.00

# Discrete grid over boosting rounds, learning rate and tree depth.
space = {
        'iterations': hp.choice('iterations', [100,200,300,400,500,600,700]),
        'learning_rate': hp.choice('learning_rate', [0.001, 0.05, 0.1]),
        'depth': hp.choice('depth', [4,5,6,7,8])
    }

# Only 10 evaluations here (the other searches use 50).
# fmin() reports hp.choice parameters as *indices* into their option lists;
# space_eval() maps them back to the actual hyperparameter values.
best_trial = fmin(
    fn=objective, space=space, algo=tpe.suggest, max_evals=10)
best_params = space_eval(space, best_trial)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
405:	learn: 0.1801018	total: 1m 35s	remaining: 45.9s

406:	learn: 0.1796567	total: 1m 36s	remaining: 45.6s

407:	learn: 0.1790318	total: 1m 36s	remaining: 45.4s

408:	learn: 0.1784029	total: 1m 36s	remaining: 45.1s

409:	learn: 0.1776746	total: 1m 36s	remaining: 44.9s

410:	learn: 0.1771847	total: 1m 37s	remaining: 44.6s

411:	learn: 0.1765776	total: 1m 37s	remaining: 44.4s

412:	learn: 0.1760848	total: 1m 37s	remaining: 44.1s

413:	learn: 0.1757677	total: 1m 37s	remaining: 43.9s

414:	learn: 0.1752659	total: 1m 37s	remaining: 43.6s

415:	learn: 0.1747974	total: 1m 38s	remaining: 43.4s

416:	learn: 0.1744061	total: 1m 38s	remaining: 43.1s

417:	learn: 0.1738183	total: 1m 38s	remaining: 42.9s

418:	learn: 0.1731633	total: 1m 38s	remaining: 42.6s

419:	learn: 0.1727457	total: 1m 38s	remaining: 42.4s

420:	learn: 0.1723002	total: 1m 39s	remaining: 42.1s

421:	learn: 0.1719432	total: 1m 39s	remaining: 41.9s



In [None]:
# Display the result of the CatBoost hyperparameter search.
best_params

{'depth': 4, 'iterations': 2, 'learning_rate': 1}

In [None]:
clf = CatBoostClassifier(depth = 8, iterations = 300, learning_rate = 0.05, class_weights={0:2,1:8})
%time clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
metrics_classification(y_test, y_pred)

0:	learn: 0.6690286	total: 4.33s	remaining: 21m 34s
1:	learn: 0.6481224	total: 6.67s	remaining: 16m 33s
2:	learn: 0.6298118	total: 9.37s	remaining: 15m 27s
3:	learn: 0.6128734	total: 11.4s	remaining: 14m 1s
4:	learn: 0.5942251	total: 13s	remaining: 12m 47s
5:	learn: 0.5794249	total: 14.7s	remaining: 11m 58s
6:	learn: 0.5640550	total: 16.4s	remaining: 11m 24s
7:	learn: 0.5535409	total: 18s	remaining: 10m 57s
8:	learn: 0.5436516	total: 19.7s	remaining: 10m 35s
9:	learn: 0.5340771	total: 21.8s	remaining: 10m 32s
10:	learn: 0.5246572	total: 24.4s	remaining: 10m 41s
11:	learn: 0.5158621	total: 26.4s	remaining: 10m 32s
12:	learn: 0.5046263	total: 28s	remaining: 10m 18s
13:	learn: 0.4961020	total: 29.6s	remaining: 10m 5s
14:	learn: 0.4881141	total: 31.3s	remaining: 9m 54s
15:	learn: 0.4832595	total: 32.9s	remaining: 9m 44s
16:	learn: 0.4779208	total: 34.5s	remaining: 9m 35s
17:	learn: 0.4725567	total: 36.8s	remaining: 9m 35s
18:	learn: 0.4661583	total: 39.4s	remaining: 9m 43s
19:	learn: 0.460