## Создание классических моделей машинного обучения

### Импорт собранных и обработанных данных

In [4]:
import pandas as pd

# Для mac
df_nlp = pd.read_csv(r'/Users/user/Documents/ML.csv')

# Для win
#df_ml = pd.read_csv(r'C:\Users\User\Downloads\ML.csv')

df_ml = df_nlp.drop(df_nlp.columns[[0,2,3,4,5,6,7,8,9]], axis = 1)
df_ml

Unnamed: 0,rubric,text_lemm,title_lemm
0,Политика,начинаться дальнейший переговорный процесс рос...,песков начало переговоры украина нужный полити...
1,Политика,официальный представитель кремль дмитрий песо...,песок москва ожидать готовность киев обсуждат...
2,Политика,представитель кремль дмитрий песок сообщать о...,кремль объяснять почему продолжаться спецопер...
3,Политика,представитель кремль заявлять помимо преодоле...,кремль начало переговоры украина нужный полит...
4,Политика,официальный представитель кремль дмитрий песо...,песок начинать переговоры россия украина нужн...
...,...,...,...
256325,Наука,стационарный посадочный модуль insight достав...,американский посадочный модуль insight марс п...
256326,Наука,устройство позволять выращивать растение косм...,устройство развитие растение космос участие ч...
256327,Наука,новый японский ракета носитель тяжелый класс ...,первый запуск новый японский ракета носитель ...
256328,Наука,первый запуск новый ракета носитель h который...,первый запуск новый японский ракета h состоят...


### Кодирование целевой переменной. Формирование тестовой и обучающей выборок

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

encoder = LabelEncoder()
rubrics_list = df_ml['rubric'].to_list()
rubric_labels = encoder.fit_transform(rubrics_list)

X = df_ml['text_lemm']
y = rubric_labels

rubrics = ['Политика', 'Общество', 'Экономика', 'В мире', 'Спорт', 'Происшествия', 'Культура', 'Технологии', 'Наука']
my_tags = rubrics

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


#X_train.shape

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### Байесовский классификатор

In [7]:
from sklearn.naive_bayes import MultinomialNB

def nb_classifier():
    
    nb = Pipeline ([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB()),
                   ])

    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

nb_classifier()

accuracy 0.8093733338534962
              precision    recall  f1-score   support

    Политика       0.80      0.10      0.18      4572
    Общество       0.89      0.82      0.85      4418
   Экономика       0.93      0.61      0.74      2113
      В мире       0.68      0.75      0.71     14627
       Спорт       0.75      0.95      0.84     19364
Происшествия       0.89      0.92      0.90     12924
    Культура       0.98      0.94      0.96      9432
  Технологии       0.87      0.50      0.64      2351
       Наука       0.85      0.78      0.82      7098

    accuracy                           0.81     76899
   macro avg       0.85      0.71      0.74     76899
weighted avg       0.82      0.81      0.79     76899



0.81

### Метод опорных векторов

In [8]:
from sklearn.linear_model import SGDClassifier

def sgd_classifier():
    
    sgd = Pipeline ([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, 
                                           max_iter=5, tol=None)),
                    ])

    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

sgd_classifier()

accuracy 0.7364985240380238
              precision    recall  f1-score   support

    Политика       0.74      0.03      0.06      4572
    Общество       0.82      0.80      0.81      4418
   Экономика       0.83      0.73      0.78      2113
      В мире       0.81      0.35      0.49     14627
       Спорт       0.66      0.95      0.78     19364
Происшествия       0.70      0.93      0.80     12924
    Культура       0.90      0.95      0.92      9432
  Технологии       0.83      0.56      0.67      2351
       Наука       0.73      0.79      0.76      7098

    accuracy                           0.74     76899
   macro avg       0.78      0.68      0.67     76899
weighted avg       0.76      0.74      0.70     76899



0.74

### Логистическая регрессия

In [9]:
from sklearn.linear_model import LogisticRegression

def logreg_classifier():

    logreg = Pipeline ([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                       ])

    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

logreg_classifier()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy 0.8460708201667122
              precision    recall  f1-score   support

    Политика       0.59      0.53      0.56      4572
    Общество       0.90      0.88      0.89      4418
   Экономика       0.86      0.85      0.85      2113
      В мире       0.74      0.78      0.76     14627
       Спорт       0.87      0.88      0.87     19364
Происшествия       0.92      0.92      0.92     12924
    Культура       0.97      0.96      0.96      9432
  Технологии       0.80      0.77      0.78      2351
       Наука       0.85      0.83      0.84      7098

    accuracy                           0.85     76899
   macro avg       0.83      0.82      0.83     76899
weighted avg       0.85      0.85      0.85     76899



0.85

### Дерево решений 

In [17]:
from sklearn.tree import DecisionTreeClassifier

def dtree_classifier():
    
    dtree = Pipeline ([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', DecisionTreeClassifier(max_depth = 100)),
                      ])

    dtree.fit(X_train, y_train)
    y_pred = dtree.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

dtree_classifier()

accuracy 0.686966020364374
              precision    recall  f1-score   support

    Политика       0.50      0.30      0.38      4572
    Общество       0.85      0.57      0.68      4418
   Экономика       0.87      0.54      0.67      2113
      В мире       0.42      0.78      0.55     14627
       Спорт       0.83      0.75      0.79     19364
Происшествия       0.86      0.71      0.78     12924
    Культура       0.96      0.82      0.88      9432
  Технологии       0.70      0.32      0.44      2351
       Наука       0.77      0.59      0.67      7098

    accuracy                           0.69     76899
   macro avg       0.75      0.60      0.65     76899
weighted avg       0.75      0.69      0.70     76899



0.69

### Метод K ближайщих соседей KNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier

def knn_classifier():

    knn = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', KNeighborsClassifier(n_neighbors=3)),
                   ])

    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

knn_classifier()

accuracy 0.870921598460318
              precision    recall  f1-score   support

    Политика       0.63      0.60      0.62      4572
    Общество       0.87      0.88      0.88      4418
   Экономика       0.86      0.84      0.85      2113
      В мире       0.81      0.81      0.81     14627
       Спорт       0.89      0.92      0.90     19364
Происшествия       0.92      0.94      0.93     12924
    Культура       0.97      0.96      0.96      9432
  Технологии       0.87      0.73      0.79      2351
       Наука       0.89      0.86      0.87      7098

    accuracy                           0.87     76899
   macro avg       0.86      0.84      0.85     76899
weighted avg       0.87      0.87      0.87     76899



0.87

### Метод градиентного бустинга

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

def gb_classifier():
    
    gb = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', GradientBoostingClassifier(max_depth=2, n_estimators=150,
                                                       random_state=12, learning_rate=1)),
                   ])

    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

gb_classifier()

accuracy 0.6430642791193644
              precision    recall  f1-score   support

    Политика       0.45      0.32      0.38      4572
    Общество       0.80      0.54      0.64      4418
   Экономика       0.75      0.49      0.59      2113
      В мире       0.40      0.56      0.46     14627
       Спорт       0.70      0.79      0.74     19364
Происшествия       0.77      0.76      0.76     12924
    Культура       0.94      0.82      0.88      9432
  Технологии       0.63      0.28      0.39      2351
       Наука       0.62      0.38      0.47      7098

    accuracy                           0.64     76899
   macro avg       0.67      0.55      0.59     76899
weighted avg       0.67      0.64      0.64     76899



0.64

### Случайный лес

In [13]:
from sklearn.ensemble import RandomForestClassifier


def rf_classifier():

    rf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', RandomForestClassifier(n_estimators=10, random_state=1))
                  ])

    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

rf_classifier()

accuracy 0.803612530722116
              precision    recall  f1-score   support

    Политика       0.64      0.33      0.44      4572
    Общество       0.84      0.81      0.83      4418
   Экономика       0.83      0.76      0.79      2113
      В мире       0.67      0.72      0.69     14627
       Спорт       0.80      0.90      0.85     19364
Происшествия       0.87      0.89      0.88     12924
    Культура       0.96      0.93      0.95      9432
  Технологии       0.80      0.60      0.68      2351
       Наука       0.82      0.76      0.79      7098

    accuracy                           0.80     76899
   macro avg       0.80      0.74      0.77     76899
weighted avg       0.80      0.80      0.80     76899



0.8

### Стекинг

In [14]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

def stack_classifier():
    estimators = [('lr', LogisticRegression()), ('dt', DecisionTreeClassifier())]

    stack = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', StackingClassifier(estimators=estimators, final_estimator=SVC()))
                     ])

    stack.fit(X_train, y_train)
    y_pred = stack.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

stack_classifier()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

accuracy 0.8493998621568551
              precision    recall  f1-score   support

    Политика       0.65      0.45      0.53      4572
    Общество       0.87      0.90      0.89      4418
   Экономика       0.83      0.86      0.85      2113
      В мире       0.76      0.76      0.76     14627
       Спорт       0.86      0.91      0.88     19364
Происшествия       0.92      0.91      0.92     12924
    Культура       0.97      0.96      0.96      9432
  Технологии       0.78      0.78      0.78      2351
       Наука       0.82      0.87      0.84      7098

    accuracy                           0.85     76899
   macro avg       0.83      0.82      0.82     76899
weighted avg       0.85      0.85      0.85     76899



0.85

### Бэггинг

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

def bag_classifier():
    bag = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', BaggingClassifier(base_estimator=LogisticRegression(), 
                                              n_estimators=50, random_state=12))
                   ])

    bag.fit(X_train, y_train)
    y_pred = bag.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

bag_classifier()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy 0.8455116451449304
              precision    recall  f1-score   support

    Политика       0.65      0.43      0.52      4572
    Общество       0.89      0.87      0.88      4418
   Экономика       0.88      0.82      0.85      2113
      В мире       0.74      0.78      0.76     14627
       Спорт       0.85      0.90      0.88     19364
Происшествия       0.91      0.92      0.92     12924
    Культура       0.98      0.95      0.96      9432
  Технологии       0.82      0.74      0.78      2351
       Наука       0.84      0.84      0.84      7098

    accuracy                           0.85     76899
   macro avg       0.84      0.81      0.82     76899
weighted avg       0.84      0.85      0.84     76899



0.85

### Адаптивный бустинг

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

def adb_classifier():
    adb = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), 
                                               n_estimators=100, random_state=12))
                   ])

    adb.fit(X_train, y_train)
    y_pred = adb.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

adb_classifier()

accuracy 0.6721543843222929
              precision    recall  f1-score   support

    Политика       0.47      0.24      0.32      4572
    Общество       0.77      0.73      0.75      4418
   Экономика       0.76      0.67      0.71      2113
      В мире       0.41      0.53      0.46     14627
       Спорт       0.73      0.79      0.76     19364
Происшествия       0.77      0.69      0.73     12924
    Культура       0.95      0.89      0.92      9432
  Технологии       0.63      0.51      0.56      2351
       Наука       0.67      0.64      0.66      7098

    accuracy                           0.67     76899
   macro avg       0.68      0.63      0.65     76899
weighted avg       0.68      0.67      0.67     76899



0.67