## Создание классических моделей машинного обучения

### Импорт собранных и обработанных данных

In [41]:
import pandas as pd

# Для mac
df_nlp = pd.read_csv(r'/Users/user/Documents/ML.csv')

# Для win
#df_nlp = pd.read_csv(r'C:\Users\User\Downloads\ML.csv'), если в начале лишних столбец с индексами

df_ml = df_nlp.drop(df_nlp.columns[[0,3,4,5,6,7,8,9,10]], axis = 1)

df_ml

Unnamed: 0,main_rubric,rubric,text_lemm,title_lemm
0,Происшествия,Происшествия,количество пострадать стройка краснознаменск р...,число пострадать обрушение строительный лес по...
1,Происшествия,Происшествия,инцидент происходить накануне лес устанавливат...,число пострадать обрушение лес краснознаменск ...
2,Происшествия,Происшествия,число пострадать обрушение строительный лес по...,экстренные служба число пострадать обрушение л...
3,Происшествия,Происшествия,напомним инцидент стройка подмосковный красноз...,число пострадать обрушение стройка подмосковье...
4,Происшествия,Происшествия,результате обрушение строительный лес краснозн...,число пострадать обрушение лес краснознаменск ...
...,...,...,...,...
3360,Наука,Космос,несмотря огромный сила гравитация солнце плане...,ученые объяснять почему планета сталкиваться с...
3361,Наука,Космос,главная причина который планета преодолевать м...,ученые рассказывать мешать планета сталкиватьс...
3362,Наука,Космос,сила гравитация удерживать планета солнечный с...,планеты преодолевать гравитация солнце сталкив...
3363,Наука,Космос,однако несмотря это планета держаться определе...,стало известно почему планета сталкиваться сол...


### Кодирование целевой переменной. Формирование тестовой и обучающей выборок

In [42]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

rubrics = ['Политика', 'Общество', 'Экономика', 'В мире', 'Спорт', 'Происшествия', 'Культура', 'Технологии', 'Наука']

encoder = LabelEncoder()
rubrics_list = df_ml['main_rubric'].to_list()
rubric_labels = encoder.fit_transform(rubrics_list)

X = df_ml['text_lemm']
y = rubric_labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
my_tags = rubrics

X_train.shape

(2355,)

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### Байесовский классификатор

In [44]:
from sklearn.naive_bayes import MultinomialNB

def nb_classifier():
    
    nb = Pipeline ([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB()),
                   ])

    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

nb_classifier()

accuracy 0.799009900990099
              precision    recall  f1-score   support

    Политика       0.00      0.00      0.00        35
    Общество       0.99      0.84      0.91       116
   Экономика       0.96      0.32      0.48        71
      В мире       0.67      0.92      0.77       197
       Спорт       0.93      0.80      0.86       121
Происшествия       1.00      0.49      0.66        67
    Культура       0.72      0.99      0.83       193
  Технологии       0.95      0.78      0.85        90
       Наука       0.84      0.93      0.88       120

    accuracy                           0.80      1010
   macro avg       0.78      0.68      0.70      1010
weighted avg       0.81      0.80      0.78      1010



0.8

### Метод опорных векторов

In [45]:
from sklearn.linear_model import SGDClassifier

def sgd_classifier():
    
    sgd = Pipeline ([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                    ])

    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

sgd_classifier()

accuracy 0.8811881188118812
              precision    recall  f1-score   support

    Политика       0.50      0.20      0.29        35
    Общество       0.94      0.93      0.94       116
   Экономика       0.93      0.77      0.85        71
      В мире       0.83      0.89      0.86       197
       Спорт       0.89      0.94      0.92       121
Происшествия       0.98      0.76      0.86        67
    Культура       0.90      0.97      0.94       193
  Технологии       0.87      0.86      0.86        90
       Наука       0.87      0.95      0.91       120

    accuracy                           0.88      1010
   macro avg       0.86      0.81      0.82      1010
weighted avg       0.88      0.88      0.87      1010



0.88

### Логистическая регрессия

In [46]:
from sklearn.linear_model import LogisticRegression

def logreg_classifier():

    logreg = Pipeline ([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                       ])

    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

logreg_classifier()

accuracy 0.8722772277227723
              precision    recall  f1-score   support

    Политика       0.48      0.29      0.36        35
    Общество       0.87      0.94      0.90       116
   Экономика       0.95      0.80      0.87        71
      В мире       0.82      0.87      0.84       197
       Спорт       0.91      0.88      0.90       121
Происшествия       0.96      0.79      0.87        67
    Культура       0.94      0.96      0.95       193
  Технологии       0.81      0.88      0.84        90
       Наука       0.86      0.91      0.88       120

    accuracy                           0.87      1010
   macro avg       0.84      0.81      0.82      1010
weighted avg       0.87      0.87      0.87      1010



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.87

### Дерево решений 

In [47]:
from sklearn.tree import DecisionTreeClassifier

def dtree_classifier():
    
    dtree = Pipeline ([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', DecisionTreeClassifier(max_depth = 100)),
                      ])

    dtree.fit(X_train, y_train)
    y_pred = dtree.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

dtree_classifier()

accuracy 0.6495049504950495
              precision    recall  f1-score   support

    Политика       0.30      0.23      0.26        35
    Общество       0.81      0.68      0.74       116
   Экономика       0.72      0.46      0.56        71
      В мире       0.42      0.76      0.54       197
       Спорт       0.85      0.70      0.77       121
Происшествия       0.77      0.45      0.57        67
    Культура       0.78      0.77      0.78       193
  Технологии       0.78      0.47      0.58        90
       Наука       0.84      0.67      0.74       120

    accuracy                           0.65      1010
   macro avg       0.70      0.58      0.62      1010
weighted avg       0.71      0.65      0.66      1010



0.65

### Метод K ближайщих соседей KNN

In [48]:
from sklearn.neighbors import KNeighborsClassifier

def knn_classifier():

    knn = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', KNeighborsClassifier(n_neighbors=3)),
                   ])

    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

knn_classifier()

accuracy 0.8613861386138614
              precision    recall  f1-score   support

    Политика       0.43      0.57      0.49        35
    Общество       0.84      0.89      0.87       116
   Экономика       0.90      0.76      0.82        71
      В мире       0.86      0.85      0.86       197
       Спорт       0.87      0.90      0.89       121
Происшествия       0.93      0.84      0.88        67
    Культура       0.94      0.97      0.95       193
  Технологии       0.86      0.77      0.81        90
       Наука       0.86      0.86      0.86       120

    accuracy                           0.86      1010
   macro avg       0.83      0.82      0.83      1010
weighted avg       0.87      0.86      0.86      1010



0.86

### Метод градиентного бустинга

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

def gb_classifier():
    
    gb = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', GradientBoostingClassifier(max_depth=2, n_estimators=150,random_state=12, learning_rate=1)),
                   ])

    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

gb_classifier()

accuracy 0.7
              precision    recall  f1-score   support

    Политика       0.30      0.26      0.28        35
    Общество       0.84      0.70      0.76       116
   Экономика       0.85      0.56      0.68        71
      В мире       0.46      0.81      0.58       197
       Спорт       0.82      0.75      0.78       121
Происшествия       0.70      0.52      0.60        67
    Культура       0.93      0.76      0.83       193
  Технологии       0.85      0.61      0.71        90
       Наука       0.86      0.76      0.81       120

    accuracy                           0.70      1010
   macro avg       0.73      0.64      0.67      1010
weighted avg       0.76      0.70      0.71      1010



0.7

### Случайный лес

In [50]:
from sklearn.ensemble import RandomForestClassifier


def rf_classifier():

    rf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', RandomForestClassifier(n_estimators=10, random_state=1))
                  ])

    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, target_names=my_tags))
    
    return round(accuracy_score(y_pred, y_test), 2)

rf_classifier()

accuracy 0.7504950495049505
              precision    recall  f1-score   support

    Политика       0.40      0.17      0.24        35
    Общество       0.82      0.78      0.80       116
   Экономика       0.88      0.52      0.65        71
      В мире       0.53      0.86      0.65       197
       Спорт       0.82      0.79      0.81       121
Происшествия       0.95      0.63      0.76        67
    Культура       0.92      0.84      0.88       193
  Технологии       0.85      0.63      0.73        90
       Наука       0.84      0.82      0.83       120

    accuracy                           0.75      1010
   macro avg       0.78      0.67      0.70      1010
weighted avg       0.79      0.75      0.75      1010



0.75

In [51]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

random_search = {'criterion': ['entropy', 'gini'],
               'max_depth': list(np.linspace(10, 1200, 10, dtype = int)) + [None],
               'max_features': ['auto', 'sqrt','log2', None],
               'min_samples_leaf': [4, 6, 8, 12],
               'min_samples_split': [5, 7, 10, 14],
               'n_estimators': list(np.linspace(151, 1200, 10, dtype = int))}

clf = RandomForestClassifier()
model = RandomizedSearchCV(estimator = clf, param_distributions = random_search, n_iter = 80, 
                               cv = 4, verbose= 5, random_state= 101, n_jobs = -1)

model.fit(X_train,y_train)

Fitting 4 folds for each of 80 candidates, totalling 320 fits


ValueError: 
All the 320 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 331, in fit
    X, y = self._validate_data(
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/pandas/core/series.py", line 872, in __array__
    return np.asarray(self._values, dtype)
ValueError: could not convert string to float: 'соответствующие цель выделять миллион евро сообщаться сайт минздрав страны'

--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 331, in fit
    X, y = self._validate_data(
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/Users/user/opt/anaconda3/lib/python3.9/site-packages/pandas/core/series.py", line 872, in __array__
    return np.asarray(self._values, dtype)
ValueError: could not convert string to float: 'планшет google pixel попадать рынок сша вместе док станцией'
