## 1. Логистическая регрессия 

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Загрузка данных
data = pd.read_csv('./data/train.csv')

# Функция предобработки текста
def preprocess_text(text):
    text = text.lower()  # Приведение к нижнему регистру
    text = re.sub(r'\W', ' ', text)  # Удаление специальных символов
    text = re.sub(r'\s+', ' ', text)  # Удаление лишних пробелов
    return text.strip()

# Применение функции предобработки
data['Processed_Description'] = data['Description'].apply(preprocess_text)

# Векторизация текста
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data['Processed_Description'])
y = data['Label'].values

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучение модели логистической регрессии 
# (добавляем параллелизм с помощью n_jobs=-1, используем все доступные ядра процессора для параллельной работы. 
# Меняем алгоритм, используемый для оптимизации, более подходящий для большого набора данных)
lr_model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1, solver = 'sag')
lr_model.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred_lr = lr_model.predict(X_test)

# Расчёт метрик
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr, recall_lr, f1_lr, _ = precision_recall_fscore_support(y_test, y_pred_lr, average='macro')

print(f"Accuracy (Logistic Regression): {accuracy_lr}")
print(f"Precision (Logistic Regression): {precision_lr}")
print(f"Recall (Logistic Regression): {recall_lr}")
print(f"F1-Score (Logistic Regression): {f1_lr}")

Accuracy (Logistic Regression): 0.9623223532003966
Precision (Logistic Regression): 0.9643066995557898
Recall (Logistic Regression): 0.9609596679902497
F1-Score (Logistic Regression): 0.9625422259388425


## 2. Наивный байесовский алгоритм

In [3]:
from sklearn.naive_bayes import MultinomialNB

# Обучение модели
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred_nb = nb_model.predict(X_test)

# Расчёт метрик
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb, recall_nb, f1_nb, _ = precision_recall_fscore_support(y_test, y_pred_nb, average='macro')

print(f"Accuracy (Naive Bayes): {accuracy_nb}")
print(f"Precision (Naive Bayes): {precision_nb}")
print(f"Recall (Naive Bayes): {recall_nb}")
print(f"F1-Score (Naive Bayes): {f1_nb}")

Accuracy (Naive Bayes): 0.9423818442216592
Precision (Naive Bayes): 0.9464542261962015
Recall (Naive Bayes): 0.9416636580851345
F1-Score (Naive Bayes): 0.9437281181690276


## 3. Метод опорных векторов

In [4]:
from sklearn.svm import SVC

# Обучение модели SVM
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred_svm = svm_model.predict(X_test)

# Расчёт метрик для SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm, recall_svm, f1_svm, _ = precision_recall_fscore_support(y_test, y_pred_svm, average='macro')

print(f"Accuracy (SVM): {accuracy_svm}")
print(f"Precision (SVM): {precision_svm}")
print(f"Recall (SVM): {recall_svm}")
print(f"F1-Score (SVM): {f1_svm}")

Accuracy (SVM): 0.9668392640740333
Precision (SVM): 0.9684446967618542
Recall (SVM): 0.9660815641499675
F1-Score (SVM): 0.9672255624577506


## 4. Случайный лес

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Обучение модели случайного леса (для ускорения можно уменьшить число деревьев)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred_rf = rf_model.predict(X_test)

# Расчёт метрик для случайного леса
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(y_test, y_pred_rf, average='macro')

print(f"Accuracy (Random Forest): {accuracy_rf}")
print(f"Precision (Random Forest): {precision_rf}")
print(f"Recall (Random Forest): {recall_rf}")
print(f"F1-Score (Random Forest): {f1_rf}")

Accuracy (Random Forest): 0.9659579156108846
Precision (Random Forest): 0.970031676433625
Recall (Random Forest): 0.963211701230934
F1-Score (Random Forest): 0.9664102636093996


## 5. Градиентный бустинг

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

# Обучение модели градиентного бустинга
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)
gb_model.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred_gb = gb_model.predict(X_test)

# Расчёт метрик для градиентного бустинга
accuracy_gb = accuracy_score(y_test, y_pred_gb)
precision_gb, recall_gb, f1_gb, _ = precision_recall_fscore_support(y_test, y_pred_gb, average='macro')

print(f"Accuracy (Gradient Boosting): {accuracy_gb}")
print(f"Precision (Gradient Boosting): {precision_gb}")
print(f"Recall (Gradient Boosting): {recall_gb}")
print(f"F1-Score (Gradient Boosting): {f1_gb}")

Accuracy (Gradient Boosting): 0.9535088685689105
Precision (Gradient Boosting): 0.9577865699443346
Recall (Gradient Boosting): 0.9510645023618062
F1-Score (Gradient Boosting): 0.9542555538411515


## 6. XGBoost

In [7]:
import xgboost as xgb

# Обучение модели XGBoost
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=1.0, use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred_xgb = xgb_model.predict(X_test)

# Расчёт метрик
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb, recall_xgb, f1_xgb, _ = precision_recall_fscore_support(y_test, y_pred_xgb, average='macro')

print(f"Accuracy (XGBoost): {accuracy_xgb}")
print(f"Precision (XGBoost): {precision_xgb}")
print(f"Recall (XGBoost): {recall_xgb}")
print(f"F1-Score (XGBoost): {f1_xgb}")

Accuracy (XGBoost): 0.9706951636003085
Precision (XGBoost): 0.9725626074535048
Recall (XGBoost): 0.9694491765556488
F1-Score (XGBoost): 0.970965531326612
