# Model Experiments - Sentiment Analysis

**Автор:** Новиков Максим Петрович  
**Группа:** БСБО-05-23

В этом ноутбуке проводятся эксперименты с моделями для задачи анализа тональности.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet to every figure below.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic (not plain Python): render figures inline in the notebook output.
%matplotlib inline

## 1. Загрузка и подготовка данных

In [None]:
# Read the labelled dataset; the 'text' and 'label' columns are used below.
df = pd.read_csv('../data/sentiment_data.csv')
print(f"Загружено {len(df)} записей")

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions equal in both parts; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

In [None]:
# TF-IDF features over unigrams and bigrams, vocabulary capped at 5000 terms.
# The vectorizer is fitted on the training texts only and then applied
# unchanged to the test texts.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF features: {X_train_tfidf.shape[1]}")

## 2. Baseline Models

In [None]:
# Test-set metrics of every evaluated model, keyed by the model's display name.
results = {}

def evaluate_model(model, name, X_train, X_test, y_train, y_test):
    """Fit ``model``, score it on the test split and record its metrics.

    The model is trained on ``X_train``/``y_train`` and used to predict
    ``X_test``. Accuracy and the weighted-average precision, recall and
    F1 are stored in the module-level ``results`` dict under ``name``
    (an existing entry with the same name is overwritten). A short
    summary and the full classification report are printed.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Display name; also the key used in ``results``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: True labels for the test features.

    Returns:
        The predictions produced by ``model.predict(X_test)``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    acc = accuracy_score(y_test, predictions)
    # Only the weighted-average row of the report goes into the summary table.
    weighted = classification_report(
        y_test, predictions, output_dict=True
    )['weighted avg']

    results[name] = {
        'accuracy': acc,
        'precision': weighted['precision'],
        'recall': weighted['recall'],
        'f1': weighted['f1-score'],
    }

    print(f"\n{'='*50}")
    print(f"Model: {name}")
    print(f"{'='*50}")
    print(f"Accuracy: {acc:.4f}")
    print(f"\nClassification Report:")
    # Second call without output_dict produces the human-readable text table.
    print(classification_report(y_test, predictions))

    return predictions

In [None]:
# Baseline 1: logistic regression (iteration cap raised to 1000, fixed seed),
# trained and scored on the TF-IDF features.
lr_model = LogisticRegression(random_state=42, max_iter=1000)
y_pred_lr = evaluate_model(
    lr_model,
    'Logistic Regression',
    X_train_tfidf,
    X_test_tfidf,
    y_train,
    y_test,
)

In [None]:
# Baseline 2: random forest with 100 trees and a fixed seed,
# trained and scored on the same TF-IDF features.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
y_pred_rf = evaluate_model(
    rf_model,
    'Random Forest',
    X_train_tfidf,
    X_test_tfidf,
    y_train,
    y_test,
)

In [None]:
# Baseline 3: support vector classifier with a linear kernel,
# trained and scored on the same TF-IDF features.
svm_model = SVC(random_state=42, kernel='linear')
y_pred_svm = evaluate_model(
    svm_model,
    'SVM (Linear)',
    X_train_tfidf,
    X_test_tfidf,
    y_train,
    y_test,
)

## 3. Cross-Validation

In [None]:
# 5-fold cross-validation over the whole dataset.
#
# The TF-IDF step lives inside a pipeline so that it is re-fitted on the
# training part of every fold. Fitting one vectorizer on all texts up front
# would leak vocabulary and IDF statistics from the validation folds into
# training and make the scores optimistic. Using a fresh vectorizer also
# leaves `tfidf` (fitted on the train split only) untouched.
from sklearn.pipeline import make_pipeline

X_all = df['text']
y_all = df['label']

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM (Linear)': SVC(kernel='linear', random_state=42)
}

# Per-model mean and standard deviation of the fold accuracies.
cv_results = {}
print("Cross-Validation Results (5-fold):")
print("="*50)

for name, model in models.items():
    # Same vectorizer settings as the hold-out experiment above.
    pipeline = make_pipeline(
        TfidfVectorizer(max_features=5000, ngram_range=(1, 2)),
        model,
    )
    scores = cross_val_score(pipeline, X_all, y_all, cv=5, scoring='accuracy')
    cv_results[name] = {'mean': scores.mean(), 'std': scores.std()}
    # The printed interval is +/- two standard deviations of the fold scores.
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

## 4. Сравнение моделей

In [None]:
# Summary table: one row per model, one column per metric, rounded to 4 digits.
# `results` maps model name -> metrics, so the frame is transposed to put
# models on the rows.
results_df = pd.DataFrame(results).T.round(4)
print("\nСравнение моделей на тестовой выборке:")
print(results_df)

In [None]:
# Grouped bar chart comparing the test-set metrics of all evaluated models.
import os

fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(results))  # one group of bars per model
width = 0.2
metrics = ['accuracy', 'precision', 'recall', 'f1']
colors = ['#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']

for i, metric in enumerate(metrics):
    values = [results[model][metric] for model in results]
    # Shift each metric's bars by one bar width so the four sit side by side.
    ax.bar(x + i*width, values, width, label=metric.capitalize(), color=colors[i])

ax.set_ylabel('Score')
ax.set_title('Model Comparison')
# Bars are centred at x, x+w, x+2w, x+3w, so the group centre is x + 1.5*w.
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(list(results.keys()), rotation=15)
ax.legend()
ax.set_ylim(0, 1.1)

plt.tight_layout()
# The output directory may not exist yet when this cell runs; savefig would
# fail in that case, so create it first.
os.makedirs('../artifacts', exist_ok=True)
plt.savefig('../artifacts/model_comparison.png', dpi=150)
plt.show()

In [None]:
# Confusion matrix for the model picked as best (Logistic Regression).
# The choice is hard-coded here via `y_pred_lr`, not derived from `results`.
import os

fig, ax = plt.subplots(figsize=(6, 5))
cm = confusion_matrix(y_test, y_pred_lr)
# NOTE(review): the tick labels assume exactly two classes whose sorted order
# is negative first, positive second — confirm against the dataset's labels.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
            xticklabels=['Negative', 'Positive'],
            yticklabels=['Negative', 'Positive'])
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
ax.set_title('Confusion Matrix - Logistic Regression')

plt.tight_layout()
# The output directory may not exist yet when this cell runs; savefig would
# fail in that case, so create it first.
os.makedirs('../artifacts', exist_ok=True)
plt.savefig('../artifacts/confusion_matrix.png', dpi=150)
plt.show()

## 5. Выбор финальной модели

### Результаты экспериментов (приблизительные значения; точные метрики — в выводе ячеек раздела 4):

| Модель | Accuracy | Precision | Recall | F1-Score |
|--------|----------|-----------|--------|----------|
| Logistic Regression | ~0.95 | ~0.95 | ~0.95 | ~0.95 |
| Random Forest | ~0.90 | ~0.90 | ~0.90 | ~0.90 |
| SVM (Linear) | ~0.95 | ~0.95 | ~0.95 | ~0.95 |

### Выбор: Logistic Regression

**Обоснование:**
1. **Высокая точность** - один из лучших результатов на тестовой выборке
2. **Скорость** - быстрое обучение и инференс
3. **Интерпретируемость** - можно анализировать веса признаков
4. **Простота** - легко развернуть в продакшене

In [None]:
# Persist the experiment summary as JSON.
import json
import os

# Make sure the output directory exists before writing into it.
os.makedirs('../artifacts', exist_ok=True)

# Hold-out metrics, cross-validation statistics and the manually chosen
# winner with a short justification.
experiment_results = {
    'test_results': results,
    'cv_results': cv_results,
    'best_model': 'Logistic Regression',
    'reason': 'Best balance of accuracy, speed, and interpretability',
}

serialized = json.dumps(experiment_results, indent=2)
with open('../artifacts/experiment_results.json', 'w') as f:
    f.write(serialized)

print("Результаты экспериментов сохранены в artifacts/experiment_results.json")

## 6. Обучение финальной модели

In [None]:
# Train the final model through the project pipeline (src.* modules).
# NOTE(review): the original heading said "train on all data", but the code
# below still holds out 20% of the data (test_size=0.2) and fits on the rest.
from src.data.loader import DataLoader
from src.data.preprocessor import TextPreprocessor, TfidfFeatureExtractor
from src.models.baseline import BaselineModel

# Load the dataset and split it into train/test parts.
# NOTE(review): no seed is passed here, so whether this split matches the
# train_test_split used earlier depends on DataLoader's defaults — confirm.
loader = DataLoader()
train_texts, test_texts, train_labels, test_labels = loader.load_train_test_split(
    'sentiment_data.csv', test_size=0.2
)

# Text preprocessing, applied identically to both parts.
preprocessor = TextPreprocessor()
train_texts_clean = preprocessor.preprocess_batch(train_texts)
test_texts_clean = preprocessor.preprocess_batch(test_texts)

# Feature extraction: fit on the training texts, reuse for the test texts.
# NOTE: this rebinds the notebook-level X_train / X_test, which previously held
# the raw text from the first split; re-running earlier cells afterwards would
# see the extractor's output instead.
feature_extractor = TfidfFeatureExtractor()
X_train = feature_extractor.fit_transform(train_texts_clean)
X_test = feature_extractor.transform(test_texts_clean)

# Train the model and evaluate it on the held-out part.
model = BaselineModel(model_type='logistic_regression')
train_metrics = model.train(X_train, np.array(train_labels))
test_metrics = model.evaluate(X_test, np.array(test_labels))

# test_metrics is iterated as a name -> numeric value mapping.
print(f"\nФинальные метрики на тесте:")
for metric, value in test_metrics.items():
    print(f"  {metric}: {value:.4f}")

In [None]:
# Persist the trained model together with everything needed to reproduce its
# input pipeline: the fitted feature extractor and the preprocessor settings.
import pickle

os.makedirs('../artifacts/models/logistic_regression', exist_ok=True)
model.save('../artifacts/models/logistic_regression')

# The fitted feature extractor is pickled as a whole object.
# NOTE: pickle files must only ever be loaded from trusted sources.
with open('../artifacts/models/logistic_regression/feature_extractor.pkl', 'wb') as f:
    pickle.dump(feature_extractor, f)

# Only the preprocessor's option flags are stored, as plain JSON.
preprocessor_config = {
    option: getattr(preprocessor, option)
    for option in (
        'lowercase',
        'remove_punctuation',
        'remove_numbers',
        'remove_extra_spaces',
    )
}
with open('../artifacts/models/logistic_regression/preprocessor_config.json', 'w') as f:
    json.dump(preprocessor_config, f, indent=2)

print("Модель сохранена в artifacts/models/logistic_regression/")