In [None]:
import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    ConfusionMatrixDisplay
)

import gdown



In [None]:
# Google Drive location of the reviews dataset.
file_id = "16PN5DupGCTol025Dmdahho6zYkynu2wj"
file_url = f"https://drive.google.com/uc?id={file_id}"

output = "wb_reviews.csv"
# The CSV is ~607 MB, so only fetch it when it is not already on disk.
# Delete the local file to force a fresh download.
if not os.path.exists(output):
    gdown.download(file_url, output, quiet=False)

df = pd.read_csv(output)


print("Первые 5 строк данных:")
print(df.head())
print("\nИнформация о данных:")
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
df.info()


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Downloading...
From (original): https://drive.google.com/uc?id=16PN5DupGCTol025Dmdahho6zYkynu2wj
From (redirected): https://drive.google.com/uc?id=16PN5DupGCTol025Dmdahho6zYkynu2wj&confirm=t&uuid=6fb6132a-cd25-4507-b82d-945892cf3460
To: /Users/berdov/ml/project/classification_of_reviews/baseline/linear_regression/wb_reviews.csv
100%|██████████| 607M/607M [00:50<00:00, 12.1MB/s] 


Первые 5 строк данных:
                        Name  \
0  Кроссовки NIKE AIR MAX SC   
1  Кроссовки NIKE AIR MAX SC   
2  Кроссовки NIKE AIR MAX SC   
3  Кроссовки NIKE AIR MAX SC   
4  Кроссовки NIKE AIR MAX SC   

                                         Description  \
0  Кроссовки выполнены из натуральной кожи с текс...   
1  Кроссовки выполнены из натуральной кожи с текс...   
2  Кроссовки выполнены из натуральной кожи с текс...   
3  Кроссовки выполнены из натуральной кожи с текс...   
4  Кроссовки выполнены из натуральной кожи с текс...   

                                              Review  Rating  
0  Отличные кроссовки, очень лёгкие, на ножке акк...      10  
1                       Красивые, стильные, удобные.      10  
2                          Все хорошо. Идет в размер      10  
3  Удобные и красивые, но вся грязь собирается на...       8  
4                            Удобные пока не носоли.      10  

Информация о данных:
<class 'pandas.core.frame.DataFrame'>
RangeInde

In [5]:

# Concatenate the three text columns into a single document per row.
# Missing values are replaced with '' first: `str + NaN` yields NaN, and
# TfidfVectorizer refuses NaN documents, so one empty field would otherwise
# break every downstream fit.
df['combined_text'] = (
    df['Name'].fillna('')
    + ' ' + df['Description'].fillna('')
    + ' ' + df['Review'].fillna('')
)

# Features (combined text) and regression target (rating)
X = df['combined_text']
y = df['Rating']

# Hold out 20% of the rows as a test set; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Factory for the baseline TF-IDF -> estimator pipeline
def create_pipeline(model):
    """Return a two-step Pipeline: a default TfidfVectorizer ('tfidf') followed by `model` ('model')."""
    steps = [
        ('tfidf', TfidfVectorizer()),
        ('model', model),
    ]
    return Pipeline(steps)

# Estimators to be tuned (library defaults; alpha is set by the grid search)
ridge, lasso = Ridge(), Lasso()

# Hyper-parameter grid for GridSearchCV; the `<step>__<param>` keys address
# the 'tfidf' and 'model' steps of the pipeline built by create_pipeline().
params = dict(
    tfidf__max_features=[1000, 5000],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    model__alpha=[0.01, 0.1, 1, 10, 100],
)

# Tune and evaluate Ridge and Lasso regression
# =============================================================================
def _tune_and_evaluate(pipeline, best_params_header, results_header):
    """Grid-search `pipeline` on the training split and score it on the test split.

    Uses the module-level `params` grid and the `X_train`/`y_train`/`X_test`/
    `y_test` split. Prints the best parameters and the test-set MSE, MAE and R².

    Args:
        pipeline: Pipeline with 'tfidf' and 'model' steps (see create_pipeline).
        best_params_header: Line printed before the best parameters.
        results_header: Line printed before the metric values.

    Returns:
        Tuple (fitted GridSearchCV, test predictions, mse, mae, r2).
    """
    grid = GridSearchCV(pipeline, params, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid.fit(X_train, y_train)

    print(best_params_header)
    print(grid.best_params_)

    # Predictions of the refit best estimator on the held-out test set
    y_pred = grid.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(results_header)
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")

    return grid, y_pred, mse, mae, r2


# Ridge regression
ridge_pipeline = create_pipeline(ridge)
ridge_grid, y_pred_ridge, ridge_mse, ridge_mae, ridge_r2 = _tune_and_evaluate(
    ridge_pipeline,
    "Лучшие параметры для Ridge:",
    "\nРезультаты для Ridge Regression:",
)

# Lasso regression
lasso_pipeline = create_pipeline(lasso)
lasso_grid, y_pred_lasso, lasso_mse, lasso_mae, lasso_r2 = _tune_and_evaluate(
    lasso_pipeline,
    "\nЛучшие параметры для Lasso:",
    "\nРезультаты для Lasso Regression:",
)

# Side-by-side comparison of the tuned Ridge and Lasso models on the test set
# =============================================================================
print("\nСравнение моделей:")
for metric_name, ridge_value, lasso_value in (
    ("MSE", ridge_mse, lasso_mse),
    ("MAE", ridge_mae, lasso_mae),
    ("R²", ridge_r2, lasso_r2),
):
    print(f"{metric_name} Ridge vs Lasso: {ridge_value:.4f} vs {lasso_value:.4f}")

# Alpha sweep for Ridge and Lasso
# =============================================================================
alphas = [0.001, 0.01, 0.1, 1, 10, 100]

# The TF-IDF features do not depend on alpha, so vectorize the corpus once
# instead of refitting an identical default TfidfVectorizer for every
# (model, alpha) pair. The resulting matrices are the same ones the per-alpha
# pipelines would have built.
alpha_vectorizer = TfidfVectorizer()
X_train_tfidf = alpha_vectorizer.fit_transform(X_train)
X_test_tfidf = alpha_vectorizer.transform(X_test)


def _alpha_sweep(estimator_cls):
    """Fit `estimator_cls(alpha=a)` for every `a` in `alphas`; return the test MSEs.

    Trains on the precomputed TF-IDF training matrix, evaluates on the
    precomputed TF-IDF test matrix, and prints one line per alpha.

    Args:
        estimator_cls: Regressor class accepting an `alpha` keyword (Ridge, Lasso).

    Returns:
        List of test-set MSE values, in the same order as `alphas`.
    """
    scores = []
    for alpha in alphas:
        estimator = estimator_cls(alpha=alpha)
        estimator.fit(X_train_tfidf, y_train)
        mse = mean_squared_error(y_test, estimator.predict(X_test_tfidf))
        scores.append(mse)
        print(f"Alpha: {alpha:.3f} | MSE: {mse:.4f}")
    return scores


# NOTE(review): these scores are measured on the test split, so they are only
# suitable for exploration, not for choosing alpha.
ridge_scores = _alpha_sweep(Ridge)
lasso_scores = _alpha_sweep(Lasso)

# Effect of the TF-IDF n-gram range (Ridge with alpha=1.0, 5000 features)
# =============================================================================
ngram_ranges = [(1,1), (1,2), (1,3)]
for ngram in ngram_ranges:
    vectorizer = TfidfVectorizer(ngram_range=ngram, max_features=5000)
    model = Ridge(alpha=1.0)
    pipeline = Pipeline(steps=[('tfidf', vectorizer), ('model', model)])
    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"ngram_range: {ngram} | MSE: {mse:.4f}")

Process LokyProcess-1:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.13/3.13.2/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.2/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/berdov/ml/myenv/lib/python3.13/site-packages/joblib/externals/loky/process_executor.py", line 481, in _process_worker
    if time() - _last_memory_leak_check > _MEMORY_LEAK_CHECK_DELAY:
       ~~~~^^
KeyboardInterrupt
Process LokyProcess-4:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.13/3.13.2/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/opt/homebrew/Cellar/pyt

KeyboardInterrupt: 

Пока не ясно, как это интерпретировать: выполнение ячейки выше завершилось по KeyboardInterrupt.