# Линейная регрессия

В ноутбуке рассмотрены Ridge и Lasso регрессии с подбором лучших параметров. Выводы сформулированы в файле .md.

In [142]:
import pandas as pd
import numpy as np
import re
import random
import emoji

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, Lasso

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import optuna
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

## Результаты для WB

In [143]:
df = pd.read_csv("wb_reviews.csv")
cols = list(df.columns)
new_names = {
    cols[0]: 'Name',
    cols[1]: 'Description',
    cols[2]: 'Review',
    cols[3]: 'Rating'
}
df = df.rename(columns=new_names)
df = df.sample(n=50000, random_state=42)
print(df.head())
df['Review'] = df['Review'].fillna('').astype(str)

                                                     Name  \
28749             Ботинки зимние Аскет с МП рaбочая oбувь   
129928     Полотенце банное махровое набор 2 шт в подарок   
14279             Сироп Карамель для кофе и десертов 1 л.   
35226   Рубашка приталенная на кнопках с воротником ст...   
106728            Носки высокие набор хлопок черные 6 пар   

                                              Description  \
28749   Ботинки мужские зимние для работы "Аскет" подо...   
129928  Полотенце банное махровое Greece от бренда Spa...   
14279   Сироп Barinoff Карамель - классический универс...   
35226   Рубашки маломерят на пол размера. Превосходная...   
106728  Носки мужские набор 6 пар в стильной подарочно...   

                                                   Review  Rating  
28749                             Тёплые ,удобные,хорошие       5  
129928  большие, пушистые, впитывают, тяжелые, хор смо...       5  
14279                                                 Все     

In [144]:
def preprocess(text):
    text = re.sub(r'[^а-яА-Яa-zA-Z0-9\s]', '', text)
    return text.lower()

def create_meta_features(df):
    df = df.copy()
    df['text_length'] = df['Review'].apply(lambda x: len(str(x)))
    df['emoji_count'] = df['Review'].apply(lambda x: len(emoji.emoji_list(str(x))))
    df['sentiment'] = df['Review'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    return df[['text_length', 'emoji_count', 'sentiment']]


text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000, max_df=0.95, min_df=3)),
    ('svd', TruncatedSVD(n_components=300))
])

preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'Review'),
    ('meta', FunctionTransformer(create_meta_features), ['Review']) 
])


df['Review'] = df['Review'].fillna('').astype(str).apply(preprocess)

if df['Rating'].between(1, 5).all():
    df['Rating'] *= 2

X = df[['Review']] 
y = df['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def optimize_model(model_cls, model_name):
    def objective(trial):
        alpha = trial.suggest_float('alpha', 0.01, 100.0, log=True)
        model = model_cls(alpha=alpha)

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        score = cross_val_score(pipeline, X_train, y_train, cv=3,
                                scoring='neg_mean_squared_error', n_jobs=-1)
        return -np.mean(score)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20)
    print(f"Лучший alpha для {model_name}: {study.best_params['alpha']:.4f}")
    return model_cls(alpha=study.best_params['alpha'])

best_ridge = optimize_model(Ridge, "Ridge")
best_lasso = optimize_model(Lasso, "Lasso")

for name, model in {'Ridge': best_ridge, 'Lasso': best_lasso}.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print(f"\n{name.upper()}:")
    print(f"Alpha: {model.alpha:.4f}")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"R²: {r2_score(y_test, y_pred):.4f}")




[I 2025-04-13 16:50:50,079] A new study created in memory with name: no-name-2977aa73-74c9-4500-93de-954846368f3e
Python(61955) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61956) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61957) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61958) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61959) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61960) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61961) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(61962) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
[I 2025-04-13 16:51:00,510] Trial 0 finished with value: 3.49400505592396 and parameters: {'alpha': 31

Лучший alpha для Ridge: 0.0125


[I 2025-04-13 16:53:26,358] Trial 0 finished with value: 4.885277188646292 and parameters: {'alpha': 0.5748432650033207}. Best is trial 0 with value: 4.885277188646292.
[I 2025-04-13 16:53:36,778] Trial 1 finished with value: 4.891015130043972 and parameters: {'alpha': 5.774976980830497}. Best is trial 0 with value: 4.885277188646292.
[I 2025-04-13 16:53:47,203] Trial 2 finished with value: 4.885225440541584 and parameters: {'alpha': 0.10858304024948288}. Best is trial 2 with value: 4.885225440541584.
[I 2025-04-13 16:53:57,104] Trial 3 finished with value: 4.598036629232803 and parameters: {'alpha': 0.019915057875615164}. Best is trial 3 with value: 4.598036629232803.
[I 2025-04-13 16:54:07,301] Trial 4 finished with value: 4.885224360706119 and parameters: {'alpha': 0.053889546754376454}. Best is trial 3 with value: 4.598036629232803.
[I 2025-04-13 16:54:16,537] Trial 5 finished with value: 4.885224349441592 and parameters: {'alpha': 0.052759272042577354}. Best is trial 3 with value:

Лучший alpha для Lasso: 0.0102

RIDGE:
Alpha: 0.0125
MSE: 3.2839
MAE: 1.2146
R²: 0.3444

LASSO:
Alpha: 0.0102
MSE: 4.4474
MAE: 1.4250
R²: 0.1120


## Результаты для Lamoda

In [145]:
df = pd.read_csv("lamoda_reviews.csv")
df = df.sample(n=50000, random_state=42)
print(df.head())
df['Review'] = df['Review'].fillna('').astype(str)

                                                    Name  \
27286                                               Боди   
77907                      Брюки спортивные TIRO23 P PNT   
76962                                           Кардиган   
77131                                              Сумка   
70008  Гель для бровей фиксирующий, c эффектом ламини...   

                                             Description  \
27286  Обратите внимание: Эффект Double push-up боди ...   
77907  Футбольные брюки adidas изготовлены из эластич...   
76962  Данный товар является частью проекта Lamoda pl...   
77131  Сумка выполнена из натуральной кожи. Детали: о...   
70008  Гель фиксатор для бровей, красота и уход для ж...   

                                                  Review  Rating  
27286              Очень неудобно надевать, но красивое.       5  
77907                                   Хорошее качество       5  
76962  Красивый кардиган, плетение по всей длине, и н...       5  
77131     

In [146]:
def preprocess(text):
    text = re.sub(r'[^а-яА-Яa-zA-Z0-9\s]', '', text)
    return text.lower()

def create_meta_features(df):
    df = df.copy()
    df['text_length'] = df['Review'].apply(lambda x: len(str(x)))
    df['emoji_count'] = df['Review'].apply(lambda x: len(emoji.emoji_list(str(x))))
    df['sentiment'] = df['Review'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    return df[['text_length', 'emoji_count', 'sentiment']]


text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000, max_df=0.95, min_df=3)),
    ('svd', TruncatedSVD(n_components=300))
])

preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'Review'),
    ('meta', FunctionTransformer(create_meta_features), ['Review']) 
])


df['Review'] = df['Review'].fillna('').astype(str).apply(preprocess)

if df['Rating'].between(1, 5).all():
    df['Rating'] *= 2

X = df[['Review']] 
y = df['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def optimize_model(model_cls, model_name):
    def objective(trial):
        alpha = trial.suggest_float('alpha', 0.01, 100.0, log=True)
        model = model_cls(alpha=alpha)

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        score = cross_val_score(pipeline, X_train, y_train, cv=3,
                                scoring='neg_mean_squared_error', n_jobs=-1)
        return -np.mean(score)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20)
    print(f"Лучший alpha для {model_name}: {study.best_params['alpha']:.4f}")
    return model_cls(alpha=study.best_params['alpha'])

best_ridge = optimize_model(Ridge, "Ridge")
best_lasso = optimize_model(Lasso, "Lasso")

for name, model in {'Ridge': best_ridge, 'Lasso': best_lasso}.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print(f"\n{name.upper()}:")
    print(f"Alpha: {model.alpha:.4f}")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"R²: {r2_score(y_test, y_pred):.4f}")

[I 2025-04-13 16:56:35,330] A new study created in memory with name: no-name-7a0fd42b-a19b-473e-b359-747473b73f2b
[I 2025-04-13 16:56:42,916] Trial 0 finished with value: 1.3183989156471305 and parameters: {'alpha': 0.15346457408895084}. Best is trial 0 with value: 1.3183989156471305.
[I 2025-04-13 16:56:50,409] Trial 1 finished with value: 1.339350466217714 and parameters: {'alpha': 8.90799065889988}. Best is trial 0 with value: 1.3183989156471305.
[I 2025-04-13 16:56:57,614] Trial 2 finished with value: 1.3245009547429676 and parameters: {'alpha': 2.0475218918397995}. Best is trial 0 with value: 1.3183989156471305.
[I 2025-04-13 16:57:05,049] Trial 3 finished with value: 1.325880382855117 and parameters: {'alpha': 0.05647667758417339}. Best is trial 0 with value: 1.3183989156471305.
[I 2025-04-13 16:57:12,249] Trial 4 finished with value: 1.3486240865893258 and parameters: {'alpha': 12.101736799619884}. Best is trial 0 with value: 1.3183989156471305.
[I 2025-04-13 16:57:19,427] Trial

Лучший alpha для Ridge: 0.3560


[I 2025-04-13 16:59:10,896] Trial 0 finished with value: 1.6734826718544105 and parameters: {'alpha': 0.8237154647150576}. Best is trial 0 with value: 1.6734826718544105.
[I 2025-04-13 16:59:19,179] Trial 1 finished with value: 1.6734027991456868 and parameters: {'alpha': 0.6748110804554713}. Best is trial 1 with value: 1.6734027991456868.
[I 2025-04-13 16:59:26,796] Trial 2 finished with value: 1.7353195667770738 and parameters: {'alpha': 60.40402867632997}. Best is trial 1 with value: 1.6734027991456868.
[I 2025-04-13 16:59:34,505] Trial 3 finished with value: 1.6741078532531979 and parameters: {'alpha': 1.5576701496791672}. Best is trial 1 with value: 1.6734027991456868.
[I 2025-04-13 16:59:41,754] Trial 4 finished with value: 1.6733964960895742 and parameters: {'alpha': 0.6616385976517912}. Best is trial 4 with value: 1.6733964960895742.
[I 2025-04-13 16:59:49,236] Trial 5 finished with value: 1.6890004795134244 and parameters: {'alpha': 6.640692927110633}. Best is trial 4 with val

Лучший alpha для Lasso: 0.0102

RIDGE:
Alpha: 0.3560
MSE: 1.3427
MAE: 0.6271
R²: 0.2461

LASSO:
Alpha: 0.0102
MSE: 1.6883
MAE: 0.7008
R²: 0.0520


## Результаты для Mustapp

In [147]:
df = pd.read_csv("mustapp_reviews_total.csv")

cols = list(df.columns)
new_names = {
    cols[1]: 'Name',
    cols[2]: 'Description',
    cols[3]: 'Review',
    cols[4]: 'Rating'
}
df = df.rename(columns=new_names)
df = df.sample(n=50000, random_state=42)
print(df.head())
df['Review'] = df['Review'].fillna('').astype(str)

        Mustapp page ID                          Name  \
24186               297            Дьявол носит Prada   
169000            13997                         Сыщик   
154902             9966  Эффект бабочки 3: Откровения   
81260              2084                      Искатели   
99287              4083                         Обман   

                                              Description  \
24186   Мечтающая стать журналисткой провинциальная де...   
169000                                                NaN   
154902                                                NaN   
81260   Джон Уэйн играет бывшего солдата армии конфеде...   
99287   Талантливый хоккеист Крис Пратт, подававший бо...   

                                                   Review  Rating  
24186   погоня за мнимым счастьем, так называемым «рае...       9  
169000  в общем, вывод один  - если в один прекрасный ...       7  
154902              Классный фильм. Рекомендую.\n              10  
81260   всю жизнь 

In [148]:
def preprocess(text):
    text = re.sub(r'[^а-яА-Яa-zA-Z0-9\s]', '', text)
    return text.lower()

def create_meta_features(df):
    df = df.copy()
    df['text_length'] = df['Review'].apply(lambda x: len(str(x)))
    df['emoji_count'] = df['Review'].apply(lambda x: len(emoji.emoji_list(str(x))))
    df['sentiment'] = df['Review'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    return df[['text_length', 'emoji_count', 'sentiment']]


text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000, max_df=0.95, min_df=3)),
    ('svd', TruncatedSVD(n_components=300))
])

preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'Review'),
    ('meta', FunctionTransformer(create_meta_features), ['Review']) 
])


df['Review'] = df['Review'].fillna('').astype(str).apply(preprocess)

if df['Rating'].between(1, 5).all():
    df['Rating'] *= 2

X = df[['Review']] 
y = df['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def optimize_model(model_cls, model_name):
    def objective(trial):
        alpha = trial.suggest_float('alpha', 0.01, 100.0, log=True)
        model = model_cls(alpha=alpha)

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        score = cross_val_score(pipeline, X_train, y_train, cv=3,
                                scoring='neg_mean_squared_error', n_jobs=-1)
        return -np.mean(score)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20)
    print(f"Лучший alpha для {model_name}: {study.best_params['alpha']:.4f}")
    return model_cls(alpha=study.best_params['alpha'])

best_ridge = optimize_model(Ridge, "Ridge")
best_lasso = optimize_model(Lasso, "Lasso")

for name, model in {'Ridge': best_ridge, 'Lasso': best_lasso}.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print(f"\n{name.upper()}:")
    print(f"Alpha: {model.alpha:.4f}")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"R²: {r2_score(y_test, y_pred):.4f}")

[I 2025-04-13 17:01:48,486] A new study created in memory with name: no-name-69b5dd38-f57f-48f9-9ce0-13d61ca721a2
[I 2025-04-13 17:01:59,443] Trial 0 finished with value: 3.153130353460233 and parameters: {'alpha': 2.4002242720248717}. Best is trial 0 with value: 3.153130353460233.
[I 2025-04-13 17:02:09,421] Trial 1 finished with value: 3.152040482809913 and parameters: {'alpha': 0.044540896319044725}. Best is trial 1 with value: 3.152040482809913.
[I 2025-04-13 17:02:19,844] Trial 2 finished with value: 3.1538899927575357 and parameters: {'alpha': 0.7164980703291711}. Best is trial 1 with value: 3.152040482809913.
[I 2025-04-13 17:02:29,787] Trial 3 finished with value: 3.152910365740619 and parameters: {'alpha': 0.21770739405327977}. Best is trial 1 with value: 3.152040482809913.
[I 2025-04-13 17:02:39,832] Trial 4 finished with value: 3.1507242973805547 and parameters: {'alpha': 0.2846387993584998}. Best is trial 4 with value: 3.1507242973805547.
[I 2025-04-13 17:02:50,336] Trial 5

Лучший alpha для Ridge: 0.0518


[I 2025-04-13 17:05:30,281] Trial 0 finished with value: 3.8051514567721276 and parameters: {'alpha': 16.906932580322067}. Best is trial 0 with value: 3.8051514567721276.
[I 2025-04-13 17:05:40,947] Trial 1 finished with value: 3.7035506215793177 and parameters: {'alpha': 0.013220525409110975}. Best is trial 1 with value: 3.7035506215793177.
[I 2025-04-13 17:05:51,558] Trial 2 finished with value: 3.6876650568176355 and parameters: {'alpha': 0.01069361240178918}. Best is trial 2 with value: 3.6876650568176355.
[I 2025-04-13 17:06:02,843] Trial 3 finished with value: 3.78471846107905 and parameters: {'alpha': 0.09611150022055613}. Best is trial 2 with value: 3.6876650568176355.
[I 2025-04-13 17:06:13,396] Trial 4 finished with value: 3.7847206259132626 and parameters: {'alpha': 0.1981763522988997}. Best is trial 2 with value: 3.6876650568176355.
[I 2025-04-13 17:06:24,612] Trial 5 finished with value: 3.812189485032506 and parameters: {'alpha': 35.79447008099102}. Best is trial 2 with v

Лучший alpha для Lasso: 0.0101

RIDGE:
Alpha: 0.0518
MSE: 3.1574
MAE: 1.3539
R²: 0.1807

LASSO:
Alpha: 0.0101
MSE: 3.7101
MAE: 1.5152
R²: 0.0372
