In [98]:
import pandas as pd
import numpy as np
import re
import random
import emoji

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import optuna
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [99]:
df = pd.read_csv("lamoda_reviews.csv")
df = df.sample(n=100000, random_state=42)
print(df.head())
df['Review'] = df['Review'].fillna('').astype(str)  # Убедимся, что в столбце строки
df['combined_text'] = (
    df["Name"].fillna('') + ' ' +
    df['Description'].fillna('') + ' ' +
    df['Review'].fillna('')
)
df['combined_text'] = df['combined_text'].astype(str).apply(preprocess) 


                                                    Name  \
27286                                               Боди   
77907                      Брюки спортивные TIRO23 P PNT   
76962                                           Кардиган   
77131                                              Сумка   
70008  Гель для бровей фиксирующий, c эффектом ламини...   

                                             Description  \
27286  Обратите внимание: Эффект Double push-up боди ...   
77907  Футбольные брюки adidas изготовлены из эластич...   
76962  Данный товар является частью проекта Lamoda pl...   
77131  Сумка выполнена из натуральной кожи. Детали: о...   
70008  Гель фиксатор для бровей, красота и уход для ж...   

                                                  Review  Rating  
27286              Очень неудобно надевать, но красивое.       5  
77907                                   Хорошее качество       5  
76962  Красивый кардиган, плетение по всей длине, и н...       5  
77131     

In [100]:
def process_emojis(text):
    text = emoji.demojize(text, delimiters=(" ", " "))
    smiley_dict = {
        ":)" : " happy ", ":(" : " sad ", ":D" : " laugh ",
        ";)" : " wink ", ":P" : " playfull ", ":O" : " surprised "
    }
    for k, v in smiley_dict.items():
        text = text.replace(k, v)
    return text

def preprocess(text):
    text = process_emojis(str(text))
    text = re.sub(r'[^а-яА-Яa-zA-Z0-9\s]', '', text)
    return text.lower()


def create_meta_features(df):
    df['text_length'] = df['combined_text'].apply(len)
    df['emoji_count'] = df['Review'].apply(lambda x: len(emoji.emoji_list(str(x))))
    df['sentiment'] = df['combined_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    return df[['text_length', 'emoji_count', 'sentiment']]


text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000, max_df=0.95, min_df=3)),
    ('svd', TruncatedSVD(n_components=300))
])

preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'combined_text'),
    ('meta', FunctionTransformer(create_meta_features), ['combined_text', 'Review'])
])

models = {
    'ridge': Ridge(alpha=10),
    'xgboost': XGBRegressor(),
    'random_forest': RandomForestRegressor(),
    'mlp': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=300),
    'ensemble': VotingRegressor([
        ('ridge', Ridge(alpha=10)),
        ('xgboost', XGBRegressor())
    ])
}

def objective(trial):
    params = {
        'preprocessor__text__tfidf__max_features': trial.suggest_int('preprocessor__text__tfidf__max_features', 1000, 8000),
        'preprocessor__text__tfidf__ngram_range': trial.suggest_categorical('preprocessor__text__tfidf__ngram_range', [(1,1), (1,2), (1,3)]),
        'alpha': trial.suggest_float('alpha', 0.01, 100.0, log=True),
    }

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', Ridge(alpha=params['alpha']))
    ])
    del params['alpha']
    pipeline.set_params(**params)

    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    return -np.mean(scores)

if __name__ == "__main__":
    if df['Rating'].between(1, 5).all():
        df['Rating'] *= 2

    X = df[['combined_text', 'Review']]
    y = df['Rating']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20)

    best_params = study.best_params
    tfidf_params = {k: v for k, v in best_params.items() if k.startswith('preprocessor__')}
    alpha = best_params['alpha']

    best_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', Ridge(alpha=alpha))
    ])
    best_pipeline.set_params(**tfidf_params)

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_test)

    print("Ridge с Optuna")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"R²: {r2_score(y_test, y_pred):.4f}")

    print("Сравнение других моделей:")
    for name, model in models.items():
        pipe = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        print(f"\n[{name.upper()}]")
        print(f"MSE: {mean_squared_error(y_test, preds):.4f}")
        print(f"MAE: {mean_absolute_error(y_test, preds):.4f}")
        print(f"R²: {r2_score(y_test, preds):.4f}")




[I 2025-04-12 01:56:17,107] A new study created in memory with name: no-name-c373281c-aedd-4186-80d2-3388a25e7a52
Python(43470) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(43471) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(43472) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(43473) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(43474) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(43475) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(43476) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(43477) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
[I 2025-04-12 01:56:33,861] Trial 0 finished with value: 1.397290560646417 and parameters: {'preproces

Ridge с Optuna
MSE: 1.3862
MAE: 0.6377
R²: 0.1977
Сравнение других моделей:

[RIDGE]
MSE: 1.3974
MAE: 0.6300
R²: 0.1912

[XGBOOST]
MSE: 1.2535
MAE: 0.5807
R²: 0.2745

[RANDOM_FOREST]
MSE: 1.3029
MAE: 0.5736
R²: 0.2459

[MLP]
MSE: 1.2501
MAE: 0.5358
R²: 0.2764

[ENSEMBLE]
MSE: 1.2634
MAE: 0.5795
R²: 0.2688
