In [72]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    ConfusionMatrixDisplay
)

import gdown



In [None]:
df = pd.read_csv("lamoda_reviews.csv")
df = df.sample(n=100000, random_state=42)
print(df.head())
df = df.sample(n=10000, random_state=42)
df['Review'] = df['Review'].fillna('').astype(str)  # Убедимся, что в столбце строки
df['combined_text'] = (
    df["Good's name"].fillna('') + ' ' +
    df['Description'].fillna('') + ' ' +
    df['Review'].fillna('')
)
df['combined_text'] = df['combined_text'].astype(str).apply(preprocess) 


                                                    Name  \
27286                                               Боди   
77907                      Брюки спортивные TIRO23 P PNT   
76962                                           Кардиган   
77131                                              Сумка   
70008  Гель для бровей фиксирующий, c эффектом ламини...   

                                             Description  \
27286  Обратите внимание: Эффект Double push-up боди ...   
77907  Футбольные брюки adidas изготовлены из эластич...   
76962  Данный товар является частью проекта Lamoda pl...   
77131  Сумка выполнена из натуральной кожи. Детали: о...   
70008  Гель фиксатор для бровей, красота и уход для ж...   

                                                  Review  Rating  
27286              Очень неудобно надевать, но красивое.       5  
77907                                   Хорошее качество       5  
76962  Красивый кардиган, плетение по всей длине, и н...       5  
77131     

In [None]:
import pandas as pd
import numpy as np
import re
import emoji
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna
import joblib

# Обработка эмодзи и смайликов
def process_emojis(text):
    text = emoji.demojize(text, delimiters=(" ", " "))
    smiley_dict = {
        ":)" : " happy ", ":(" : " sad ", ":D" : " laugh ",
        ";)" : " wink ", ":P" : " playful ", ":O" : " surprised "
    }
    for k, v in smiley_dict.items():
        text = text.replace(k, v)
    return text

# Простые стоп-слова
BASIC_STOPWORDS = {
    "и", "в", "во", "не", "что", "он", "на", "я", "с", "со", "как", "а", "то",
    "все", "она", "так", "его", "но", "да", "ты", "к", "у", "же", "вы", "за", "бы", "по", "только",
    "ее", "мне", "было", "вот", "от", "меня", "еще", "нет", "о", "из", "ему", "теперь", "когда",
    "даже", "ну", "вдруг", "ли", "если", "уже", "или", "ни", "быть", "был", "него", "до", "вас",
    "нибудь", "опять", "уж", "вам", "ведь", "там", "потом", "себя", "ничего", "ей", "может", "они",
    "тут", "где", "есть", "надо", "ней", "для", "мы", "тебя", "их", "чем", "была", "сам", "чтоб",
    "без", "будто", "чего", "раз", "тоже", "себе", "под", "будет", "ж", "тогда", "кто", "этот",
    "того", "потому", "этого", "какой", "совсем", "ним", "здесь", "этом", "один", "почти", "мой",
    "тем", "чтобы", "нее", "сейчас", "были", "куда", "зачем", "всех", "никогда", "можно", "при",
    "наконец", "два", "об", "другой", "хоть", "после", "над", "больше", "тот", "через", "эти",
    "нас", "про", "всего", "них", "какая", "много", "разве", "три", "эту", "моя", "впрочем",
    "хорошо", "свою", "этой", "перед", "иногда", "лучше", "чуть", "том", "нельзя", "такой", "им",
    "более", "всегда", "конечно", "всю", "между",
    "the", "and", "is", "in", "to", "of", "it", "that", "on", "for", "with",
    "as", "this", "was", "but", "are", "not", "have", "be", "at", "or", "by"
}

# Препроцессинг текста
def enhanced_preprocess(text):
    text = process_emojis(text)
    text = re.sub(r'[^a-zA-Zа-яА-ЯёЁ\s]', '', text)
    tokens = text.lower().split()
    tokens = [word for word in tokens if word not in BASIC_STOPWORDS]
    return ' '.join(tokens)

# Мета-признаки
def create_meta_features(df):
    df['text_length'] = df['combined_text'].apply(len)
    df['emoji_count'] = df['Review'].apply(lambda x: len(emoji.emoji_list(x)))
    df['sentiment'] = df['combined_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    return df[['text_length', 'emoji_count', 'sentiment']]

# Препроцессор
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000, max_df=0.95, min_df=3)),
    ('svd', TruncatedSVD(n_components=300))
])

preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'combined_text'),
    ('meta', FunctionTransformer(create_meta_features), ['combined_text', 'Review'])
])

# Все модели
models = {
    'ridge': Ridge(alpha=10),
    'xgboost': XGBRegressor(),
    'random_forest': RandomForestRegressor(),
    'mlp': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500),
    'ensemble': VotingRegressor([
        ('ridge', Ridge(alpha=10)),
        ('xgboost', XGBRegressor())
    ])
}

# Optuna
def objective(trial):
    params = {
        'preprocessor__text__tfidf__max_features': trial.suggest_int('tfidf_max_features', 1000, 10000),
        'preprocessor__text__tfidf__ngram_range': trial.suggest_categorical('ngram_range', [(1,1), (1,2), (1,3)]),
        'model__alpha': trial.suggest_float('alpha', 0.01, 100, log=True)
    }
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', Ridge())
    ]).set_params(**params)
    
    return -np.mean(cross_val_score(
        pipeline, X_train, y_train, 
        cv=3, scoring='neg_mean_squared_error', 
        n_jobs=-1, error_score='raise'
    ))

# Основной блок
if __name__ == "__main__":
    if df['Rating'].between(1, 5).all():
        df['Rating'] *= 2

    X = df[['combined_text', 'Review']]
    y = df['Rating']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Тюнинг Ridge через Optuna
    print("🔍 Подбор параметров для Ridge через Optuna...")
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20, n_jobs=1)

    best_params = study.best_params
    tfidf_params = {
        k: v for k, v in best_params.items() if k.startswith('preprocessor__')
    }
    alpha = best_params['alpha']

    best_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', Ridge(alpha=alpha))
    ])
    best_pipeline.set_params(**tfidf_params)

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_test)

    print("\n📌 [Ridge с тюнингом Optuna]")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"R²: {r2_score(y_test, y_pred):.4f}")

    joblib.dump(best_pipeline, 'best_model.pkl')

    print("\n🚀 Сравнение всех остальных моделей:")
    results = []
    for name, model in models.items():
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)
        mse = mean_squared_error(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        r2 = r2_score(y_test, preds)
        results.append((name, mse, mae, r2))
        print(f"[{name.upper()}] MSE={mse:.4f} | MAE={mae:.4f} | R²={r2:.4f}")

    best_model = sorted(results, key=lambda x: x[0])[0]

    # WordCloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(df['combined_text']))
    plt.figure(figsize=(15, 7))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("🔤 WordCloud по обработанным отзывам", fontsize=16)
    plt.show()

    # Пример предсказания
    sample_text = ["Продукт отличный! Всё работает как надо 😊"]
    sample_df = pd.DataFrame({'combined_text': sample_text, 'Review': sample_text})
    print(f"\n🔮 Sample prediction: {best_pipeline.predict(sample_df)[0]:.1f}/10")


[I 2025-04-12 01:00:34,918] A new study created in memory with name: no-name-48f378ff-8f78-4ff3-bf90-8652b4c734f8


🔍 Подбор параметров для Ridge через Optuna...


Python(42434) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42435) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42436) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42437) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42438) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42439) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42440) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42441) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
[W 2025-04-12 01:00:47,911] Trial 0 failed with parameters: {'tfidf_max_features': 2579, 'ngram_range': (1, 3), 'alpha': 9.682849157683153} because of the following error: TypeError("object of type 'float' has no len

TypeError: object of type 'float' has no len()

А ТЕПЕРЬ ВСЕ БЕЗ СТОП СЛОВ

In [None]:
import pandas as pd
import numpy as np
import re
import random
import emoji

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import optuna
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# 🧹 Обработка эмодзи и текста
def process_emojis(text):
    text = emoji.demojize(text, delimiters=(" ", " "))
    smiley_dict = {
        ":)" : " happy ", ":(" : " sad ", ":D" : " laugh ",
        ";)" : " wink ", ":P" : " playfull ", ":O" : " surprised "
    }
    for k, v in smiley_dict.items():
        text = text.replace(k, v)
    return text

def preprocess(text):
    text = process_emojis(str(text))
    text = re.sub(r'[^а-яА-Яa-zA-Z0-9\s]', '', text)
    return text.lower()

# 📊 Мета-признаки
def create_meta_features(df):
    df['text_length'] = df['combined_text'].apply(len)
    df['emoji_count'] = df['Review'].apply(lambda x: len(emoji.emoji_list(str(x))))
    df['sentiment'] = df['combined_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    return df[['text_length', 'emoji_count', 'sentiment']]

# 📚 Основной pipeline
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000, max_df=0.95, min_df=3)),
    ('svd', TruncatedSVD(n_components=300))
])

preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'combined_text'),
    ('meta', FunctionTransformer(create_meta_features), ['combined_text', 'Review'])
])

models = {
    'ridge': Ridge(alpha=10),
    'xgboost': XGBRegressor(),
    'random_forest': RandomForestRegressor(),
    'mlp': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=300),
    'ensemble': VotingRegressor([
        ('ridge', Ridge(alpha=10)),
        ('xgboost', XGBRegressor())
    ])
}

# 🔍 Optuna: подбор параметров
def objective(trial):
    params = {
        'preprocessor__text__tfidf__max_features': trial.suggest_int('preprocessor__text__tfidf__max_features', 1000, 8000),
        'preprocessor__text__tfidf__ngram_range': trial.suggest_categorical('preprocessor__text__tfidf__ngram_range', [(1,1), (1,2), (1,3)]),
        'alpha': trial.suggest_float('alpha', 0.01, 100.0, log=True),
    }

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', Ridge(alpha=params['alpha']))
    ])
    del params['alpha']
    pipeline.set_params(**params)

    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    return -np.mean(scores)

# 🚀 Главный запуск
if __name__ == "__main__":
    if df['Rating'].between(1, 5).all():
        df['Rating'] *= 2

    X = df[['combined_text', 'Review']]
    y = df['Rating']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 🔬 Optuna
    print("🔍 Подбор параметров через Optuna...")
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20)

    best_params = study.best_params
    tfidf_params = {k: v for k, v in best_params.items() if k.startswith('preprocessor__')}
    alpha = best_params['alpha']

    best_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', Ridge(alpha=alpha))
    ])
    best_pipeline.set_params(**tfidf_params)

    best_pipeline.fit(X_train, y_train)
    y_pred = best_pipeline.predict(X_test)

    print("\n📌 [Ridge с Optuna]")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"R²: {r2_score(y_test, y_pred):.4f}")

    # 📈 Промежуточные результаты по остальным моделям
    print("\n📊 Сравнение других моделей:")
    for name, model in models.items():
        pipe = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        print(f"\n[{name.upper()}]")
        print(f"MSE: {mean_squared_error(y_test, preds):.4f}")
        print(f"MAE: {mean_absolute_error(y_test, preds):.4f}")
        print(f"R²: {r2_score(y_test, preds):.4f}")

    # ☁️ WordCloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(df['combined_text']))
    plt.figure(figsize=(14, 6))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title("📝 WordCloud по текстам отзывов")
    plt.show()

    # 🔮 Предсказание для примера
    sample = ["Товар отличный, качество супер! 😊"]
    sample_df = pd.DataFrame({'combined_text': sample, 'Review': sample})
    prediction = best_pipeline.predict(sample_df)[0]
    print(f"\n📦 Пример предсказания: {prediction:.1f}/10")


[I 2025-04-12 00:55:42,232] A new study created in memory with name: no-name-1e581f62-2bd6-48c5-8b8e-553145b646c9


🔍 Подбор параметров через Optuna...


Python(42299) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42300) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42301) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42302) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(42303) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
[I 2025-04-12 00:55:50,819] Trial 0 finished with value: 4.498937330044945 and parameters: {'preprocessor__text__tfidf__max_features': 7199, 'preprocessor__text__tfidf__ngram_range': (1, 2), 'alpha': 0.4133040146232739}. Best is trial 0 with value: 4.498937330044945.
Python(42304) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
[I 2025-04-12 00:55:58,339] Trial 1 finished with value: 4.501143791345321 and parameters: {'preprocessor__text__tfidf__max_features': 6648, 'pr

KeyboardInterrupt: 