# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_squared_error
import joblib
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt


# --- Load the review table and build the feature matrix ---------------------
data_path = r"C:\Users\Lenovo\Desktop\Yelp Dataset\最终数据1.xlsx"
df = pd.read_excel(data_path)

text_col = 'review_text'
target_col = 'stars'

model_name = 'all-MiniLM-L6-v2'  # small, fast, decent accuracy
sbert_model = SentenceTransformer(model_name)

# Missing reviews become empty strings; astype(str) makes sure every item
# handed to encode() is a str even if Excel parsed a cell as a number or date.
sentences = df[text_col].fillna("").astype(str).tolist()
X_text = sbert_model.encode(sentences, show_progress_bar=True)

# Combine with numeric features if they exist.
# Only numeric/bool columns are selected: taking "every other column" would
# also pull in string or datetime columns, which cannot be stacked with the
# embeddings or standardized later.
excluded_cols = {text_col, target_col, 'business_id'}
numeric_cols = [
    c for c in df.select_dtypes(include=['number', 'bool']).columns
    if c not in excluded_cols
]
if numeric_cols:
    X_numeric = df[numeric_cols].to_numpy(dtype=float)
else:
    # Zero-width placeholder so the hstack below works without extra features.
    X_numeric = np.empty((df.shape[0], 0))

# Final design matrix: sentence embeddings followed by the numeric columns.
X = np.hstack([X_text, X_numeric])
y = df[target_col].values

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

# Standardize features using statistics learned from the training rows only,
# then apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# --- 10-fold cross-validated training of the two base learners --------------
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Out-of-fold predictions: each training row is predicted by the fold model
# that did NOT see it. Despite its name, `mlp_lgb_oof` holds the LightGBM
# out-of-fold predictions.
mlp_oof = np.zeros(X_train_scaled.shape[0])
mlp_lgb_oof = np.zeros(X_train_scaled.shape[0])

mlp_models = []
lgb_models = []

# LightGBM hyper-parameters are identical for every fold, so build them once.
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': 6,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'verbose': -1,
    'n_jobs': -1,
    'random_state': 42
}

for train_idx, val_idx in kf.split(X_train_scaled):
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # Base learner 1: MLP with its own internal early stopping.
    mlp = MLPRegressor(
        hidden_layer_sizes=(128, 64),
        activation='relu',
        solver='adam',
        max_iter=100,
        early_stopping=True,
        n_iter_no_change=10,
        random_state=42
    )
    mlp.fit(X_tr, y_tr)
    mlp_oof[val_idx] = mlp.predict(X_val)
    mlp_models.append(mlp)

    # Base learner 2: LightGBM, early-stopped on the fold's validation rows.
    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # Early stopping and log silencing are passed as callbacks: the former
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
    # lgb.train() were removed in LightGBM 4.0 and raise TypeError there.
    lgb_model = lgb.train(
        lgb_params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )
    mlp_lgb_oof[val_idx] = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
    lgb_models.append(lgb_model)

# MLP only
# The fold models are used exactly as trained during cross-validation.
# Refitting each of them on the full training set with the same random_state
# would produce ten identical networks, so the average below would no longer
# be an ensemble (and would cost ten extra training runs).
mlp_pred_test = np.mean([m.predict(X_test_scaled) for m in mlp_models], axis=0)

# MLP + LightGBM stacking
# The meta-learner is fitted on the out-of-fold predictions collected in the
# CV loop. Training it on in-sample predictions would leak the targets: base
# models look far more accurate on rows they were trained on than they are on
# unseen data, which skews the blending weights.
stack_X_train = np.column_stack([mlp_oof, mlp_lgb_oof])

meta_model = Ridge(alpha=1.0)
meta_model.fit(stack_X_train, y_train)

# Test-time meta-features: average each base learner over its fold models.
mlp_test_pred = mlp_pred_test  # same quantity as the MLP-only prediction
lgb_test_pred = np.mean(
    [
        booster.predict(X_test_scaled, num_iteration=booster.best_iteration)
        for booster in lgb_models
    ],
    axis=0,
)
stack_X_test = np.column_stack([mlp_test_pred, lgb_test_pred])

stack_pred_test = meta_model.predict(stack_X_test)

def evaluate(y_true, y_pred):
    """Score regression predictions against the true targets.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        A ``(r2, rmse, acc)`` tuple where ``r2`` is the coefficient of
        determination, ``rmse`` the root mean squared error, and ``acc`` the
        fraction of predictions whose absolute error is at most 0.5.
    """
    # Coerce to plain float arrays so lists work and pandas Series are
    # compared positionally instead of being aligned on their index.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    # Flatten before subtracting so an (n, 1) column and an (n,) vector are
    # compared element by element rather than broadcast to an (n, n) matrix.
    acc = np.mean(np.abs(y_true.ravel() - y_pred.ravel()) <= 0.5)
    return r2, rmse, acc

# Score both predictors on the held-out test set.
mlp_metrics = evaluate(y_test, mlp_pred_test)
stack_metrics = evaluate(y_test, stack_pred_test)
mlp_r2, mlp_rmse, mlp_acc = mlp_metrics
stack_r2, stack_rmse, stack_acc = stack_metrics

# Emit the report: a header followed by one line per model.
report_lines = [
    "=== Test Set Performance ===",
    f"MLP Only -> R²: {mlp_r2:.4f}, RMSE: {mlp_rmse:.4f}, Accuracy±0.5: {mlp_acc:.4f}",
    f"MLP + LightGBM Stacking -> R²: {stack_r2:.4f}, RMSE: {stack_rmse:.4f}, Accuracy±0.5: {stack_acc:.4f}",
]
print("\n".join(report_lines))

# Figure 1: residuals of the stacking model against the true rating.
# Points are coloured by the true rating; the dashed line marks zero error.
residuals = y_test - stack_pred_test
plt.figure(figsize=(8, 6))
plt.scatter(y_test, residuals, c=y_test, cmap='RdBu_r', alpha=0.6)
plt.axhline(0, color='black', linestyle='--')
plt.title("Residuals vs Actual Stars (Stacking Model)")
plt.xlabel("Actual Stars")
plt.ylabel("Residuals (Actual - Predicted)")
plt.colorbar(label="Stars")
plt.show()

# Figure 2: predicted vs. true rating for the stacking model.
# The red dashed diagonal is the line of perfect prediction.
star_min, star_max = y_test.min(), y_test.max()
plt.figure(figsize=(8, 6))
plt.scatter(y_test, stack_pred_test, c=y_test, cmap='viridis', alpha=0.6)
plt.plot([star_min, star_max], [star_min, star_max], 'r--', linewidth=2)
plt.title("Actual vs Predicted Stars (Stacking Model)")
plt.xlabel("Actual Stars")
plt.ylabel("Predicted Stars")
plt.colorbar(label="Stars")
plt.show()

# Bundle everything needed to reproduce predictions into a single pickle.
# Only the SBERT model *name* is stored, not the encoder itself.
model_bundle = {
    'scaler': scaler,
    'sbert_model_name': model_name,
    'mlp_models': mlp_models,
    'lgb_models': lgb_models,
    'meta_model': meta_model,
}
bundle_path = r"C:\Users\Lenovo\Desktop\Yelp Dataset\stacking_model_bert.pkl"
joblib.dump(model_bundle, bundle_path)

print("All models trained and saved successfully!")