In [8]:
# find the best param
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer, mean_squared_error

df = pd.read_csv("/Users/emmali/PyCharmMiscProject/final_many_point_UTF-8.csv")

feature_columns = [
    "checkin_count", "BusinessAcceptsCreditCards", "WiFi", "HasTV",
    "BikeParking", "OutdoorSeating", "RestaurantsPriceRange2", "BusinessParking",
    "RestaurantsReservations", "Alcohol", "GoodForKids", "text_count",
    "avg_sentiment", "RestaurantsTakeOut", "taste_sentiment",
    "service_sentiment", "ambience_sentiment"
]

X = df[feature_columns]
y = df["stars"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

xgb = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)

param_dist = {
    "n_estimators": [300, 500, 800, 1200],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 5, 7, 9],
    "min_child_weight": [1, 3, 5, 10],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_alpha": [0, 0.1, 0.5, 1],
    "reg_lambda": [1, 2, 5, 10]
}

rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring=rmse_scorer,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("Best parameters found: ", random_search.best_params_)
print("Best CV RMSE: ", -random_search.best_score_)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, min_child_weight=5, n_estimators=500, reg_alpha=0, reg_lambda=2, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, min_child_weight=5, n_estimators=500, reg_alpha=0, reg_lambda=2, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, min_child_weight=5, n_estimators=500, reg_alpha=0, reg_lambda=2, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, min_child_weight=5, n_estimators=500, reg_alpha=0, reg_lambda=2, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, min_child_weight=5, n_estimators=500, reg_alpha=0, reg_lambda=2, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=1, reg_lamb

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv("/Users/emmali/PyCharmMiscProject/final_real.csv")

feature_columns = [
    "checkin_count", "BusinessAcceptsCreditCards", "WiFi", "HasTV",
    "BikeParking", "OutdoorSeating", "RestaurantsPriceRange2", "BusinessParking",
    "RestaurantsReservations", "Alcohol", "GoodForKids", "text_count",
    "avg_sentiment", "RestaurantsTakeOut", "taste_sentiment",
    "service_sentiment", "ambience_sentiment","local_cuisine_preference_score"
]

X = df[feature_columns].values
y = df["stars"].values

xgb_model = XGBRegressor(
    subsample=0.6,
    reg_lambda=10,
    reg_alpha=1,
    n_estimators=1200,
    min_child_weight=1,
    max_depth=3,
    learning_rate=0.01,
    colsample_bytree=0.6,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)

ridge_model = Ridge(alpha=1.0, random_state=42)  # 一级 Ridge
meta_model = Ridge(alpha=1.0, random_state=42)  # 二级 Ridge

kf = KFold(n_splits=10, shuffle=True, random_state=42)

rmse_scores, r2_scores, acc_scores = [], [], []

for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    xgb_model.fit(X_train, y_train)
    ridge_model.fit(X_train, y_train)

    train_preds = np.vstack([
        xgb_model.predict(X_train),
        ridge_model.predict(X_train)
    ]).T
    test_preds = np.vstack([
        xgb_model.predict(X_test),
        ridge_model.predict(X_test)
    ]).T

    meta_model.fit(train_preds, y_train)
    final_preds = meta_model.predict(test_preds)

    rmse = np.sqrt(mean_squared_error(y_test, final_preds))
    r2 = r2_score(y_test, final_preds)
    acc = np.mean(np.abs(final_preds - y_test) <= 0.5)

    rmse_scores.append(rmse)
    r2_scores.append(r2)
    acc_scores.append(acc)

    print(f"Fold {fold+1}: RMSE={rmse:.4f}, R²={r2:.4f}, ACC={acc:.4%}")

print("\nOverall Performance (10-Fold CV):")
print(f"Average RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"Average R²: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")
print(f"Average ACC: {np.mean(acc_scores):.4%} ± {np.std(acc_scores):.4%}")


Fold 1: RMSE=0.4902, R²=0.6404, ACC=73.8589%
Fold 2: RMSE=0.5129, R²=0.6473, ACC=75.1037%
Fold 3: RMSE=0.4692, R²=0.6509, ACC=76.3485%
Fold 4: RMSE=0.4593, R²=0.6601, ACC=76.3485%
Fold 5: RMSE=0.5121, R²=0.5861, ACC=74.0664%
Fold 6: RMSE=0.4816, R²=0.6222, ACC=75.9336%
Fold 7: RMSE=0.4476, R²=0.6683, ACC=78.6307%
Fold 8: RMSE=0.4657, R²=0.6542, ACC=72.9730%
Fold 9: RMSE=0.4811, R²=0.6120, ACC=75.8836%
Fold 10: RMSE=0.4777, R²=0.6316, ACC=74.2204%

Overall Performance (10-Fold CV):
Average RMSE: 0.4797 ± 0.0201
Average R²: 0.6373 ± 0.0236
Average ACC: 75.3367% ± 1.5569%


import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge

df = pd.read_csv("/Users/emmali/PyCharmMiscProject/final_many_point_UTF-8.csv")

feature_columns = [
    "checkin_count", "BusinessAcceptsCreditCards", "WiFi","HasTV","BikeParking","OutdoorSeating",
    "RestaurantsPriceRange2","BusinessParking","RestaurantsReservations",
    "Alcohol", "GoodForKids", "text_count", "avg_sentiment","RestaurantsTakeOut",
    "taste_sentiment","service_sentiment","ambience_sentiment","local_cuisine_preference_score"
]

X = df[feature_columns]
y = df["stars"]

xgb_model = XGBRegressor(
    subsample=0.6,
    reg_lambda=10,
    reg_alpha=1,
    n_estimators=1200,
    min_child_weight=1,
    max_depth=3,
    learning_rate=0.01,
    colsample_bytree=0.6,
    random_state=42
)

meta_model = Ridge(alpha=1.0)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X))

for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    xgb_model.fit(X_train, y_train)
    val_preds = xgb_model.predict(X_val)

    meta_model.fit(val_preds.reshape(-1, 1), y_val)
    final_val_preds = meta_model.predict(val_preds.reshape(-1, 1))

    oof_preds[val_idx] = final_val_preds
    test_preds[val_idx] = final_val_preds

rmse = np.sqrt(mean_squared_error(y, oof_preds))
r2 = r2_score(y, oof_preds)
acc = np.mean(np.abs(oof_preds - y) <= 0.5)

print("model on train data over all performance (10-Fold CV):")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")
print(f"ACC (±0.5): {acc:.4f}")

xgb_model.fit(X, y)
meta_model.fit(xgb_model.predict(X).reshape(-1, 1), y)

final_preds = meta_model.predict(xgb_model.predict(X).reshape(-1, 1))

final_rmse = np.sqrt(mean_squared_error(y, final_preds))
final_r2 = r2_score(y, final_preds)
final_acc = np.mean(np.abs(final_preds - y) <= 0.5)

print("\nmodel on test data:")
print(f"Final RMSE: {final_rmse:.4f}")
print(f"Final R²: {final_r2:.4f}")
print(f"Final ACC (±0.5): {final_acc:.4f}")
