# Table of Contents
- Initializations
- EDA-Feature Engineering
  - Parsing and Calculating Helpfulness Ratio
  - Visualizing Binned Distributions of Helpfulness Metrics
  - Boxplot of Ratings by Helpfulness Ratio Bins
  - Text Preprocessing for Review and Summary Fields
  - Sentiment Analysis on Reviews Using VADER
  - Sentiment Distribution of Reviews
  - Most Common Words in Positive, Negative, and Neutral Reviews
- Model Development
  - Loading Pretrained Sentence Transformer Model for Embedding
  - Calculating Laplace-Smoothing for Helpfulness Ratio
  - Splitting Data
  - Hyperparameter Configuration for Various Machine Learning Models
  - Objective Function for Hyperparameter Tuning Using Optuna
  - Model Evaluation and Saving
- Model Training
  - Train and evaluate SVR
  - Train and evaluate LightGBM
  - Train and evaluate XGBoost
  - Train and evaluate CatBoost
  - Train and evaluate Random Forest
- Predictions on Test Set
  - SVR Prediction
  - XGBoost Prediction
  - LightGBM Prediction
  - CatBoost Prediction
  - Random Forest Prediction







  

# Initializations

In [None]:
%pip install -q catboost
%pip install -q optuna
%pip install -q sentence-transformers
%pip install -q vaderSentiment

In [None]:
import joblib
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import string
import json
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler

from catboost import CatBoostRegressor

from sentence_transformers import SentenceTransformer

import ast
from collections import Counter
import string

import optuna

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Load the raw review dataset.
# NOTE(review): the path looks like a Colab Google Drive mount, but no mount
# call is visible in this notebook — confirm Drive is mounted before running.
DATA_PATH = "/content/drive/MyDrive/BERT_rating_prediction/book_rating_20k.csv"
df = pd.read_csv(DATA_PATH)
df.head(10)

# EDA - Feature Engineering

In [None]:
# check for nan
df.isnull().sum()

In [None]:
# check for duplicate samples
df.duplicated().sum()

In [None]:
df.dtypes

### Parsing and Calculating Helpfulness Ratio

In [None]:
def parse_helpful(value):
    """Parse a raw ``helpful`` entry into ``(helpful_votes, total_votes)``.

    Accepted inputs:
    - a string holding a two-element list literal, e.g. ``"[3, 5]"``;
    - a string of the form ``"3/5"``;
    - an already-parsed two-element list or tuple, e.g. ``[3, 5]``.

    Leading/trailing whitespace in strings is ignored.

    Returns:
        tuple: ``(helpful_votes, total_votes)``, or ``(0, 0)`` when the value
        is missing, malformed, or does not contain exactly two items.
    """
    # Already-parsed containers (e.g. when the column was not read from CSV).
    if isinstance(value, (list, tuple)):
        return (value[0], value[1]) if len(value) == 2 else (0, 0)

    # NaN, None and any other non-string input carry no vote information.
    if not isinstance(value, str):
        return 0, 0

    text = value.strip()

    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only the errors literal_eval raises for bad input are treated as
            # "unparseable"; KeyboardInterrupt etc. still propagate.
            return 0, 0
        if isinstance(parsed, list) and len(parsed) == 2:
            return parsed[0], parsed[1]
        return 0, 0

    if "/" in text:
        try:
            helpful_votes, total_votes = map(int, text.split("/"))
        except ValueError:
            # Non-integer parts, or not exactly two parts.
            return 0, 0
        return helpful_votes, total_votes

    return 0, 0

# Parse every raw `helpful` entry and spread the resulting pair over two columns.
vote_pairs = df['helpful'].apply(lambda raw: pd.Series(parse_helpful(raw)))
df[['helpful_votes', 'total_votes']] = vote_pairs

# Share of helpful votes, rounded to two decimals; rows without any vote get 0.
has_no_votes = df['total_votes'] == 0
raw_ratio = df['helpful_votes'] / df['total_votes']
df['helpfulness_ratio'] = np.where(has_no_votes, 0, np.round(raw_ratio, 2))

# The raw column is fully represented by the three derived ones.
df.drop(columns='helpful', inplace=True)

In [None]:
df.head()

### Visualizing Binned Distributions of Helpfulness Metrics

In [None]:
# Bin edges and labels shared by the two vote-count columns.
vote_edges = [-1, 0, 1, 3, 5, 10, 20, np.inf]
vote_labels = ['0', '1', '2-3', '4-5', '6-10', '11-20', '21+']

# One (source column, edges, labels) entry per panel, left to right.
panel_specs = [
    ('helpful_votes', vote_edges, vote_labels),
    ('total_votes', vote_edges, vote_labels),
    ('helpfulness_ratio',
     [-0.01, 0, 0.25, 0.5, 0.75, 1.0],
     ['0', '0.01–0.25', '0.26–0.5', '0.51–0.75', '0.76–1.0']),
]

# Attach the temporary "<column>_binned" helper columns.
binned_cols = []
for source_col, edges, labels in panel_specs:
    binned_col = f'{source_col}_binned'
    df[binned_col] = pd.cut(df[source_col], bins=edges, labels=labels)
    binned_cols.append(binned_col)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, binned_col in zip(axes, binned_cols):
    sns.countplot(x=binned_col, hue=binned_col, data=df, ax=ax, palette='hls')
    ax.set_title(binned_col)

plt.tight_layout()
plt.show()

# The binned columns were only needed for the plots.
df.drop(columns=binned_cols, inplace=True)

### Boxplot of Ratings by Helpfulness Ratio Bins

In [None]:
# Temporarily attach ratio bins, draw the boxplot, then remove the helper column.
# Note: with these edges the first interval is (0, 0.25], so rows whose ratio is
# exactly 0 fall outside every bin and do not appear in the plot.
ratio_edges = [0, 0.25, 0.5, 0.75, 1.0]
df['helpfulness_bin'] = pd.cut(df['helpfulness_ratio'], bins=ratio_edges)

ax = sns.boxplot(data=df, x='helpfulness_bin', y='rating')
ax.set_title("Rating Distribution by Helpfulness Ratio Bin")
plt.show()

df.drop(columns="helpfulness_bin", inplace=True)

In [None]:
# Remove the review timestamp column, then discard incomplete and repeated rows.
df = (
    df
    .drop(columns="reviewTime")
    .dropna()
    .drop_duplicates()
)

### Text Preprocessing for Review and Summary Fields

In [None]:
# Download NLTK's stop-word corpus and build the set of English stop words.
# Note: `text_preprocessing` below does not reference `stop_words`.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def text_preprocessing(text):
    """Normalise one raw review/summary string.

    Steps, in order:
    1. lowercase and trim;
    2. remove URLs and HTML tags — this must happen *before* punctuation is
       touched, otherwise ``://``, ``.``, ``<`` and ``>`` are already gone and
       the patterns can never match (domains and tag names would leak into
       the text as ordinary words);
    3. remove ``[bracketed]`` spans;
    4. replace every remaining non-word character with a space and delete
       underscores (the only punctuation ``\\W`` does not cover);
    5. remove every token that contains a digit;
    6. collapse runs of whitespace into single spaces.

    Parameters:
    - text: the raw string to clean.

    Returns:
    - The cleaned, lowercase string (possibly empty).
    """
    text = text.lower().strip()

    # URLs and tags become a space so the neighbouring words stay separated.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # A tag must start with a letter, so text such as "<3" is left alone.
    text = re.sub(r'</?[a-z][^<>]*>', ' ', text)

    text = re.sub(r'\[.*?\]', '', text)      # content inside square brackets
    text = re.sub(r'\W', ' ', text)          # punctuation, symbols, newlines
    text = text.replace('_', '')             # "_" counts as a word character
    text = re.sub(r'\w*\d\w*', '', text)     # tokens containing digits

    return re.sub(r'\s+', ' ', text).strip()


# Clean both text fields on a copy so the raw text in `df` stays untouched.
new_df = df.copy()
for text_col in ('reviewText', 'summary'):
    new_df[text_col] = new_df[text_col].apply(text_preprocessing)

### Sentiment Analysis on Reviews Using VADER

In [None]:
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(review):
    """Return VADER's polarity-score dict for a single review text."""
    return analyzer.polarity_scores(review)

# Score the ORIGINAL review text rather than the cleaned version: VADER's rules
# use punctuation, capitalisation and contractions such as "isn't", all of which
# the preprocessing step removes ("isn't good" becomes "isn t good").
# `new_df` was copied from `df`, so both frames share the same index and the
# scores align row by row.
new_df['vader_scores'] = df.loc[new_df.index, 'reviewText'].apply(get_sentiment)

# Keep only the overall ("compound") score for classification.
new_df['compound'] = new_df['vader_scores'].apply(lambda score_dict: score_dict['compound'])

def classify_sentiment(compound_score):
    """Turn a compound sentiment score into a label.

    Scores of 0.05 or more are 'Positive', scores of -0.05 or less are
    'Negative', and everything in between is 'Neutral'.
    """
    if compound_score <= -0.05:
        return 'Negative'
    if compound_score >= 0.05:
        return 'Positive'
    return 'Neutral'

# Label every review from its compound score, then discard the intermediate columns.
new_df['sentiment'] = new_df['compound'].map(classify_sentiment)
new_df.drop(columns=['vader_scores', 'compound'], inplace=True)

new_df.head(10)

### Sentiment Distribution of Reviews

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

sns.countplot(data=new_df, x='sentiment', palette='hls', ax=ax)

ax.set_title('Sentiment Distribution of Reviews', fontsize=14)
ax.set_xlabel('Sentiment', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Write each bar's count just above the bar.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'{bar_top}', (bar_centre, bar_top),
                ha='center', va='bottom', fontsize=11)

plt.show()

### Most Common Words in Positive, Negative, and Neutral Reviews

In [None]:
# Same stop-word setup as in the text-preprocessing cell (repeated here), plus
# the set of ASCII punctuation characters. `filter_tokens` reads both globals.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def filter_tokens(tokens):
    """Lowercase the tokens, skipping stop words and single punctuation characters.

    Relies on the module-level `stop_words` and `punctuation` sets.
    """
    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered in stop_words or token in punctuation:
            continue
        kept.append(lowered)
    return kept

def plot_most_common_words(common_words, sentiment, color):
    """Draw a horizontal bar chart of the most frequent words for one sentiment.

    Parameters:
    - common_words: list of ``(word, count)`` pairs, e.g. from ``Counter.most_common``.
    - sentiment: label used in the chart title (e.g. 'Positive').
    - color: name of the seaborn/matplotlib palette for the bars.

    Returns:
    - None. If `common_words` is empty (no review in that sentiment class),
      a short message is printed and nothing is plotted.
    """
    # zip(*[]) yields nothing, so unpacking into two names would raise ValueError.
    if not common_words:
        print(f'No words to plot for {sentiment} reviews.')
        return

    words, counts = zip(*common_words)

    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(counts), y=list(words), palette=color)

    plt.title(f'Most Common Words in {sentiment} Reviews', fontsize=16)
    plt.xlabel('Frequency', fontsize=14)
    plt.ylabel('Words', fontsize=14)
    plt.tight_layout()
    plt.show()

# Review texts grouped by their sentiment label.
positive_reviews = new_df.loc[new_df['sentiment'] == 'Positive', 'reviewText']
negative_reviews = new_df.loc[new_df['sentiment'] == 'Negative', 'reviewText']
neutral_reviews = new_df.loc[new_df['sentiment'] == 'Neutral', 'reviewText']

def get_filtered_words(text_series):
    """Flatten a Series of review strings into one list of filtered tokens.

    Missing entries are skipped; each review is split on whitespace and passed
    through `filter_tokens`.
    """
    return [
        word
        for review in text_series.dropna()
        for word in filter_tokens(review.split())
    ]

# Filtered token lists per sentiment class.
positive_words_list, negative_words_list, neutral_words_list = (
    get_filtered_words(reviews)
    for reviews in (positive_reviews, negative_reviews, neutral_reviews)
)

# The 20 most frequent words of each class.
positive_words, negative_words, neutral_words = (
    Counter(word_list).most_common(20)
    for word_list in (positive_words_list, negative_words_list, neutral_words_list)
)

for top_words, label, palette_name in (
    (positive_words, 'Positive', 'Greens'),
    (negative_words, 'Negative', 'Reds'),
    (neutral_words, 'Neutral', 'Blues'),
):
    plot_most_common_words(top_words, label, palette_name)

In [None]:
# The "sentiment" label was used for the EDA plots above; it is dropped here
# because keeping it as a feature could mislead the model.
new_df.drop("sentiment", axis=1, inplace=True)

# Model Development

### Loading Pretrained Sentence Transformer Model for Embedding

In [None]:
# vectorization
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

### Calculating Laplace-Smoothing for Helpfulness Ratio

In [None]:
# Smoothing strength: how strongly each review is pulled towards the global ratio.
α = 5

# Global helpful/total ratio over the whole frame. Zero-vote rows are replaced
# by NaN before summing, so they contribute nothing to the denominator.
total_helpful = new_df["helpful_votes"].sum()
total_cast = new_df["total_votes"].replace(0, np.nan).sum()
μ = total_helpful / total_cast

# Smoothed ratio: (helpful + α·μ) / (total + α).
smoothed_numerator = new_df["helpful_votes"] + α * μ
smoothed_denominator = new_df["total_votes"] + α
new_df["smoothed_ratio"] = smoothed_numerator / smoothed_denominator

# The raw vote columns are now summarised by `smoothed_ratio`.
new_df.drop(columns=["helpful_votes", "total_votes", "helpfulness_ratio"], inplace=True)

# One text field per review: cleaned body followed by the cleaned summary.
new_df['text_combined'] = new_df['reviewText'] + " " + new_df['summary']
new_df.drop(columns=["summary", "reviewText"], inplace=True)

In [None]:
new_df.head()

### Splitting Data

In [None]:
train_df, test_df = train_test_split(new_df, test_size=0.2, shuffle=True, random_state=42)

# The smoothed helpfulness of the training rows is kept as their sample weight;
# the column itself is removed from both splits.
weights = train_df["smoothed_ratio"].to_numpy()
train_df = train_df.drop(columns="smoothed_ratio")
test_df = test_df.drop(columns="smoothed_ratio")

### Hyperparameter Configuration for Various Machine Learning Models

In [None]:
class CFG:
    """Search spaces and fixed settings for hyperparameter tuning.

    In each dict, a tuple is a (low, high) range, a list is a set of
    categorical choices, and any other value is a fixed setting.
    """

    seed = 42

    # CatBoost
    cat_params = dict(
        depth=(4, 10),
        learning_rate=(0.01, 0.3),
        l2_leaf_reg=(1, 15),
        colsample_bylevel=(0.6, 1.0),
        min_data_in_leaf=(1, 30),
        grow_policy=['SymmetricTree', 'Depthwise', 'Lossguide'],
        bootstrap_type=['Bayesian', 'Bernoulli', 'MVS'],
        iterations=(800, 2000),
    )

    # XGBoost
    xgb_params = dict(
        enable_categorical=True,
        objective='reg:squarederror',
        verbosity=0,
        n_estimators=(1000, 10000),
        learning_rate=(1e-3, 0.1),
        max_depth=(3, 8),
        subsample=(0.5, 1.0),
        colsample_bytree=(0.1, 1.0),
        min_child_weight=(1, 100),
        reg_lambda=(1e-2, 100.0),
    )

    # LightGBM
    lgb_params = dict(
        objective='regression',
        metric='l2',
        learning_rate=(1e-3, 0.2),
        num_iterations=(100, 1000),
        num_leaves=(16, 512),
        max_depth=(3, 16),
        min_child_samples=(5, 50),
        reg_alpha=(1e-8, 10.0),
        reg_lambda=(1e-8, 10.0),
        extra_trees=[True, False],
        importance_type=['split', 'gain'],
        max_bin=(128, 512),
        verbose=-1,
    )

    # Support Vector Regression
    svr_params = dict(
        C=(1e-1, 10.0),
        epsilon=(1e-3, 10.0),
        gamma=['scale', 'auto'],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    )

    # Random Forest
    rf_params = dict(
        n_estimators=(100, 1000),
        max_depth=(10, 50),
        min_samples_split=(2, 32),
        min_samples_leaf=(1, 32),
        max_features=['sqrt', 'log2'],
        criterion=["squared_error"],
    )


### Objective Function for Hyperparameter Tuning Using Optuna

In [None]:
def _make_model_factory(trial, model_name):
    """Sample this trial's hyperparameters and return a zero-argument model factory.

    The hyperparameters are suggested exactly once per trial; calling the
    returned factory creates a fresh, unfitted model with those values, so
    every CV fold trains its own model instance.

    Raises:
    - ValueError: if `model_name` does not start with one of
      "CatBoost", "XGBoost", "LightGBM", "SVR" or "RF".
    """
    if model_name.startswith("CatBoost"):
        space = CFG.cat_params
        params = {
            "loss_function": 'RMSE',
            'depth': trial.suggest_int('depth', *space["depth"]),
            'learning_rate': trial.suggest_float('learning_rate', *space["learning_rate"], log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', *space["l2_leaf_reg"]),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', *space["colsample_bylevel"]),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', *space["min_data_in_leaf"]),
            'grow_policy': trial.suggest_categorical('grow_policy', space["grow_policy"]),
            'bootstrap_type': trial.suggest_categorical('bootstrap_type', space["bootstrap_type"]),
            'iterations': trial.suggest_int('iterations', *space["iterations"]),
        }
        # The features are a plain numeric matrix, so there are no categorical
        # columns to declare (the final CatBoost model also uses cat_features=[]).
        return lambda: CatBoostRegressor(
            **params, cat_features=[], random_seed=CFG.seed, verbose=200
        )

    if model_name.startswith("XGBoost"):
        space = CFG.xgb_params
        params = {
            'enable_categorical': space["enable_categorical"],
            'objective': space['objective'],
            'verbosity': space["verbosity"],
            'n_estimators': trial.suggest_int("n_estimators", *space["n_estimators"]),
            # suggest_loguniform is deprecated; suggest_float(log=True) is its replacement.
            'learning_rate': trial.suggest_float("learning_rate", *space["learning_rate"], log=True),
            'max_depth': trial.suggest_int("max_depth", *space["max_depth"]),
            'subsample': trial.suggest_float("subsample", *space["subsample"]),
            'colsample_bytree': trial.suggest_float("colsample_bytree", *space["colsample_bytree"]),
            'min_child_weight': trial.suggest_int("min_child_weight", *space["min_child_weight"]),
            'reg_lambda': trial.suggest_float("reg_lambda", *space["reg_lambda"], log=True),
        }
        return lambda: XGBRegressor(**params, random_state=CFG.seed)

    if model_name.startswith("LightGBM"):
        space = CFG.lgb_params
        params = {
            # Fixed settings come from CFG (as for XGBoost); this also applies the
            # configured verbose=-1 instead of a hard-coded, very chatty level.
            'objective': space['objective'],
            'metric': space['metric'],
            'learning_rate': trial.suggest_float('learning_rate', *space['learning_rate'], log=True),
            'num_iterations': trial.suggest_int('num_iterations', *space['num_iterations']),
            'num_leaves': trial.suggest_int('num_leaves', *space['num_leaves']),
            'max_depth': trial.suggest_int('max_depth', *space['max_depth']),
            'min_child_samples': trial.suggest_int('min_child_samples', *space['min_child_samples']),
            'reg_alpha': trial.suggest_float('reg_alpha', *space['reg_alpha'], log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', *space['reg_lambda'], log=True),
            'extra_trees': trial.suggest_categorical('extra_trees', space['extra_trees']),
            'importance_type': trial.suggest_categorical('importance_type', space['importance_type']),
            'max_bin': trial.suggest_int('max_bin', *space['max_bin']),
            'verbose': space['verbose'],
            'seed': CFG.seed,
        }
        return lambda: LGBMRegressor(**params)

    if model_name.startswith("SVR"):
        space = CFG.svr_params
        params = {
            'C': trial.suggest_float('C', *space['C']),
            'epsilon': trial.suggest_float('epsilon', *space['epsilon']),
            'gamma': trial.suggest_categorical('gamma', space['gamma']),
            'kernel': trial.suggest_categorical('kernel', space['kernel']),
        }
        return lambda: SVR(**params)

    if model_name.startswith("RF"):
        space = CFG.rf_params
        params = {
            "n_estimators": trial.suggest_int('n_estimators', *space["n_estimators"]),
            "max_depth": trial.suggest_int('max_depth', *space["max_depth"]),
            "min_samples_split": trial.suggest_int('min_samples_split', *space["min_samples_split"]),
            "min_samples_leaf": trial.suggest_int('min_samples_leaf', *space["min_samples_leaf"]),
            "max_features": trial.suggest_categorical('max_features', space["max_features"]),
            "criterion": trial.suggest_categorical('criterion', space["criterion"]),
        }
        # Seeded so that a trial's score does not depend on random forest luck.
        return lambda: RandomForestRegressor(**params, random_state=CFG.seed)

    raise ValueError(
        f"Unknown model_name {model_name!r}; expected a name starting with "
        "'CatBoost', 'XGBoost', 'LightGBM', 'SVR' or 'RF'."
    )


def objective(trial, X, y, model_name, weights, n_splits=5):
    """
    Objective function for hyperparameter optimization using Optuna.
    It performs K-Fold cross-validation and calculates Mean Squared Error (MSE) for the given model.

    Parameters:
    - trial: The current Optuna trial, which contains hyperparameter suggestions.
    - X: The feature matrix (training data).
    - y: The target variable (training targets).
    - model_name: The name of the model (e.g., "CatBoost", "XGBoost", "LightGBM", "SVR", "RF").
    - weights: Sample weights for the training data (used for fitting only;
      the validation MSE is unweighted).
    - n_splits: The number of splits for K-Fold cross-validation (default is 5).

    Returns:
    - The mean MSE score over the K-Fold splits. This will be minimized by Optuna to find the best hyperparameters.

    Raises:
    - ValueError: if `model_name` is not one of the supported models.
    """
    X = np.array(X)
    y = np.array(y)
    weights = np.array(weights)

    # Validate the model name and sample hyperparameters once, before any training.
    make_model = _make_model_factory(trial, model_name)
    print(f"\n----- Training {model_name} -----")

    # K-Fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    mse_scores = []
    for train_index, valid_index in kf.split(X):
        model = make_model()

        # Sample weights apply to the training rows of this fold only.
        model.fit(X[train_index], y[train_index], sample_weight=weights[train_index])
        preds = model.predict(X[valid_index])

        mse_scores.append(mean_squared_error(y[valid_index], preds))

    return np.mean(mse_scores)

In [None]:
# Encode the combined review text of each split with the sentence-embedding model.
train_texts = train_df['text_combined'].tolist()
test_texts = test_df['text_combined'].tolist()
train_embeddings = embedding_model.encode(train_texts, show_progress_bar=True)
test_embeddings = embedding_model.encode(test_texts, show_progress_bar=True)

# Features are the embeddings; the target is the rating.
X_train, y_train = train_embeddings, train_df['rating'].values
X_test, y_test = test_embeddings, test_df['rating'].values

def run(X_train, y_train, model_name, n_trials, weights, seed=42):
    """
    Function to run hyperparameter optimization using Optuna.

    Parameters:
    - X_train: The training feature matrix
    - y_train: The training target values
    - model_name: The name of the model to use (e.g., CatBoost, XGBoost)
    - n_trials: The number of Optuna trials for optimization
    - weights: Sample weights for training
    - seed: Seed for Optuna's sampler so repeated runs propose the same
      hyperparameters (default 42)

    Returns:
    - The best hyperparameters found by Optuna
    """
    # TPE is Optuna's default single-objective sampler; seeding it explicitly
    # keeps the search algorithm unchanged while making it reproducible.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(
        lambda trial: objective(trial, X_train, y_train, model_name, weights),
        n_trials=n_trials,
    )

    print("Best MSE:", study.best_value)
    print("Best params:", study.best_params)

    return study.best_params

### Model Evaluation and Saving

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, test_df, model_name,
                   sample_weight=None, clip_range=(1.0, 5.0)):
    """
    Fit a model on the training data, predict on the test data and print
    evaluation metrics before and after post-processing the predictions.

    Parameters:
    - model: The machine learning model to be evaluated (e.g., XGBoost, CatBoost, etc.)
    - X_train: Training features
    - y_train: Target values for the training set
    - X_test: Testing features
    - y_test: Target values for the test set
    - test_df: DataFrame that will hold the predictions; a 'preds' column is
      written to it in place
    - model_name: The name of the model (string) for displaying results
    - sample_weight: Optional per-row training weights passed to `model.fit`.
      The default (None) fits without weights, as before.
    - clip_range: `(lower, upper)` bounds applied to the predictions before
      rounding; either bound may be None to disable it. The default assumes a
      1–5 rating scale — confirm against the dataset. Use `(None, 5.0)` for
      the previous upper-only clipping.

    Returns:
    - test_df: The input DataFrame with the clipped and rounded predictions
      in its 'preds' column

    Printed metrics:
    - MAE, MSE and R2 on the raw predictions
    - Number of predictions outside each active bound
    - MAE, MSE and R2 on the clipped and rounded predictions
    """
    # Fit
    if sample_weight is None:
        model.fit(X_train, y_train)
    else:
        model.fit(X_train, y_train, sample_weight=sample_weight)

    # Predict
    preds = np.asarray(model.predict(X_test), dtype=float)
    lower, upper = clip_range

    # Before clipping
    print(f"\n{model_name} Results (Before Clipping)")
    print("MAE:\t{:.5f}".format(mean_absolute_error(y_test, preds)))
    print("MSE:\t{:.5f}".format(mean_squared_error(y_test, preds)))
    print("R2:\t{:.5f}".format(r2_score(y_test, preds)))
    if upper is not None:
        print(f"Preds > {upper:g}:\t", int((preds > upper).sum()))
    if lower is not None:
        print(f"Preds < {lower:g}:\t", int((preds < lower).sum()))

    # Clipping predictions (without a lower bound, low predictions would
    # round to 0 or negative ratings)
    if lower is not None or upper is not None:
        preds = np.clip(preds, lower, upper)

    # Rounding predictions to whole ratings
    test_df['preds'] = preds
    test_df['preds'] = test_df['preds'].round()

    # After clipping
    print(f"\n{model_name} Results (After Clipping)")
    print("MAE:\t{:.5f}".format(mean_absolute_error(y_test, test_df['preds'])))
    print("MSE:\t{:.5f}".format(mean_squared_error(y_test, test_df['preds'])))
    print("R2:\t{:.5f}".format(r2_score(y_test, test_df['preds'])))

    return test_df

def save_model(model, model_name, output_dir="/content/drive/MyDrive/BERT_rating_prediction/ml_models/"):
    """
    Persist a trained model as `<output_dir>/<model_name>.pkl` using joblib.

    Parameters:
    - model: The trained model object to store
    - model_name: Base name of the output file (without extension)
    - output_dir: Target directory; created if it does not exist yet

    Returns:
    - None (the destination path is printed)
    """
    # Make sure the destination directory exists before writing.
    os.makedirs(output_dir, exist_ok=True)

    target_name = f"{model_name}.pkl"
    file_path = os.path.join(output_dir, target_name)

    joblib.dump(model, file_path)
    print(f"Model saved to: {file_path}")

# Model Training

### Train and evaluate SVR

In [None]:
# Tune SVR with Optuna, refit it on the training split, evaluate on the test split and save it.
best_params = run(X_train=X_train, y_train=y_train, model_name="SVR", n_trials=10, weights=weights)
best_model = SVR(**best_params)
test_df_svr = evaluate_model(
    model=best_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    test_df=test_df.copy(),
    model_name="SVR",
)
save_model(model=best_model, model_name="SVR")

### Train and evaluate LightGBM

In [None]:
# Tune LightGBM with Optuna, refit it on the training split, evaluate on the test split and save it.
best_params = run(X_train=X_train, y_train=y_train, model_name="LightGBM", n_trials=1, weights=weights)
best_model = LGBMRegressor(**best_params, random_state=42)
test_df_lgb = evaluate_model(
    model=best_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    test_df=test_df.copy(),
    model_name="LightGBM",
)
save_model(model=best_model, model_name="LightGBM")

### Train and evaluate XGBoost

In [None]:
# Tune XGBoost with Optuna, refit it on the training split, evaluate on the test split and save it.
best_params = run(X_train=X_train, y_train=y_train, model_name="XGBoost", n_trials=1, weights=weights)
best_model = XGBRegressor(**best_params, random_state=42)
test_df_xgb = evaluate_model(
    model=best_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    test_df=test_df.copy(),
    model_name="XGBoost",
)
save_model(model=best_model, model_name="XGBoost")

### Train and evaluate CatBoost

In [None]:
# Tune CatBoost with Optuna, refit it on the training split, evaluate on the test split and save it.
best_params = run(X_train=X_train, y_train=y_train, model_name="CatBoost", n_trials=1, weights=weights)
best_model = CatBoostRegressor(**best_params, random_state=42, verbose=200, cat_features=[])
test_df_cat = evaluate_model(
    model=best_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    test_df=test_df.copy(),
    model_name="CatBoost",
)
save_model(model=best_model, model_name="CatBoost")

### Train and evaluate RandomForest

In [None]:
# Tune Random Forest with Optuna, refit it on the training split, evaluate on the test split and save it.
best_params = run(X_train=X_train, y_train=y_train, model_name="RF", n_trials=1, weights=weights)
best_model = RandomForestRegressor(**best_params, random_state=42)
test_df_rf = evaluate_model(
    model=best_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    test_df=test_df.copy(),
    model_name="RF",
)
save_model(model=best_model, model_name="RandomForest")

# Predictions on Test Set

### SVR prediction

In [None]:
test_df_svr.head(10)

### XGBoost prediction

In [None]:
test_df_xgb.head(10)

### LightGBM prediction

In [None]:
test_df_lgb.head(10)

### Catboost prediction

In [None]:
test_df_cat.head(10)

### Random Forest prediction

In [None]:
test_df_rf.head(10)