In [75]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error
import re

In [76]:
# Load the JSON data into a DataFrame
user_df = pd.read_json('data/user.json', lines=True)
business_df = pd.read_json('data/business.json', lines=True)
review_df = pd.read_json('data/review_train.json', lines=True)

In [77]:
# Load the training data
train_data = pd.read_csv("yelp_train.csv")
val_data = pd.read_csv("yelp_val.csv")

# Merge user and business dataset
train_data = pd.merge(train_data, user_df, on="user_id", how="left")
train_data = pd.merge(train_data, business_df, on="business_id", how="left")
val_data = pd.merge(val_data, user_df, on="user_id", how="left")
val_data = pd.merge(val_data, business_df, on="business_id", how="left")

In [78]:
# Add suffix to "user_id" and "business_id"
train_data['user_id'] = train_data ['user_id'].astype(str) + '-u'
train_data['business_id'] = train_data['business_id'].astype(str) + '-b'

val_data['user_id'] = val_data ['user_id'].astype(str) + '-u'
val_data['business_id'] = val_data['business_id'].astype(str) + '-b'

In [63]:
test_data = pd.read_csv("yelp_test_in.csv")

In [65]:
# Add suffix to "user_id" and "business_id"
test_data['user_id'] = test_data ['user_id'].astype(str) + '-u'
test_data['business_id'] = test_data['business_id'].astype(str) + '-b'

# Merge with PBG embedding

In [79]:
# Merge PBG embedding with training and validation dataset
user_embeddings_file = './embeded_features/PBG/epoch100/user_embeddings.tsv'
business_embeddings_file = './embeded_features/PBG/epoch100/business_embeddings.tsv'
city_embeddings_file = './embeded_features/PBG/epoch100/city_embeddings.tsv'
category_embeddings_file = './embeded_features/PBG/epoch100/category_embeddings.tsv'

In [70]:
# Only read the existed user_id and business_id in train and val to save memory
unique_user_ids = set(train_data['user_id']).union(set(val_data['user_id'])).union(set(test_data['user_id']))
unique_business_ids = set(train_data['business_id']).union(set(val_data['business_id'])).union(set(test_data['business_id']))

In [71]:
# Function to process embeddings file and filter based on IDs
def filter_embeddings(embedding_file, ids_to_keep, id_column_name):
    filtered_embeddings = []
    with open(embedding_file, 'r') as file:
        for line in file:
            entity_id = line.split('\t')[0]
            if entity_id in ids_to_keep:
                filtered_embeddings.append(line.strip().split('\t'))

    # Convert to DataFrame
    df = pd.DataFrame(filtered_embeddings)
    df.set_index(0, inplace=True)  # Set index as entity_id
    df.columns = [f'{id_column_name}_eb{i}' for i in range(1, df.shape[1] + 1)]
    return df

# Filter the user and business embeddings
user_embeddings = filter_embeddings(user_embeddings_file, unique_user_ids, 'user')
business_embeddings = filter_embeddings(business_embeddings_file, unique_business_ids, 'business')

In [72]:
user_embeddings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11270 entries, MWuVbyBgP4vD24Rc7UH5xw-u to hYan1ohCp1Vg58cNwPhV5g-u
Data columns (total 100 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_eb1    11270 non-null  object
 1   user_eb2    11270 non-null  object
 2   user_eb3    11270 non-null  object
 3   user_eb4    11270 non-null  object
 4   user_eb5    11270 non-null  object
 5   user_eb6    11270 non-null  object
 6   user_eb7    11270 non-null  object
 7   user_eb8    11270 non-null  object
 8   user_eb9    11270 non-null  object
 9   user_eb10   11270 non-null  object
 10  user_eb11   11270 non-null  object
 11  user_eb12   11270 non-null  object
 12  user_eb13   11270 non-null  object
 13  user_eb14   11270 non-null  object
 14  user_eb15   11270 non-null  object
 15  user_eb16   11270 non-null  object
 16  user_eb17   11270 non-null  object
 17  user_eb18   11270 non-null  object
 18  user_eb19   11270 non-null  object
 19  user_eb2

In [73]:
business_embeddings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25054 entries, fRu_POqPowUo6nHSRMjOPw-b to wMhovVi6ToUtVoP-YxmsOA-b
Data columns (total 100 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   business_eb1    25054 non-null  object
 1   business_eb2    25054 non-null  object
 2   business_eb3    25054 non-null  object
 3   business_eb4    25054 non-null  object
 4   business_eb5    25054 non-null  object
 5   business_eb6    25054 non-null  object
 6   business_eb7    25054 non-null  object
 7   business_eb8    25054 non-null  object
 8   business_eb9    25054 non-null  object
 9   business_eb10   25054 non-null  object
 10  business_eb11   25054 non-null  object
 11  business_eb12   25054 non-null  object
 12  business_eb13   25054 non-null  object
 13  business_eb14   25054 non-null  object
 14  business_eb15   25054 non-null  object
 15  business_eb16   25054 non-null  object
 16  business_eb17   25054 non-null  object
 17  business_eb1

In [74]:
# Save filtered user and business embedding file
user_embeddings.to_csv('./embeded_features/PBG/epoch100/filtered_user_embeddings.csv')
business_embeddings.to_csv('./embeded_features/PBG/epoch100/filtered_business_embeddings.csv')

In [80]:
# Read filtered user and business embedding file
user_embeddings = pd.read_csv('./embeded_features/PBG/epoch100/filtered_user_embeddings.csv', index_col=0)
business_embeddings = pd.read_csv('./embeded_features/PBG/epoch100/filtered_business_embeddings.csv', index_col=0)

In [81]:
# Merge user and business embeddings with both training and validation datasets
train_data = train_data.merge(user_embeddings, how='left', left_on='user_id', right_index=True)
train_data = train_data.merge(business_embeddings, how='left', left_on='business_id', right_index=True)

val_data = val_data.merge(user_embeddings, how='left', left_on='user_id', right_index=True)
val_data = val_data.merge(business_embeddings, how='left', left_on='business_id', right_index=True)

In [82]:
# Merge city embeddings with both training and validation datasets
# Clean "city"
train_data['city'] = train_data['city'].astype(str) + '-ct'
train_data['city'] = train_data['city'].str.strip().str.lower().str.replace(' ', '_')
val_data['city'] = val_data['city'].astype(str) + '-ct'
val_data['city'] = val_data['city'].str.strip().str.lower().str.replace(' ', '_')

unique_city_ids = set(train_data['city']).union(set(val_data['city']))
city_embeddings = filter_embeddings(city_embeddings_file, unique_city_ids, 'city')

train_data = train_data.merge(city_embeddings, how='left', left_on='city', right_index=True)
val_data = val_data.merge(city_embeddings, how='left', left_on='city', right_index=True)

In [83]:
# Merge category embeddings with both training and validation datasets
category_embeddings = pd.read_csv(category_embeddings_file, sep='\t', header=None, index_col=0)
category_embeddings.columns = [f'category_eb{i}' for i in range(1, category_embeddings.shape[1] + 1)]

def average_category_embeddings(category_string, category_embeddings, embedding_size):
    if category_string == None:
        return [0] * embedding_size
    cleaned_categories = [cat.replace(' ', '_') + '-c' for cat in category_string.split(", ")]
    embeddings = category_embeddings.loc[category_embeddings.index.intersection(cleaned_categories)]
    if embeddings.empty:
        return [0] * embedding_size
    return embeddings.mean().tolist()

# The number of dimensions in your embeddings is 100
embedding_size = 100
category_embedding_cols = [f'category_eb{i}' for i in range(1, embedding_size + 1)]

train_avg_category_embeddings = train_data['categories'].apply(lambda x: average_category_embeddings(x, category_embeddings, embedding_size))
val_avg_category_embeddings = val_data['categories'].apply(lambda x: average_category_embeddings(x, category_embeddings, embedding_size))

# Convert lists of embeddings to a DataFrame
train_avg_category_embeddings_df = pd.DataFrame(train_avg_category_embeddings.tolist(), index=train_data.index, columns=category_embedding_cols)
val_avg_category_embeddings_df = pd.DataFrame(val_avg_category_embeddings.tolist(), index=val_data.index, columns=category_embedding_cols)

# Concatenate the new DataFrame with the original training and validation data
train_data = pd.concat([train_data, train_avg_category_embeddings_df], axis=1)
val_data = pd.concat([val_data, val_avg_category_embeddings_df], axis=1)

# Drop the original 'categories' column if no longer needed
train_data.drop('categories', axis=1, inplace=True)
val_data.drop('categories', axis=1, inplace=True)

In [17]:
train_data

Unnamed: 0,user_id,business_id,stars_x,name_x,review_count_x,yelping_since,friends,useful,funny,cool,...,category_eb91,category_eb92,category_eb93,category_eb94,category_eb95,category_eb96,category_eb97,category_eb98,category_eb99,category_eb100
0,vxR_YV0atFxIxfOnF9uHjQ-u,gTw6PENNGl68ZPUpYWP50A-b,5.0,Debbie,353,2006-06-16,"ir2V_EKfO7XOfKkmX6khCg, uukJrcxFaQFYlbXDql4Kbw...",2005,1826,1872,...,0.061217,-0.048337,-0.167386,0.130525,0.037815,0.097196,0.160434,-0.051821,-0.223712,0.018650
1,o0p-iTC5yTBV5Yab_7es4g-u,iAuOpYDfOTuzQ6OPpEiGwA-b,4.0,Scott,433,2010-01-08,"CkyNJfLv6yx55Pdjq_8T_g, 55sr442Csr6cJ3c430MSow...",22,19,33,...,0.046681,-0.092869,-0.029402,0.012135,-0.074395,0.241833,-0.016240,-0.104173,-0.093384,-0.008089
2,-qj9ouN0bzMXz1vfEslG-A-u,5j7BnXXvlS69uLVHrY9Upw-b,2.0,Jaddahti,154,2012-06-30,"-xDW3gYiYaoeVASXywTPgw, OueXAik2P-eUcXbd1qGXKw...",2660,644,1003,...,-0.174777,0.044597,-0.064023,0.115727,0.029150,-0.094246,0.322639,-0.236703,0.235710,0.063903
3,E43QxgV87Ij6KxMCHcijKw-u,jUYp798M93Mpcjys_TTgsQ-b,5.0,Nikki,668,2012-03-26,"pufGgg3EuY_As7cu__pM1w, 3ONXl3eodyqkhysi-UoseA...",1214,180,826,...,-0.159266,-0.072648,0.023384,0.261897,-0.019492,0.004084,0.063447,0.113596,-0.087337,-0.187862
4,T13IBpJITI32a1k41rc-tg-u,3MntE_HWbNNoyiLGxywjYA-b,5.0,Aimee,304,2015-12-18,"swUxTfJ96XZJ0ufmljJiXQ, T1oHdzsrFeTrQhHfNdls8A...",14,9,8,...,0.049052,-0.044036,-0.050409,0.138590,0.155675,0.059859,0.365941,-0.228835,-0.034429,0.291386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455849,Tsm8VraTp5OGyVALtUiCeQ-u,DXlDzOcpdUE_F21tok0fgw-b,5.0,Monica,951,2009-10-25,"VAI4LygxcbgHctXSS63kbg, 9uwTqhvD5340tgbD6BJrGA...",16440,7770,15239,...,0.131627,-0.035727,0.097267,0.017720,-0.098105,0.290240,-0.123005,-0.108009,-0.011020,-0.090139
455850,BrqGnby6aahIOc0N_1x0Bg-u,_UEQPDDiSgyYqjORWeLfJg-b,1.0,Joah,63,2012-07-13,"uOIaBdtM_uNHDNWW51fTgg, bu978PmR7CL4TfSCqE28RQ...",8,3,0,...,0.068299,-0.076515,-0.224999,0.349166,0.026720,0.032217,0.034222,-0.016249,0.011128,0.007497
455851,-lh59ko3dxChBSZ9U7LfUw-u,KPV_FVNWkgmYh1ArVlt6kg-b,3.0,Lissa,902,2007-08-14,"wXVrWYbX5pldpqsxcVk_6g, OK7jABaab-ITXjFopirh2A...",462,152,345,...,0.009964,-0.037476,-0.019379,-0.169863,0.051777,0.450110,0.181399,0.063993,-0.198551,-0.088449
455852,3VPVQ4fCNF1vYayTJUxKNA-u,YNDxeeRUARbd8GRnscJSvg-b,5.0,Sonya,52,2010-07-27,"bHwcGZP51S02OY72OZi7wg, kmSOityfRkz73gfeBJdXqw...",27,10,23,...,0.102350,-0.106567,-0.072236,0.151984,0.023981,0.143254,0.148726,-0.021718,-0.093215,0.116072


# Other feature processing

In [84]:
# Delete uncessary features
columns_to_drop = [
    'user_id', 'business_id',   # Identifiers
    'name_x', 'name_y',        # User and business names (assuming name_x is from the user dataset and name_y is from the business dataset)
    'address', 'postal_code',  # Specific location details which might not be generalizable
    'hours',                   # Detailed operational hours (unless you plan to engineer features from it)
    'neighborhood',            # Can be dropped if you're using city and state for location info
    'attributes',              # Might be complex to parse, but can be useful if processed correctly      
    'city', 'state'            # Use Latitude and Longitude
]

train_data = train_data.drop(columns=columns_to_drop)
val_data = val_data.drop(columns=columns_to_drop)

def feature_process_noreview(train_data):
    # Convert "yelping_since" to "yelping_duration":
    collection_date = pd.to_datetime("2018-07-02") # latest review date
    train_data['yelping_since'] = pd.to_datetime(train_data['yelping_since'])
    train_data['yelping_duration'] = (collection_date - train_data['yelping_since']).dt.days

    # Convert "friends" to "num_of_friends":
    train_data['num_of_friends'] = train_data['friends'].apply(lambda x: 0 if x == "None" else len(x.split(',')))

    # Convert "elite" to "num_of_elites":
    train_data['num_of_elites'] = train_data['elite'].apply(lambda x: 0 if x == "None" else len(x.split(',')))

    train_data.drop(['yelping_since', 'friends', 'elite'], axis=1, inplace=True)
    
    return train_data

train_data = feature_process_noreview(train_data)
val_data = feature_process_noreview(val_data)

train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 455854 entries, 0 to 455853
Columns: 426 entries, stars_x to num_of_elites
dtypes: float64(305), int64(21), object(100)
memory usage: 1.5+ GB


In [19]:
train_data

Unnamed: 0,stars_x,review_count_x,useful,funny,cool,fans,average_stars,compliment_hot,compliment_more,compliment_profile,...,category_eb94,category_eb95,category_eb96,category_eb97,category_eb98,category_eb99,category_eb100,yelping_duration,num_of_friends,num_of_elites
0,5.0,353,2005,1826,1872,69,4.11,213,15,6,...,0.130525,0.037815,0.097196,0.160434,-0.051821,-0.223712,0.018650,4399,787,8
1,4.0,433,22,19,33,12,3.92,7,4,2,...,0.012135,-0.074395,0.241833,-0.016240,-0.104173,-0.093384,-0.008089,3097,109,4
2,2.0,154,2660,644,1003,9,3.69,12,6,0,...,0.115727,0.029150,-0.094246,0.322639,-0.236703,0.235710,0.063903,2193,55,0
3,5.0,668,1214,180,826,407,4.11,143,18,8,...,0.261897,-0.019492,0.004084,0.063447,0.113596,-0.087337,-0.187862,2289,2598,6
4,5.0,304,14,9,8,11,3.60,5,2,0,...,0.138590,0.155675,0.059859,0.365941,-0.228835,-0.034429,0.291386,927,98,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455849,5.0,951,16440,7770,15239,81,3.47,55,10,8,...,0.017720,-0.098105,0.290240,-0.123005,-0.108009,-0.011020,-0.090139,3172,174,8
455850,1.0,63,8,3,0,5,3.63,0,0,1,...,0.349166,0.026720,0.032217,0.034222,-0.016249,0.011128,0.007497,2180,99,0
455851,3.0,902,462,152,345,127,3.71,420,35,57,...,-0.169863,0.051777,0.450110,0.181399,0.063993,-0.198551,-0.088449,3975,1221,10
455852,5.0,52,27,10,23,1,3.33,0,1,0,...,0.151984,0.023981,0.143254,0.148726,-0.021718,-0.093215,0.116072,2897,151,0


In [20]:
# Convert embedding values for float for training

# List of embedding column names
# embedding_columns = [col for col in train_data.columns if 'eb' in col]

# Convert each embedding column to numeric type (float)
# for col in embedding_columns:
#    train_data[col] = pd.to_numeric(train_data[col], errors='raise')
#    val_data[col] = pd.to_numeric(val_data[col], errors='raise')


In [85]:
# Convert all embedding columns to float at once
embedding_columns = [col for col in train_data.columns if 'eb' in col]
train_data[embedding_columns] = train_data[embedding_columns].astype(float)
val_data[embedding_columns] = val_data[embedding_columns].astype(float).fillna(0)

# Training XGBOOST

In [86]:
X_train = train_data.drop('stars_x', axis=1) # Features
y_train = train_data['stars_x'] # Target variable

X_val = val_data.drop('stars_x', axis=1) # Features for evaluation data
y_val = val_data['stars_x'] # True ratings for evaluation data

In [40]:
# Create DMatrix for training
dtrain = xgb.DMatrix(X_train, label=y_train)

# Create DMatrix for validation
dvalid = xgb.DMatrix(X_val, label=y_val)

evals = [(dtrain, 'train'), (dvalid, 'valid')]

# Define parameters
# params = {
#     'objective': 'reg:linear',
#     'eval_metric': 'rmse',
#     'alpha': 0.3, 
#     'colsample_bytree': 0.5,         
#     'learning_rate': 0.02, 
#     'lambda': 10,
#     'max_depth': 8, 
#     #'max_bin': 500,
#     'min_child_weight': 150,
#     'subsample': 0.8        
    
# }

# Parameters from BO
params = {
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'alpha': 2.5315767101140376, 
    'colsample_bytree': 0.7870537878025798, 
    'learning_rate': 0.010692770887950544, 
    'lambda': 19.713199469203857, 
    'max_depth': 11, 
    'min_child_weight': 261, 
    'subsample': 0.835210157107768
}

# Train model
num_rounds = 5000
bst = xgb.train(params, dtrain, num_rounds, evals=evals, early_stopping_rounds=50, verbose_eval=10)
print("Best RMSE at iteration:", bst.best_iteration)


[0]	train-rmse:3.40723	valid-rmse:3.40508
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[10]	train-rmse:3.09186	valid-rmse:3.08961
[20]	train-rmse:2.81173	valid-rmse:2.80941
[30]	train-rmse:2.56421	valid-rmse:2.56179
[40]	train-rmse:2.34481	valid-rmse:2.34237
[50]	train-rmse:2.15171	valid-rmse:2.14925
[60]	train-rmse:1.98215	valid-rmse:1.9797
[70]	train-rmse:1.83348	valid-rmse:1.83108
[80]	train-rmse:1.70424	valid-rmse:1.70196
[90]	train-rmse:1.59128	valid-rmse:1.58924
[100]	train-rmse:1.49389	valid-rmse:1.49217
[110]	train-rmse:1.41042	valid-rmse:1.40912
[120]	train-rmse:1.33875	valid-rmse:1.33804
[130]	train-rmse:1.27791	valid-rmse:1.2779
[140]	train-rmse:1.22603	valid-rmse:1.22683
[150]	train-rmse:1.18207	valid-rmse:1.18382
[160]	train-rmse:1.1451	valid-rmse:1.14789
[170]	train-rmse:1.11406	valid-rmse:1.11797
[180]	train-rmse:1.08796	valid-rmse:1.09304
[190]	train-rmse:1.06627	valid-rm

[1760]	train-rmse:0.878147	valid-rmse:0.971097
[1770]	train-rmse:0.877859	valid-rmse:0.971103
[1780]	train-rmse:0.877394	valid-rmse:0.971099
[1790]	train-rmse:0.877017	valid-rmse:0.971102
[1800]	train-rmse:0.876556	valid-rmse:0.971107
[1810]	train-rmse:0.876149	valid-rmse:0.971108
Stopping. Best iteration:
[1760]	train-rmse:0.878147	valid-rmse:0.971097

Best RMSE at iteration: 1760


In [41]:
# Get feature importance
importance = bst.get_score(importance_type='weight')

# Sort features by importance
sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)

# Display the top 30 features
for feature, score in sorted_importance[:30]:
    print(f"Feature: {feature}, Importance: {score}")

Feature: average_stars, Importance: 6793
Feature: stars_y, Importance: 3266
Feature: review_count_x, Importance: 2131
Feature: user_eb87, Importance: 2073
Feature: user_eb93, Importance: 2043
Feature: user_eb81, Importance: 1974
Feature: user_eb51, Importance: 1957
Feature: user_eb85, Importance: 1935
Feature: user_eb48, Importance: 1934
Feature: user_eb77, Importance: 1887
Feature: user_eb37, Importance: 1884
Feature: user_eb54, Importance: 1857
Feature: user_eb9, Importance: 1854
Feature: user_eb1, Importance: 1833
Feature: user_eb88, Importance: 1807
Feature: user_eb40, Importance: 1806
Feature: user_eb22, Importance: 1786
Feature: user_eb34, Importance: 1767
Feature: user_eb10, Importance: 1765
Feature: user_eb46, Importance: 1757
Feature: user_eb17, Importance: 1751
Feature: user_eb100, Importance: 1751
Feature: user_eb26, Importance: 1747
Feature: user_eb31, Importance: 1739
Feature: user_eb90, Importance: 1737
Feature: review_count_y, Importance: 1727
Feature: user_eb91, Importa

In [45]:
model_name = './models/model-ubcctEmbedding426-epoch100-RMSE971097.json'
model_best_iter_file = './models/model-ubcctEmbedding426-epoch100-RMSE971097-best-iteration.txt'

In [46]:
# Save the model
#model_name = './models/model-ubcctEmbedding426-epoch100-RMSE97189.json'
bst.save_model(model_name)

#model_best_iter_file = './models/model-ubcctEmbedding426-epoch100-RMSE97189-best-iteration.txt'
# Save the best iteration number to a file
with open(model_best_iter_file, 'w') as f:
    f.write(str(bst.best_iteration))

In [47]:
model_xgb = xgb.Booster()
model_xgb.load_model(model_name)

# Load the best iteration number
with open(model_best_iter_file, 'r') as f:
    best_iteration = int(f.read())
    
# Assuming you have a DMatrix for your evaluation set named "dval"
y_pred = model_xgb.predict(dvalid, ntree_limit=best_iteration+1)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(dvalid.get_label(), y_pred))
print(f"RMSE: {rmse}")

RMSE: 0.9710966348648071


# Train full dataset combining training and validation dataset

In [87]:
full_train_data = pd.concat([train_data, val_data])
X_full_train = full_train_data.drop('stars_x', axis=1)  
y_full_train = full_train_data['stars_x'] 

In [88]:
dtrain_full = xgb.DMatrix(X_full_train, label=y_full_train)

# Create DMatrix for validation
dvalid = xgb.DMatrix(X_val, label=y_val)

In [57]:
evals = [(dtrain_full, 'train'), (dvalid, 'valid')]

# Parameters from BO
params = {
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'alpha': 2.5315767101140376, 
    'colsample_bytree': 0.7870537878025798, 
    'learning_rate': 0.010692770887950544, 
    'lambda': 19.713199469203857, 
    'max_depth': 11, 
    'min_child_weight': 261, 
    'subsample': 0.835210157107768
}

# Train model
num_rounds = 1600
final_model = xgb.train(params, dtrain_full, num_rounds, evals=evals, early_stopping_rounds=50, verbose_eval=10)

[0]	train-rmse:3.40671	valid-rmse:3.40506
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[10]	train-rmse:3.09203	valid-rmse:3.09029
[20]	train-rmse:2.81319	valid-rmse:2.81135
[30]	train-rmse:2.56441	valid-rmse:2.5625
[40]	train-rmse:2.34492	valid-rmse:2.34294
[50]	train-rmse:2.15099	valid-rmse:2.14894
[60]	train-rmse:1.98083	valid-rmse:1.97871
[70]	train-rmse:1.83185	valid-rmse:1.82968
[80]	train-rmse:1.70206	valid-rmse:1.69982
[90]	train-rmse:1.58924	valid-rmse:1.58695
[100]	train-rmse:1.49235	valid-rmse:1.49002
[110]	train-rmse:1.40855	valid-rmse:1.40619
[120]	train-rmse:1.33682	valid-rmse:1.33447
[130]	train-rmse:1.27601	valid-rmse:1.27364
[140]	train-rmse:1.2245	valid-rmse:1.22214
[150]	train-rmse:1.18084	valid-rmse:1.17848
[160]	train-rmse:1.14399	valid-rmse:1.14164
[170]	train-rmse:1.11292	valid-rmse:1.11058
[180]	train-rmse:1.08691	valid-rmse:1.0846
[190]	train-rmse:1.06523	valid-rm

In [58]:
print("Best RMSE at iteration:", final_model.best_iteration)

Best RMSE at iteration: 1599


In [59]:
model_name = './models/model-TrainVal-ubcctEmbedding426-epoch100-RMSE890982.json'
model_best_iter_file = './models/model-TrainVal-ubcctEmbedding426-epoch100-RMSE890982-best-iteration.txt'

In [60]:
# Save the model
#model_name = './models/model-ubcctEmbedding426-epoch100-RMSE97189.json'
final_model.save_model(model_name)

#model_best_iter_file = './models/model-ubcctEmbedding426-epoch100-RMSE97189-best-iteration.txt'
# Save the best iteration number to a file
with open(model_best_iter_file, 'w') as f:
    f.write(str(final_model.best_iteration))

In [89]:
final_model_xgb = xgb.Booster()
final_model_xgb.load_model(model_name)

# Load the best iteration number
with open(model_best_iter_file, 'r') as f:
    best_iteration = int(f.read())
    
# Assuming you have a DMatrix for your evaluation set named "dval"
y_pred = final_model_xgb.predict(dvalid, ntree_limit=best_iteration+1)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(dvalid.get_label(), y_pred))
print(f"RMSE: {rmse}")

RMSE: 0.8909814953804016


In [90]:
# Print Error Distribution
y_true = dvalid.get_label()

# Calculate absolute errors
abs_errors = np.abs(y_true - y_pred)

# Define the error distribution bins
error_bins = {'>=0 and <1': 0, '>=1 and <2': 0, '>=2 and <3': 0, '>=3 and <4': 0, '>=4': 0}

# Increment the count in the appropriate bin
for error in abs_errors:
    if error >= 4:
        error_bins['>=4'] += 1
    elif error >= 3:
        error_bins['>=3 and <4'] += 1
    elif error >= 2:
        error_bins['>=2 and <3'] += 1
    elif error >= 1:
        error_bins['>=1 and <2'] += 1
    else:
        error_bins['>=0 and <1'] += 1

for r, count in error_bins.items():
    print(f"{r}: {count}")

>=0 and <1: 108400
>=1 and <2: 28780
>=2 and <3: 4532
>=3 and <4: 332
>=4: 0


# Hyperparameter tunning for XGB

In [33]:
import optuna
import xgboost as xgb

In [34]:
X_train = train_data.drop('stars_x', axis=1) # Features
y_train = train_data['stars_x'] # Target variable

X_val = val_data.drop('stars_x', axis=1) # Features for evaluation data
y_val = val_data['stars_x'] # True ratings for evaluation data

In [35]:
def objective(trial):
    # Define the hyperparameter search space
    params = {
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'alpha': trial.suggest_float('alpha', 0.1, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'lambda': trial.suggest_float('lambda', 1, 20),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 100, 300),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0)
    }

    # Create DMatrix for training and validation
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)
    evals = [(dtrain, 'train'), (dvalid, 'valid')]

    # Train the model
    num_rounds = 5000
    model = xgb.train(params, dtrain, num_rounds, evals=evals, early_stopping_rounds=50, verbose_eval=0)

    # Return the best validation RMSE
    return model.best_score


In [36]:
def custom_callback(study, trial):
    print(f"Trial {trial.number} finished with value: {trial.value} and parameters: {trial.params}.")
    print(f"Best value so far: {study.best_value}")

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, callbacks=[custom_callback])

[32m[I 2023-12-03 07:57:08,661][0m A new study created in memory with name: no-name-1b7340b0-e618-4198-9ec9-37047185dd0a[0m
[32m[I 2023-12-03 08:17:36,252][0m Trial 0 finished with value: 0.973538 and parameters: {'alpha': 3.5599386311312755, 'colsample_bytree': 0.9222685842493455, 'learning_rate': 0.05368006944638404, 'lambda': 14.735756471394263, 'max_depth': 6, 'min_child_weight': 240, 'subsample': 0.7295622824961296}. Best is trial 0 with value: 0.973538.[0m


Trial 0 finished with value: 0.973538 and parameters: {'alpha': 3.5599386311312755, 'colsample_bytree': 0.9222685842493455, 'learning_rate': 0.05368006944638404, 'lambda': 14.735756471394263, 'max_depth': 6, 'min_child_weight': 240, 'subsample': 0.7295622824961296}.
Best value so far: 0.973538


[32m[I 2023-12-03 09:00:03,550][0m Trial 1 finished with value: 0.973745 and parameters: {'alpha': 9.464305216724641, 'colsample_bytree': 0.6104765814759154, 'learning_rate': 0.017351959456478425, 'lambda': 6.1335577563297115, 'max_depth': 4, 'min_child_weight': 158, 'subsample': 0.6076496839947623}. Best is trial 0 with value: 0.973538.[0m


Trial 1 finished with value: 0.973745 and parameters: {'alpha': 9.464305216724641, 'colsample_bytree': 0.6104765814759154, 'learning_rate': 0.017351959456478425, 'lambda': 6.1335577563297115, 'max_depth': 4, 'min_child_weight': 158, 'subsample': 0.6076496839947623}.
Best value so far: 0.973538


[32m[I 2023-12-03 09:08:07,088][0m Trial 2 finished with value: 0.975075 and parameters: {'alpha': 9.143797937632241, 'colsample_bytree': 0.9708763023419246, 'learning_rate': 0.08144904877210958, 'lambda': 7.889431754222032, 'max_depth': 10, 'min_child_weight': 218, 'subsample': 0.5778424410382944}. Best is trial 0 with value: 0.973538.[0m


Trial 2 finished with value: 0.975075 and parameters: {'alpha': 9.143797937632241, 'colsample_bytree': 0.9708763023419246, 'learning_rate': 0.08144904877210958, 'lambda': 7.889431754222032, 'max_depth': 10, 'min_child_weight': 218, 'subsample': 0.5778424410382944}.
Best value so far: 0.973538


[32m[I 2023-12-03 09:43:11,505][0m Trial 3 finished with value: 0.973144 and parameters: {'alpha': 8.278160655491014, 'colsample_bytree': 0.5050039009767537, 'learning_rate': 0.015182912643913478, 'lambda': 3.523530787935206, 'max_depth': 5, 'min_child_weight': 137, 'subsample': 0.43615517739829357}. Best is trial 3 with value: 0.973144.[0m


Trial 3 finished with value: 0.973144 and parameters: {'alpha': 8.278160655491014, 'colsample_bytree': 0.5050039009767537, 'learning_rate': 0.015182912643913478, 'lambda': 3.523530787935206, 'max_depth': 5, 'min_child_weight': 137, 'subsample': 0.43615517739829357}.
Best value so far: 0.973144


[32m[I 2023-12-03 09:58:07,563][0m Trial 4 finished with value: 0.974639 and parameters: {'alpha': 6.218886702219857, 'colsample_bytree': 0.7120395914176371, 'learning_rate': 0.09417436760456355, 'lambda': 17.706770765549617, 'max_depth': 3, 'min_child_weight': 287, 'subsample': 0.9113552829271676}. Best is trial 3 with value: 0.973144.[0m


Trial 4 finished with value: 0.974639 and parameters: {'alpha': 6.218886702219857, 'colsample_bytree': 0.7120395914176371, 'learning_rate': 0.09417436760456355, 'lambda': 17.706770765549617, 'max_depth': 3, 'min_child_weight': 287, 'subsample': 0.9113552829271676}.
Best value so far: 0.973144


[32m[I 2023-12-03 10:11:58,272][0m Trial 5 finished with value: 0.97546 and parameters: {'alpha': 6.954379320375452, 'colsample_bytree': 0.6296843474569804, 'learning_rate': 0.06337624474238872, 'lambda': 9.243489295766238, 'max_depth': 3, 'min_child_weight': 202, 'subsample': 0.41935646404933}. Best is trial 3 with value: 0.973144.[0m


Trial 5 finished with value: 0.97546 and parameters: {'alpha': 6.954379320375452, 'colsample_bytree': 0.6296843474569804, 'learning_rate': 0.06337624474238872, 'lambda': 9.243489295766238, 'max_depth': 3, 'min_child_weight': 202, 'subsample': 0.41935646404933}.
Best value so far: 0.973144


[32m[I 2023-12-03 10:30:59,942][0m Trial 6 finished with value: 0.974924 and parameters: {'alpha': 7.407320739684848, 'colsample_bytree': 0.7102581720008061, 'learning_rate': 0.042469104050549385, 'lambda': 17.956034322141655, 'max_depth': 3, 'min_child_weight': 214, 'subsample': 0.45661846274097406}. Best is trial 3 with value: 0.973144.[0m


Trial 6 finished with value: 0.974924 and parameters: {'alpha': 7.407320739684848, 'colsample_bytree': 0.7102581720008061, 'learning_rate': 0.042469104050549385, 'lambda': 17.956034322141655, 'max_depth': 3, 'min_child_weight': 214, 'subsample': 0.45661846274097406}.
Best value so far: 0.973144


[32m[I 2023-12-03 10:45:27,833][0m Trial 7 finished with value: 0.974472 and parameters: {'alpha': 4.82559075405013, 'colsample_bytree': 0.6440261880300254, 'learning_rate': 0.06760366276735703, 'lambda': 15.311519550962124, 'max_depth': 4, 'min_child_weight': 215, 'subsample': 0.6546429846544746}. Best is trial 3 with value: 0.973144.[0m


Trial 7 finished with value: 0.974472 and parameters: {'alpha': 4.82559075405013, 'colsample_bytree': 0.6440261880300254, 'learning_rate': 0.06760366276735703, 'lambda': 15.311519550962124, 'max_depth': 4, 'min_child_weight': 215, 'subsample': 0.6546429846544746}.
Best value so far: 0.973144


[32m[I 2023-12-03 10:51:14,636][0m Trial 8 finished with value: 0.975185 and parameters: {'alpha': 0.26080647826524117, 'colsample_bytree': 0.5854318333406645, 'learning_rate': 0.08751618426500075, 'lambda': 8.184545927801121, 'max_depth': 12, 'min_child_weight': 251, 'subsample': 0.7088035906856436}. Best is trial 3 with value: 0.973144.[0m


Trial 8 finished with value: 0.975185 and parameters: {'alpha': 0.26080647826524117, 'colsample_bytree': 0.5854318333406645, 'learning_rate': 0.08751618426500075, 'lambda': 8.184545927801121, 'max_depth': 12, 'min_child_weight': 251, 'subsample': 0.7088035906856436}.
Best value so far: 0.973144


[32m[I 2023-12-03 11:59:30,711][0m Trial 9 finished with value: 0.971138 and parameters: {'alpha': 7.196324518026052, 'colsample_bytree': 0.6155127541727401, 'learning_rate': 0.010024275555949036, 'lambda': 15.605384373450232, 'max_depth': 10, 'min_child_weight': 274, 'subsample': 0.5322992890217315}. Best is trial 9 with value: 0.971138.[0m


Trial 9 finished with value: 0.971138 and parameters: {'alpha': 7.196324518026052, 'colsample_bytree': 0.6155127541727401, 'learning_rate': 0.010024275555949036, 'lambda': 15.605384373450232, 'max_depth': 10, 'min_child_weight': 274, 'subsample': 0.5322992890217315}.
Best value so far: 0.971138


[32m[I 2023-12-03 12:24:28,869][0m Trial 10 finished with value: 0.972382 and parameters: {'alpha': 2.952806746664268, 'colsample_bytree': 0.8712708476139344, 'learning_rate': 0.034093556672554884, 'lambda': 13.374575695864536, 'max_depth': 9, 'min_child_weight': 299, 'subsample': 0.8950327128391171}. Best is trial 9 with value: 0.971138.[0m


Trial 10 finished with value: 0.972382 and parameters: {'alpha': 2.952806746664268, 'colsample_bytree': 0.8712708476139344, 'learning_rate': 0.034093556672554884, 'lambda': 13.374575695864536, 'max_depth': 9, 'min_child_weight': 299, 'subsample': 0.8950327128391171}.
Best value so far: 0.971138


[32m[I 2023-12-03 12:55:25,099][0m Trial 11 finished with value: 0.972079 and parameters: {'alpha': 2.607294932117917, 'colsample_bytree': 0.8475854755153138, 'learning_rate': 0.0315314029856972, 'lambda': 12.457215533644675, 'max_depth': 9, 'min_child_weight': 293, 'subsample': 0.9894619780562619}. Best is trial 9 with value: 0.971138.[0m


Trial 11 finished with value: 0.972079 and parameters: {'alpha': 2.607294932117917, 'colsample_bytree': 0.8475854755153138, 'learning_rate': 0.0315314029856972, 'lambda': 12.457215533644675, 'max_depth': 9, 'min_child_weight': 293, 'subsample': 0.9894619780562619}.
Best value so far: 0.971138


[32m[I 2023-12-03 13:28:28,304][0m Trial 12 finished with value: 0.972627 and parameters: {'alpha': 1.166376915050888, 'colsample_bytree': 0.8062624185355353, 'learning_rate': 0.029151698953757244, 'lambda': 12.075330965839495, 'max_depth': 8, 'min_child_weight': 270, 'subsample': 0.9921380938076089}. Best is trial 9 with value: 0.971138.[0m


Trial 12 finished with value: 0.972627 and parameters: {'alpha': 1.166376915050888, 'colsample_bytree': 0.8062624185355353, 'learning_rate': 0.029151698953757244, 'lambda': 12.075330965839495, 'max_depth': 8, 'min_child_weight': 270, 'subsample': 0.9921380938076089}.
Best value so far: 0.971138


[32m[I 2023-12-03 14:46:50,863][0m Trial 13 finished with value: 0.971097 and parameters: {'alpha': 2.5315767101140376, 'colsample_bytree': 0.7870537878025798, 'learning_rate': 0.010692770887950544, 'lambda': 19.713199469203857, 'max_depth': 11, 'min_child_weight': 261, 'subsample': 0.835210157107768}. Best is trial 13 with value: 0.971097.[0m


Trial 13 finished with value: 0.971097 and parameters: {'alpha': 2.5315767101140376, 'colsample_bytree': 0.7870537878025798, 'learning_rate': 0.010692770887950544, 'lambda': 19.713199469203857, 'max_depth': 11, 'min_child_weight': 261, 'subsample': 0.835210157107768}.
Best value so far: 0.971097


[33m[W 2023-12-03 14:49:58,918][0m Trial 14 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "/Users/aoyanliang/anaconda3/envs/dsci553-spark-py36/lib/python3.6/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-35-f8e833e8e721>", line 22, in objective
    model = xgb.train(params, dtrain, num_rounds, evals=evals, early_stopping_rounds=50, verbose_eval=0)
  File "/Users/aoyanliang/anaconda3/envs/dsci553-spark-py36/lib/python3.6/site-packages/xgboost/training.py", line 204, in train
    xgb_model=xgb_model, callbacks=callbacks)
  File "/Users/aoyanliang/anaconda3/envs/dsci553-spark-py36/lib/python3.6/site-packages/xgboost/training.py", line 74, in _train_internal
    bst.update(dtrain, i, obj)
  File "/Users/aoyanliang/anaconda3/envs/dsci553-spark-py36/lib/python3.6/site-packages/xgboost/core.py", line 894, in update
    dtrain.handle))
KeyboardInterrupt


KeyboardInterrupt: 