## Xgboost with vf
training_data.csv (5/3 - 5/21)  
validation_data_without_sbi.csv (5/22 - 5/24 w/o sbi)  
validation_data_with_sbi.csv (5/22 - 5/24 w/ sbi)  

In [8]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from math import sqrt

# Load the CSV file
file_path = 'training_data.csv'  # Replace with the path to your training data file
df = pd.read_csv(file_path)

# Feature columns and target column
features = ['sno', 'act', 'tot', 'lat', 'lng', 'date_value', 'time', 'week', 'popularity', 'rainfall', 'see_rate_value', 'mrt_distances']
X = df[features]
y = df['sbi']  # Target column

# XGBoost parameters
params = {
    'max_depth': 12,  # Maximum depth of the tree
    'eta': 0.1,      # Learning rate
    'objective': 'reg:squarederror',  # Regression task
}

# Five-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = []
rmse_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train the model
    bst = xgb.train(params, dtrain, num_boost_round=150)

    # Predict
    y_pred = bst.predict(dtest)

    # Evaluate
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)  # Calculate root mean squared error
    mse_scores.append(mse)
    rmse_scores.append(rmse)

# Save the model
bst.save_model('xgboost_model.json')

# Average MSE and RMSE
average_mse = sum(mse_scores) / len(mse_scores)
average_rmse = sum(rmse_scores) / len(rmse_scores)
print(f"Average MSE: {average_mse}")
print(f"Average RMSE: {average_rmse}")


Average MSE: 6.02052936413026
Average RMSE: 2.453615654972574


## Validation with unseen dataset

In [9]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from math import sqrt

# Load the model
bst = xgb.Booster()
bst.load_model('xgboost_model.json')

# Load new data (without sbi)
pri_file_path = 'validation_data_without_sbi.csv'
pri_df = pd.read_csv(pri_file_path)

# Load validation data (with sbi)
val_file_path = 'validation_data_with_sbi.csv'
val_df = pd.read_csv(val_file_path)

# Feature columns
features = ['sno', 'act', 'tot', 'lat', 'lng', 'date_value', 'time', 'week', 'popularity', 'rainfall', 'see_rate_value', 'mrt_distances']

# Predict on new data
pri_X = pri_df[features]
dpri = xgb.DMatrix(pri_X)
pri_predictions = bst.predict(dpri)

# Predict on validation data and calculate accuracy
val_X = val_df[features]
val_y = val_df['sbi']
dval = xgb.DMatrix(val_X)
val_predictions = bst.predict(dval)

# Calculate MSE and RMSE
mse = mean_squared_error(val_y, val_predictions)
rmse = sqrt(mse)
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")

# Optional: Save the prediction results to a CSV file
val_df['predicted_sbi'] = val_predictions
val_df.to_csv('val_with_predictions.csv', index=False)
print("The prediction results of the validation data have been saved to: val_with_predictions.csv")


MSE: 69.86700448839885
RMSE: 8.358648484557706
The prediction results of the validation data have been saved to: val_with_predictions.csv
