# Model Training Module : Using RF and XGBoost

Imports

In [5]:
import numpy as np 
import pandas as pd 
import xgboost as xgb 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error 
import warnings 
warnings.filterwarnings('ignore') 

Split data into train and test based on the year >=2023 goes in test

In [2]:
x_train=pd.read_csv('x_train.csv') 
y_train=pd.read_csv('y_train.csv') 
x_test=x_train.iloc[45000:,:].to_numpy() 
y_test=y_train.iloc[45000:].to_numpy() 
x_train=x_train.iloc[:45000,:].to_numpy() 
y_train=y_train.iloc[:45000].to_numpy() 

Train XGBoost

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBRegressor

# 1) Prepare data
X = x_train  # your (n_samples × 126) feature matrix
y = y_train  # your target vector

# Optionally split into train/hold‑out to avoid overfitting on full data
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 2) Define model and parameter grid
xgb = XGBRegressor(objective='reg:squarederror', n_jobs=-1, verbosity=0)

param_grid = {
    'max_depth':        [3, 5, 7],
    'learning_rate':    [0.001, 0.01, 0.1],
    'n_estimators':     [100, 250, 500],
    'subsample':        [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha':        [0, 1, 5],
    'reg_lambda':       [0, 1, 5]
}

# 3) Set up GridSearchCV
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # maximize negative MAE
    cv=3,                               # 3-fold cross validation
    verbose=2
)

# 4) Run grid search
grid.fit(X_tr, y_tr)

# 5) Best parameters and performance
print("Best parameters found:", grid.best_params_)
print("Best CV MAE: {:.2f}".format(-grid.best_score_))

# 6) Validate on hold‑out
best_model = grid.best_estimator_
mae_val = np.mean(np.abs(best_model.predict(X_val) - y_val))
print("Hold‑out MAE: {:.2f}".format(mae_val))


Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
[CV] END colsample_bytree=0.6, learning_rate=0.001, max_depth=3, n_estimators=100, reg_alpha=0, reg_lambda=0, subsample=0.6; total time=  24.1s


KeyboardInterrupt: 

In [6]:
x_train = x_train.reshape(x_train.shape[0], -1)
dtrain = xgb.DMatrix(x_train, label=y_train)

params = {
    'objective': 'reg:squarederror',  # Regression objective for continuous target
    'eval_metric': 'rmse',             # Mean Absolute Error, suitable for regression
    'max_depth': 5,                   # Slightly deeper tree depth to capture more complexity
    'learning_rate': 0.001,            # Lower learning rate for better generalization
    'n_estimators': 250,              # Increase to allow the model to learn more from data
    'subsample': 0.6,                 # Use 60% of data per tree for better generalization
    'colsample_bytree': 0.8,          # Use 80% of features per tree
    'reg_alpha': 1,                 # L1 regularization to reduce overfitting
    'reg_lambda': 1                  # L2 regularization to control complexity
}  


watchlist = [(dtrain, 'train')]


num_round = 1000 
model = xgb.train(params, dtrain, num_round)

Predict on XGBoost chunkwise for the whole team with captain point = 2X and vice captain points = 1.5X

In [7]:
x_test = x_test.reshape(x_test.shape[0], -1)
dtest = xgb.DMatrix(x_test)
y_pred = model.predict(dtest)

In [8]:
import numpy as np

def process_chunks(y_pred, y_test, chunk_size=22):
    length = (len(y_pred) // chunk_size) * chunk_size  # Exclude the remainder
    y_pred_chunks = [y_pred[i:i + chunk_size] for i in range(0, length, chunk_size)]
    y_test_chunks = [y_test[i:i + chunk_size] for i in range(0, length, chunk_size)]

    top_11_indices_pred = []
    sorted_y_test_chunks = []  
    sorted_y_pred_chunks = [] 
    for chunk in y_pred_chunks:
        sorted_indices = np.argsort(chunk)[::-1] 
        top_11 = sorted_indices[:11]
        top_11_indices_pred.append(top_11)
    for chunk, indices in zip(y_test_chunks, top_11_indices_pred):
        sorted_chunk = np.zeros_like(chunk)
        sorted_chunk[:len(indices)] = chunk[indices]  # Assign top 11 values
        remaining_indices = [i for i in range(chunk_size) if i not in indices]
        sorted_chunk[len(indices):] = chunk[remaining_indices]  # Assign rest
        sorted_y_pred_chunks.append(sorted_chunk)  

    top_11_indices_test = []
    for chunk in y_test_chunks:
        sorted_indices = np.argsort(chunk)[::-1]  # Sort actual values in descending order
        top_11 = sorted_indices[:11]
        top_11_indices_test.append(top_11)
        sorted_y_test_chunks.append(chunk[sorted_indices])  # Sort the y_test values correctly
    
    return sorted_y_test_chunks, sorted_y_pred_chunks, top_11_indices_pred
y_test=y_test.flatten() 
y_pred=y_pred.flatten()
sorted_y_test_chunks, sorted_y_pred_chunks, top_11_indices_pred = process_chunks(y_pred, y_test)
mae_list = []
sum1_list = []

for i, (sorted_y_pred_chunk, sorted_y_test_chunk) in enumerate(zip(sorted_y_pred_chunks, sorted_y_test_chunks)):
    sum1 = sorted_y_test_chunk[0] * 2 + sorted_y_test_chunk[1] * 1.5 + np.sum(sorted_y_test_chunk[2:11]) 
    sum2 = sorted_y_pred_chunk[0] * 2 + sorted_y_pred_chunk[1] * 1.5 + np.sum(sorted_y_pred_chunk[2:11]) 

    error = np.abs(sum1 - sum2) 
    mae_list.append(error)
    sum1_list.append(sum1)

    print(f"Chunk {i+1} Error: {error}, True Sum: {sum1}, Predicted Sum: {sum2}")
print("Mean Absolute Error (MAE):", np.mean(np.array(mae_list)))
print("Average Dream Team Points in ODI:", np.mean(np.array(sum1_list)))


Chunk 1 Error: 150.0, True Sum: 686.0, Predicted Sum: 536.0
Chunk 2 Error: 781.0, True Sum: 1369.0, Predicted Sum: 588.0
Chunk 3 Error: 544.5, True Sum: 1300.0, Predicted Sum: 755.5
Chunk 4 Error: 213.0, True Sum: 984.5, Predicted Sum: 771.5
Chunk 5 Error: 310.0, True Sum: 946.0, Predicted Sum: 636.0
Chunk 6 Error: 436.5, True Sum: 1442.5, Predicted Sum: 1006.0
Chunk 7 Error: 367.5, True Sum: 818.5, Predicted Sum: 451.0
Chunk 8 Error: 436.5, True Sum: 1162.5, Predicted Sum: 726.0
Chunk 9 Error: 262.0, True Sum: 907.0, Predicted Sum: 645.0
Chunk 10 Error: 680.5, True Sum: 1323.0, Predicted Sum: 642.5
Chunk 11 Error: 190.0, True Sum: 887.0, Predicted Sum: 697.0
Chunk 12 Error: 316.5, True Sum: 1057.0, Predicted Sum: 740.5
Chunk 13 Error: 468.0, True Sum: 1210.0, Predicted Sum: 742.0
Chunk 14 Error: 706.0, True Sum: 1134.0, Predicted Sum: 428.0
Chunk 15 Error: 337.0, True Sum: 1225.5, Predicted Sum: 888.5
Chunk 16 Error: 529.5, True Sum: 1155.5, Predicted Sum: 626.0
Chunk 17 Error: 241.5,

In [9]:
print("Mean of y_test:", np.mean(y_test))
print("Std dev of y_test:", np.std(y_test))


Mean of y_test: 47.02742857142857
Std dev of y_test: 44.89197308732898


In [51]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

RMSE: 28.34296734762068


In [12]:
import pickle
pickle.dump(model, open("xgb_model.pkl", "wb"))

In [7]:
import os
import json
import pandas as pd

# Define the folder names and their corresponding csv folder mapping
folder_mapping = {
    'data1_json': 'csv1',
    'data2_json': 'csv2',
    'data3_json': 'csv3'
}

# Initialize a list to store the results
rows = []

# Base directory (adjust if needed)
base_dir = os.getcwd()  # or specify the directory where data folders are located

# Iterate through each folder in the mapping
for folder, csv_folder in folder_mapping.items():
    folder_path = os.path.join(base_dir, folder)
    # List all files that end with .json
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_full_path = os.path.join(folder_path, filename)
            # Open and load the JSON data
            with open(file_full_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Extract teams and date. 
            # Assuming teams is a list under info -> teams and date is the first element in info -> dates.
            teams = data.get('info', {}).get('teams', [])
            date_list = data.get('info', {}).get('dates', [])
            
            # Assign team1 and team2 based on the order in the list (if available)
            team1 = teams[0] if len(teams) > 0 else None
            team2 = teams[1] if len(teams) > 1 else None
            
            # Get the first date from the dates list
            date = date_list[0] if date_list else None
            
            # Create the new file path as described: dt/{csv_folder}/{filename with .csv extension}
            base_file, _ = os.path.splitext(filename)
            new_filename = base_file + '.csv'
            new_file_path = os.path.join('dt', csv_folder, new_filename)
            
            # Append the row details to the list
            rows.append({
                'file_path': new_file_path,
                'team1': team1,
                'team2': team2,
                'date': date
            })

# Create DataFrame
df = pd.DataFrame(rows)
print(df)

# Optionally, save the dataframe to a CSV file
df.to_csv('combined_data2.csv', index=False)


                file_path        team1                 team2        date
0       dt/csv1/65240.csv        Kenya          South Africa  2003-02-12
1       dt/csv1/66357.csv     Pakistan             Sri Lanka  2003-05-10
2       dt/csv1/65708.csv     Pakistan            Bangladesh  2004-07-17
3       dt/csv1/64825.csv    Australia           West Indies  2003-05-30
4       dt/csv1/65255.csv      England              Pakistan  2003-02-22
...                   ...          ...                   ...         ...
2384  dt/csv3/1322278.csv        India           New Zealand  2022-11-25
2385  dt/csv3/1146722.csv   Bangladesh           West Indies  2018-07-28
2386  dt/csv3/1020037.csv  New Zealand          South Africa  2017-03-01
2387  dt/csv3/1377757.csv     Scotland  United Arab Emirates  2023-06-23
2388  dt/csv3/1377752.csv      Ireland              Scotland  2023-06-21

[2389 rows x 4 columns]


Now for Random Forest Regressor

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

In [7]:
# Train a Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

In [10]:
import pickle
pickle.dump(rf, open("rf_model.pkl", "wb"))

In [12]:
y_pred = rf.predict(x_test)

In [13]:
def process_chunks(y_pred, y_test, chunk_size=22):
    length = (len(y_pred) // chunk_size) * chunk_size  # Exclude the remainder
    y_pred_chunks = [y_pred[i:i + chunk_size] for i in range(0, length, chunk_size)]
    y_test_chunks = [y_test[i:i + chunk_size] for i in range(0, length, chunk_size)]

    top_11_indices_pred = []
    sorted_y_test_chunks = []  
    sorted_y_pred_chunks = [] 
    for chunk in y_pred_chunks:
        sorted_indices = np.argsort(chunk)[::-1] 
        top_11 = sorted_indices[:11]
        top_11_indices_pred.append(top_11)
    for chunk, indices in zip(y_test_chunks, top_11_indices_pred):
        sorted_chunk = np.zeros_like(chunk)
        sorted_chunk[:len(indices)] = chunk[indices]  # Assign top 11 values
        remaining_indices = [i for i in range(chunk_size) if i not in indices]
        sorted_chunk[len(indices):] = chunk[remaining_indices]  # Assign rest
        sorted_y_pred_chunks.append(sorted_chunk)  

    top_11_indices_test = []
    for chunk in y_test_chunks:
        sorted_indices = np.argsort(chunk)[::-1]  # Sort actual values in descending order
        top_11 = sorted_indices[:11]
        top_11_indices_test.append(top_11)
        sorted_y_test_chunks.append(chunk[sorted_indices])  # Sort the y_test values correctly
    
    return sorted_y_test_chunks, sorted_y_pred_chunks, top_11_indices_pred

In [14]:
# 1. Ensure y_test and y_pred are 1D arrays
y_test = y_test.flatten()
y_pred = y_pred.flatten()

# 2. Use the same process_chunks function you already have
sorted_y_test_chunks, sorted_y_pred_chunks, top_11_indices_pred = process_chunks(y_pred, y_test)

mae_list = []
rmse_list = []
sum1_list = []

# 3. Compute chunk-wise MAE and RMSE (team-wise sums)
for i, (sorted_y_pred_chunk, sorted_y_test_chunk) in enumerate(zip(sorted_y_pred_chunks, sorted_y_test_chunks)):
    # Sum calculation with captain and vice-captain weightage
    sum1 = sorted_y_test_chunk[0] * 2 + sorted_y_test_chunk[1] * 1.5 + np.sum(sorted_y_test_chunk[2:11])
    sum2 = sorted_y_pred_chunk[0] * 2 + sorted_y_pred_chunk[1] * 1.5 + np.sum(sorted_y_pred_chunk[2:11])

    error = np.abs(sum1 - sum2)  # Absolute Error
    squared_error = (sum1 - sum2) ** 2  # Squared Error

    mae_list.append(error)
    rmse_list.append(squared_error)
    sum1_list.append(sum1)

    print(f"Chunk {i+1} Error: {error:.2f}, True Sum: {sum1:.2f}, Predicted Sum: {sum2:.2f}")

# 4. Final aggregated metrics
mae_final = np.mean(mae_list)
rmse_final = np.sqrt(np.mean(rmse_list))  # RMSE is sqrt(MSE)
avg_true_sum = np.mean(sum1_list)

print("Team-wise Mean Absolute Error (MAE):", round(mae_final, 2))
print("Team-wise Root Mean Squared Error (RMSE):", round(rmse_final, 2))
print("Average Dream Team Points in ODI (True Sum):", round(avg_true_sum, 2))


Chunk 1 Error: 133.00, True Sum: 686.00, Predicted Sum: 553.00
Chunk 2 Error: 895.00, True Sum: 1369.00, Predicted Sum: 474.00
Chunk 3 Error: 551.50, True Sum: 1300.00, Predicted Sum: 748.50
Chunk 4 Error: 237.50, True Sum: 984.50, Predicted Sum: 747.00
Chunk 5 Error: 222.00, True Sum: 946.00, Predicted Sum: 724.00
Chunk 6 Error: 410.00, True Sum: 1442.50, Predicted Sum: 1032.50
Chunk 7 Error: 485.00, True Sum: 818.50, Predicted Sum: 333.50
Chunk 8 Error: 550.00, True Sum: 1162.50, Predicted Sum: 612.50
Chunk 9 Error: 354.00, True Sum: 907.00, Predicted Sum: 553.00
Chunk 10 Error: 764.50, True Sum: 1323.00, Predicted Sum: 558.50
Chunk 11 Error: 255.00, True Sum: 887.00, Predicted Sum: 632.00
Chunk 12 Error: 546.50, True Sum: 1057.00, Predicted Sum: 510.50
Chunk 13 Error: 302.00, True Sum: 1210.00, Predicted Sum: 908.00
Chunk 14 Error: 734.50, True Sum: 1134.00, Predicted Sum: 399.50
Chunk 15 Error: 305.50, True Sum: 1225.50, Predicted Sum: 920.00
Chunk 16 Error: 652.50, True Sum: 1155.

In [15]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

RMSE: 44.81863941510613


Some Plotting

In [13]:
# 2. Load all_features list (your custom feature names)
with open("all_features.pkl", "rb") as f:
    all_features = pickle.load(f)

In [18]:
# Get feature importances
importances = rf.feature_importances_
indices = np.argsort(importances)[-50:]  # Top 50 features

plt.figure(figsize=(14, 10))
plt.title('Feature Importance for Fantasy Points Prediction')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [all_features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.tight_layout()
plt.savefig('images/feature_importance_rf.png')
plt.close()

# 5. Prediction vs Actual

plt.figure(figsize=(10, 10))
plt.scatter(y_test, y_pred, alpha=0.3)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Fantasy Points')
plt.ylabel('Predicted Fantasy Points')
plt.title('Actual vs Predicted Fantasy Points')
plt.text(0.05, 0.85, f'MAE = {mae_final:.2f}', transform=plt.gca().transAxes)  
plt.text(0.05, 0.90, f'RMSE = {rmse:.2f}', transform=plt.gca().transAxes)
plt.axis('equal')
plt.grid(True)
plt.savefig('images/prediction_vs_actual_rf.png')
plt.close()

# 6. Analysis of errors
residuals = y_test - y_pred

plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.3)
plt.axhline(y=0, color='r', linestyle='-')
plt.xlabel('Predicted Fantasy Points')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(True)
plt.savefig('images/residual_plot_rf.png')
plt.close()

# 7. Distribution of Residuals
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, bins=50)
plt.title('Distribution of Residuals')
plt.xlabel('Residual Value')
plt.ylabel('Frequency')
plt.savefig('images/residuals_distribution_rf.png')
plt.close()


print("Visualization complete! Check the PNG files for the plots.")
print("\nTop 20 features by importance:")
for i, idx in enumerate(indices[::-1]):
    print(f"{i+1}. {all_features[idx]} (Importance: {importances[idx]:.4f})")

Visualization complete! Check the PNG files for the plots.

Top 20 features by importance:
1. match_EWMA Fantasy Points (Importance: 0.0663)
2. weather_temperature (Importance: 0.0629)
3. weather_wind_speed (Importance: 0.0507)
4. match_Balls_Thrown (Importance: 0.0423)
5. match_Strike_Rate (Importance: 0.0393)
6. match_total_points (Importance: 0.0349)
7. match_Batting_Average (Importance: 0.0332)
8. match_Economy (Importance: 0.0321)
9. match_Balls_Faced (Importance: 0.0288)
10. weather_precipitation (Importance: 0.0215)
11. match_Runs_Given (Importance: 0.0210)
12. match_Runs (Importance: 0.0200)
13. match_Boundaries_Given (Importance: 0.0185)
14. match_Boundaries_Scored (Importance: 0.0184)
15. venue_fantasy_points (Importance: 0.0156)
16. venue_strike_rate (Importance: 0.0155)
17. match_Wickets (Importance: 0.0146)
18. venue_economy (Importance: 0.0139)
19. match_Number_of_Dismissals (Importance: 0.0127)
20. venue_batting_average (Importance: 0.0125)
21. match_matches_played (Impo

In [27]:
# 5. Prediction vs Actual

plt.figure(figsize=(10, 10))
plt.scatter(y_test, y_pred, alpha=0.3)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Fantasy Points')
plt.ylabel('Predicted Fantasy Points')
plt.title('Actual vs Predicted Fantasy Points')
plt.text(0.05, 0.85, f'MAE = {np.mean(np.array(mae_list)):.2f}', transform=plt.gca().transAxes)  
plt.text(0.05, 0.90, f'RMSE = {rmse:.2f}', transform=plt.gca().transAxes)
plt.axis('equal')
plt.grid(True)
plt.savefig('images/prediction_vs_actual_xgb.png')
plt.close()

# 6. Analysis of errors
residuals = y_test - y_pred

plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.3)
plt.axhline(y=0, color='r', linestyle='-')
plt.xlabel('Predicted Fantasy Points')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(True)
plt.savefig('images/residual_plot_xgb.png')
plt.close()

# 7. Distribution of Residuals
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, bins=50)
plt.title('Distribution of Residuals')
plt.xlabel('Residual Value')
plt.ylabel('Frequency')
plt.savefig('images/residuals_distribution_xgb.png')
plt.close()


print("Visualization complete! Check the PNG files for the plots.")

Visualization complete! Check the PNG files for the plots.
