# College Football Matchup Model

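This notebook trains a random-forest regression model that predicts the home and away points of college football matchups. It loads the weekly game CSV files (`cfbd_{year}_{week}_games.csv`), fits and evaluates the model, and saves it to `college_football_model.pkl`.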

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle

In [2]:
# Weeks and season to load; each file is read from
# cfbd_{year}_{week}_games.csv relative to the working directory.
weeks = [1, 2, 3, 4]
year = 2025

# Read every weekly file first and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all rows
# accumulated so far on each iteration, and concatenating onto an empty
# DataFrame is deprecated in recent pandas releases.
weekly_frames = []
for week in weeks:
    week_df = pd.read_csv(f'cfbd_{year}_{week}_games.csv')
    weekly_frames.append(week_df)

# ignore_index=True renumbers the rows 0..n-1 across all weeks.
# Note: pd.concat raises ValueError if `weeks` is empty.
merged_df = pd.concat(weekly_frames, ignore_index=True)

merged_df.head()

Unnamed: 0,season,week,homeId,awayId,homePoints,awayPoints,home_year_x,home_elo,home_fpi,home_rating,...,away_epaAllowed,away_successRate,away_successRateAllowed,away_explosiveness,away_explosivenessAllowed,away_year,away_rank,away_points,homeTeam,awayTeam
0,2025,1,2306,66,21,24,2025,1664,4.861,-12.1,...,0.133378,0.416411,0.39983,0.970938,0.927144,2025,54,197.45,,
1,2025,1,2305,278,31,7,2025,1633,11.232,13.9,...,0.205689,0.426509,0.428021,0.997792,0.939405,2025,103,142.63,,
2,2025,1,98,2534,41,24,2025,1445,-9.183,-0.7,...,0.234115,0.370148,0.460192,1.013398,0.947381,2025,122,108.47,,
3,2025,1,62,24,23,20,2025,1178,-10.623,-10.1,...,0.157432,0.393485,0.434907,0.967133,0.915603,2025,56,196.58,,
4,2025,1,58,68,34,7,2025,1554,3.927,2.5,...,0.220183,0.439745,0.403321,0.974668,1.045889,2025,62,189.06,,


In [3]:
# Show every column name in the combined frame so that feature and target
# columns can be selected by name.
merged_df.columns

Index(['season', 'week', 'homeId', 'awayId', 'homePoints', 'awayPoints',
       'home_year_x', 'home_elo', 'home_fpi', 'home_rating', 'home_offense',
       'home_defense', 'home_year_y', 'home_epa', 'home_epaAllowed',
       'home_successRate', 'home_successRateAllowed', 'home_explosiveness',
       'home_explosivenessAllowed', 'home_year', 'home_rank', 'home_points',
       'away_year_x', 'away_elo', 'away_fpi', 'away_rating', 'away_offense',
       'away_defense', 'away_year_y', 'away_epa', 'away_epaAllowed',
       'away_successRate', 'away_successRateAllowed', 'away_explosiveness',
       'away_explosivenessAllowed', 'away_year', 'away_rank', 'away_points',
       'homeTeam', 'awayTeam'],
      dtype='object')

In [4]:
# Columns describing the game itself rather than one team.
game_columns = ['season', 'week', 'homeId', 'awayId']

# Per-team stat suffixes; every one exists once with a 'home_' prefix and
# once with an 'away_' prefix, in this order.
team_stat_suffixes = [
    'year_x', 'elo', 'fpi', 'rating', 'offense', 'defense',
    'year_y', 'epa', 'epaAllowed', 'successRate', 'successRateAllowed',
    'explosiveness', 'explosivenessAllowed', 'year', 'rank', 'points',
]

# Game columns first, then all home stats, then all away stats.
features = game_columns + [
    f'{side}_{suffix}'
    for side in ('home', 'away')
    for suffix in team_stat_suffixes
]

# Two regression targets: the final score of each side.
targets = ['homePoints', 'awayPoints']

X = merged_df[features]
y = merged_df[targets]

In [5]:
# Hold out a TEST_PROP fraction of the games for evaluation; the fixed seed
# makes the split repeatable between runs.

TEST_PROP = 0.5
RANDOM_SEED = 0
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_PROP,
    random_state=RANDOM_SEED,
)

In [6]:
# Sanity check: confirm the training split is still a pandas DataFrame.
type(X_train)

pandas.core.frame.DataFrame

In [7]:
# A random forest draws bootstrap samples at random, so without a fixed
# random_state every run fits a different model and reports different metrics.
# Reuse RANDOM_SEED (already used for the train/test split) to make the fit
# reproducible.
model = RandomForestRegressor(random_state=RANDOM_SEED)

# y_train has two columns (homePoints, awayPoints), so the forest is fitted as
# a multi-output regressor. fit() returns the estimator, which is displayed.
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:
# Evaluation: score the fitted model on the data it was trained on and on the
# held-out split. A large gap between the two indicates overfitting.

# Predictions for both splits.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Mean squared error and R^2 for each split.
train_mse, train_r2 = (
    mean_squared_error(y_train, y_pred_train),
    r2_score(y_train, y_pred_train),
)
test_mse, test_r2 = (
    mean_squared_error(y_test, y_pred_test),
    r2_score(y_test, y_pred_test),
)

print(f"Train MSE: {train_mse:.2f}, R2: {train_r2:.2f}")
print(f"Test MSE: {test_mse:.2f}, R2: {test_r2:.2f}")

Train MSE: 12.01, R2: 0.93
Test MSE: 102.89, R2: 0.50


In [9]:
# Persist the fitted model to disk with pickle so it can be reloaded later.
# NOTE: pickle files should only ever be loaded from trusted sources.

filename = 'college_football_model.pkl'
with open(filename, mode='wb') as model_file:
    pickle.dump(model, model_file)