In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegressionCV

In [2]:
# Folder on the mounted Drive that holds the competition's parquet files.
DIR_PATH = "/content/drive/MyDrive/CANSSI Competition/"

# Attach Google Drive to this Colab runtime so DIR_PATH becomes readable.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the train and test splits from the Drive folder using the pyarrow engine.
train_path = DIR_PATH + 'train_datatrots_2013-2022 (2).parquet'
test_path = DIR_PATH + 'test_datatrots_2013-2022 (2).parquet'

train = pd.read_parquet(train_path, engine='pyarrow')
test = pd.read_parquet(test_path, engine='pyarrow')

In [4]:
# Race identifier of every row; used later as the grouping key for per-race normalization.
train_ID, test_ID = train['Race_ID'], test['Race_ID']

In [5]:
# Name of the target column, and the columns excluded from the feature matrix.
target_col = 'win probability'
non_feature_cols = ['Race_ID', 'RaceStartTime', target_col]

# Features are everything except the excluded columns; the target is taken as-is.
X_train, y_train = train.drop(columns=non_feature_cols), train[target_col]
X_test, y_test = test.drop(columns=non_feature_cols), test[target_col]

In [6]:
# normalize prediction result function
def normalize_predictions_by_id(predictions, ids):
    """Clip predictions to [0, 1] and rescale them to sum to 1 within each id group.

    Parameters
    ----------
    predictions : array-like of numbers, one value per row.
    ids : array-like of group keys, same length as ``predictions``.
        ``predictions[i]`` is paired with ``ids[i]`` by position; any pandas
        index carried by either argument is ignored.

    Returns
    -------
    numpy.ndarray
        Normalized values in the original row order. A floating input dtype is
        preserved; any other input dtype yields float64.

    Raises
    ------
    ValueError
        If ``predictions`` and ``ids`` differ in length.

    Notes
    -----
    Rows whose normalized value is undefined (the group's clipped values sum to
    zero, or the prediction itself is NaN) receive the uniform share
    ``1 / group_size`` rather than ``1 / len(predictions)``, so an all-zero
    group still sums to 1.
    """
    raw = np.asarray(predictions)
    # Strip any index so grouping pairs values and keys strictly by position.
    group_keys = pd.Series(np.asarray(ids))
    if len(raw) != len(group_keys):
        raise ValueError(
            f"predictions and ids must have the same length "
            f"(got {len(raw)} and {len(group_keys)})"
        )

    out_dtype = raw.dtype if np.issubdtype(raw.dtype, np.floating) else np.float64
    if len(raw) == 0:
        return np.empty(0, dtype=out_dtype)

    # Work in float64 for the arithmetic; cast back to the caller's dtype at the end.
    clipped = pd.Series(np.clip(raw.astype(np.float64), 0.0, 1.0))
    group_sums = clipped.groupby(group_keys).transform('sum')
    group_sizes = group_keys.map(group_keys.value_counts())

    # Mask zero sums so the division yields NaN instead of inf / 0-by-0 warnings.
    normalized = clipped / group_sums.where(group_sums > 0)
    normalized = normalized.fillna(1.0 / group_sizes)
    return normalized.to_numpy(dtype=out_dtype)

In [7]:
# Clip the targets to [0, 1] and renormalize them within each race, for both splits.
y_train_normalized, y_test_normalized = (
    normalize_predictions_by_id(target, race_ids)
    for target, race_ids in ((y_train, train_ID), (y_test, test_ID))
)

In [8]:
# XGBoost baseline: fit on the per-race normalized training target (fixed seed),
# then produce raw scores for the test features.
xgboost = XGBRegressor(random_state=42).fit(X_train, y_train_normalized)
xgb_predictions = xgboost.predict(X_test)

In [9]:
# Normalize the raw XGBoost scores within each race and evaluate on the test split.
# The scores are attached to y_test's index by position so they line up with
# test_ID (both come from `test`) when grouped by race.
xgb_predictions = pd.Series(np.asarray(xgb_predictions), index=y_test.index)
xgb_pred_norm = normalize_predictions_by_id(xgb_predictions, test_ID)
# scikit-learn metrics take (y_true, y_pred) in that order.
xgb_mse = mean_squared_error(y_test_normalized, xgb_pred_norm)
print(f'XGBoost MSE: {xgb_mse:.4f}')

XGBoost MSE: 0.0129


In [11]:
# Lasso Regression: regularization strength chosen by 5-fold cross-validation.
# NOTE(review): the recorded output below shows a warning fragment from the
# coordinate-descent solver — presumably a convergence warning; consider scaling
# the features or raising max_iter, and confirm.
lasso = LassoCV(cv=5).fit(X_train, y_train_normalized)
lasso_predictions = lasso.predict(X_test)
# Attach y_test's index by position so the scores line up with test_ID per race.
lasso_predictions = pd.Series(np.asarray(lasso_predictions), index=y_test.index)
lasso_pred_norm = normalize_predictions_by_id(lasso_predictions, test_ID)
# scikit-learn metrics take (y_true, y_pred) in that order.
lasso_mse = mean_squared_error(y_test_normalized, lasso_pred_norm)
print(f"Lasso MSE: {lasso_mse:.4f}")

Lasso MSE: 0.0139


  model = cd_fast.enet_coordinate_descent(


In [14]:
# Linear Regression: ordinary least squares on the per-race normalized target.
lm = LinearRegression().fit(X_train, y_train_normalized)
lm_predictions = lm.predict(X_test)
# Attach y_test's index by position so the scores line up with test_ID per race.
lm_predictions = pd.Series(np.asarray(lm_predictions), index=y_test.index)
lm_pred_norm = normalize_predictions_by_id(lm_predictions, test_ID)
# scikit-learn metrics take (y_true, y_pred) in that order.
lm_mse = mean_squared_error(y_test_normalized, lm_pred_norm)
print(f"Linear Regression MSE: {lm_mse:.4f}")

Linear Regression MSE: 0.0139


In [15]:
# Current best model is XGBoost: its recorded test MSE (0.0129) is lower than
# Lasso's and Linear Regression's (both 0.0139).
# Display its per-race normalized predictions for the test split.
xgb_pred_norm

array([0.13504088, 0.13498738, 0.09455411, ..., 0.        , 0.        ,
       0.00422223], dtype=float32)

Submission: forecasts for all observations (train and test combined)

In [16]:
# Stack train and test (in that order) into one data set with a fresh 0..n-1 index.
# pd.concat replaces Series.append, which is deprecated and removed in pandas 2.0.
X = pd.concat([X_train, X_test], ignore_index=True)
ID = pd.concat([train['Race_ID'], test['Race_ID']], ignore_index=True)
y = pd.concat([y_train, y_test], ignore_index=True)
# Renormalize the combined target within each race.
y_normalized = normalize_predictions_by_id(y, ID)

  ID = train['Race_ID'].append(test['Race_ID'], ignore_index=True)
  y = y_train.append(y_test, ignore_index=True)


In [18]:
# Score every row (train + test) with the XGBoost model and normalize per race.
# The model was fit on X_train only and X contains those same rows, so the MSE
# printed here mixes in-sample and out-of-sample rows and is not a held-out estimate.
xgb_predictions = xgboost.predict(X)
# Attach y's index by position so the scores line up with ID when grouped by race.
xgb_predictions = pd.Series(np.asarray(xgb_predictions), index=y.index)
xgb_pred_norm = normalize_predictions_by_id(xgb_predictions, ID)
# scikit-learn metrics take (y_true, y_pred) in that order.
xgb_mse = mean_squared_error(y_normalized, xgb_pred_norm)
print(f'XGBoost MSE: {xgb_mse:.4f}')

XGBoost MSE: 0.0121


In [19]:
# Display the per-race normalized XGBoost predictions for the combined train+test rows.
xgb_pred_norm

array([0.15317927, 0.10670125, 0.14966989, ..., 0.        , 0.        ,
       0.00422223], dtype=float32)

In [20]:
# Number of predictions: one per row of the combined train+test data.
xgb_pred_norm.shape

(1200412,)

In [23]:
# Write the normalized win probabilities as a single-column parquet file
# (relative path, so it is created in the current working directory).
submission = pd.DataFrame({"Win Probability": xgb_pred_norm})
submission.to_parquet('win_probability_variable.parquet')