Import Data

In [None]:
import pandas as pd
import numpy as np
import streamlit as st
import altair as alt
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Reuse the Snowpark session that is already active in this environment;
# no connection parameters are configured anywhere in this notebook.
from snowflake.snowpark.context import get_active_session
session = get_active_session()

In [None]:
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import col

# Select the database and schema so the unqualified table name used by
# session.table() further down resolves against them.
session.sql("USE DATABASE EVENT").collect()
session.sql("USE SCHEMA DATATHON_2025_TEAM_KAPPA").collect()


In [None]:
 # Reference the VW_ALLRACESTATS object through the session; it is previewed and converted to pandas in the following cells
race_df = session.table("VW_ALLRACESTATS")

In [None]:
# Print a labelled preview of the race table
print("Race Table:")
race_df.show()

In [None]:
# Convert the Snowpark table to a pandas DataFrame; all later cells work on this local frame
race_pdf=race_df.to_pandas()

In [None]:
# 2. CLEAN & FORMAT DATA
# Work on a fresh frame: parse RACE_DATE as datetimes, then order rows chronologically
race_pdf = (
    race_pdf
    .assign(RACE_DATE=lambda frame: pd.to_datetime(frame["RACE_DATE"]))
    .sort_values("RACE_DATE")
)

In [None]:
# 3. CREATE TARGET COLUMN (McLaren in top 3)
# 1 when the row is a McLaren entry whose finishing order is 1, 2 or 3; otherwise 0
is_mclaren_entry = race_pdf["CONSTRUCTOR_NAME"].eq("McLaren")
finished_top3 = race_pdf["RACE_POSITION_ORDER"].between(1, 3)
race_pdf["MCLAREN_TOP3"] = (is_mclaren_entry & finished_top3).astype(int)

In [None]:
# Optional: check how many rows fall into each target class
top3_counts = race_pdf["MCLAREN_TOP3"].value_counts()
print("Target class distribution:")
print(top3_counts)

In [None]:
# List the distinct race dates present in the data
race_pdf['RACE_DATE'].unique()

In [None]:
# Summary statistics for the columns that describe() covers by default
race_pdf.describe()

In [None]:
# Number of missing values in each column
race_pdf.isnull().sum()

In [None]:
# Discard rows that lack any of the fields the target and filters rely on
required_columns = ["RACE_POSITION_ORDER", "GRID_POSITION", "DRIVER_NAME", "CONSTRUCTOR_NAME"]
race_pdf = race_pdf.dropna(subset=required_columns)

In [None]:
# (rows, columns) remaining after the null filter
race_pdf.shape

In [None]:
# Keep only McLaren entries; rows for other constructors are not relevant to the target.
# An explicit copy is taken so later column assignments act on an independent frame.
is_mclaren_row = race_pdf["CONSTRUCTOR_NAME"].eq("McLaren")
df_mclaren = race_pdf.loc[is_mclaren_row].copy()
df_mclaren.head()

Feature Engineering

In [None]:
# Position gain/loss: grid slot minus finishing order (positive = places gained)
df_mclaren["POSITION_DELTA"] = df_mclaren["GRID_POSITION"] - df_mclaren["RACE_POSITION_ORDER"]

# McLaren flag
df_mclaren["IS_MCLAREN"] = (df_mclaren["CONSTRUCTOR_NAME"] == "McLaren").astype(int)

# McLaren top 3 finish target
df_mclaren["MCLAREN_TOP3"] = ((df_mclaren["IS_MCLAREN"] == 1) & (df_mclaren["RACE_POSITION_ORDER"] <= 3)).astype(int)

# Order rows by driver, then chronologically, for the per-driver rolling feature
df_mclaren = df_mclaren.sort_values(["DRIVERID", "RACE_DATE"])

# Rolling average of each driver's previous 3 results.
# shift(1) excludes the current race, so the feature only looks backwards.
df_mclaren["DRIVER_AVG_POSITION"] = df_mclaren.groupby("DRIVERID")["RACE_POSITION_ORDER"].transform(lambda x: x.shift(1).rolling(3).mean())

# Rolling average for constructor performance.
# The frame above is ordered by driver first, so the team window must be computed on a
# date-ordered view; otherwise "previous 3 results" mixes unrelated seasons.
# The result is aligned back to df_mclaren by row index (assumes index labels are unique).
# NOTE(review): both team cars share a race date, so the window may still include the
# team-mate's result from the same race — aggregate per race first if that matters.
team_chronological = df_mclaren.sort_values("RACE_DATE", kind="stable")
df_mclaren["TEAM_AVG_POSITION"] = (
    team_chronological
    .groupby("CONSTRUCTOR_NAME")["RACE_POSITION_ORDER"]
    .transform(lambda x: x.shift(1).rolling(3).mean())
)

In [None]:
# Create binary target: 1 if McLaren won, else 0.
# Compare the column values themselves: comparing the `.str` accessor object to a
# string evaluates to a single False, which would force every row of the target to 0.
df_mclaren["MCLAREN_WIN"] = (
    (df_mclaren["RACE_POSITION_ORDER"] == 1)
    & (df_mclaren["CONSTRUCTOR_NAME"] == "McLaren")
).astype(int)

In [None]:
# Inspect the first rows, including the engineered columns added above
df_mclaren.head()

In [None]:
# Columns examined in the exploratory plots. key_features stays a DataFrame slice;
# iterating over it yields the column names.
key_feature_names = [
    "GRID_POSITION",
    "QUALI_POSITION",
    "POINTS",
    "FASTEST_LAP_RANK",
    "FASTESTLAPSPEED",
    "DRIVER_POINTS_STANDINGS",
    "DRIVER_POSITION_STANDINGS",
    "CONSTRUCTOR_POINTS_STANDINGS",
    "CONSTRUCTOR_POSITION_STANDINGS",
    "WINS",
]
key_features = df_mclaren[key_feature_names]

In [None]:
#Compare features vs. target (McLaren Win)
import seaborn as sns
import matplotlib.pyplot as plt

# One violin plot per feature, split by the McLaren win flag
for col_name in key_features:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.violinplot(x="MCLAREN_WIN", y=col_name, data=df_mclaren, ax=ax)
    ax.set_title(f"{col_name} Distribution vs. McLaren Win")
    ax.set_xticks([0, 1])
    ax.set_xticklabels(["No", "Yes"])
    fig.tight_layout()
    plt.show()

In [None]:
# Pairwise correlations between the selected race columns, drawn as an annotated heatmap
numeric_cols = [
    "GRID_POSITION",
    "FASTESTLAP",
    "FASTEST_LAP_RANK",
    "FASTESTLAPSPEED",
    "LAPS_COMPLETED",
    "POINTS",
    "RACE_POSITION_ORDER",
]
correlations = df_mclaren[numeric_cols].corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Matrix")
fig.tight_layout()
plt.show()

Train/Test Split

In [None]:
# Model inputs (X) and the top-3 target (y)
# NOTE(review): POINTS and the fastest-lap columns look like outcomes of the same race
# that the target describes — confirm they are known before the race being predicted.
feature_columns = [
    "GRID_POSITION",
    "QUALI_POSITION",
    "POINTS",
    "FASTEST_LAP_RANK",
    "FASTESTLAPSPEED",
    "DRIVER_POINTS_STANDINGS",
    "DRIVER_POSITION_STANDINGS",
    "CONSTRUCTOR_POINTS_STANDINGS",
    "CONSTRUCTOR_POSITION_STANDINGS",
    "WINS",
]
X = df_mclaren[feature_columns]
y = df_mclaren["MCLAREN_TOP3"]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (zero mean, unit variance) over the full matrix
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Split into train and test.
# The raw features are split first and the scaler is then fitted on the training rows
# only, so test-set statistics cannot leak into preprocessing. With the same row count
# and random_state, the rows assigned to each side are the same as when splitting the
# pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Rebind `scaler` to the train-only fit; later cells that call scaler.transform(...)
# then use exactly the preprocessing the model was trained with.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest.
# Each max_depth value is listed once: a repeated value only re-runs identical fits.
# NOTE(review): the original listed 10 twice — if a second distinct depth (e.g. 5) was
# intended, add it here.
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, None],
    "min_samples_split": [2, 5]
}

# 3-fold cross-validated search over every combination in the grid
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3)
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)

In [None]:
# Fit the random forest with a fixed set of hyperparameters
rf_settings = {
    "n_estimators": 100,
    "random_state": 42,
    "max_depth": 10,
    "min_samples_split": 2,
}
rf = RandomForestClassifier(**rf_settings)
rf.fit(X_train, y_train)

In [None]:
# EVALUATE
# Score the forest on the held-out rows: per-class report plus overall accuracy
y_pred = rf.predict(X_test)
rf_report = classification_report(y_test, y_pred)
rf_accuracy = accuracy_score(y_test, y_pred)
print("\nModel Performance:")
print(rf_report)
print("Accuracy:", rf_accuracy)

In [None]:
# FEATURES
# Column names in the same order as the model input X; later cells use this list to
# label feature importances and tree nodes and to select prediction inputs.
features = [
    "GRID_POSITION", "QUALI_POSITION", "POINTS", "FASTEST_LAP_RANK",
    "FASTESTLAPSPEED", "DRIVER_POINTS_STANDINGS", "DRIVER_POSITION_STANDINGS",
    "CONSTRUCTOR_POINTS_STANDINGS", "CONSTRUCTOR_POSITION_STANDINGS", "WINS"
]

In [None]:
# FEATURE IMPORTANCE
import matplotlib.pyplot as plt
import seaborn as sns

# Pair each importance score from the fitted forest with its feature name and chart them
importances = pd.Series(rf.feature_importances_, index=features)
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=importances.values, y=importances.index, ax=ax)
ax.set_title("Feature Importance")
fig.tight_layout()
plt.show()

In [None]:
# McLaren top-3 prediction for the next race (the model's target is MCLAREN_TOP3, not a win).
# Placeholder input: the chronologically latest McLaren row stands in for the upcoming
# race — replace it with real pre-race values. df_mclaren is ordered by driver and then
# date, so sort by date before taking the last row.
next_race_input = df_mclaren.sort_values("RACE_DATE")[features].iloc[-1:].copy()

# The forest was trained on standardised features, so the same scaler must be applied
# before predicting; feeding raw values would put them on a different scale.
next_race_scaled = scaler.transform(next_race_input)

prediction = rf.predict(next_race_scaled)
print("\n Prediction for next race: McLaren Top 3?", "YES" if prediction[0] == 1 else "NO")

In [None]:
#Explaining the model prediction
import shap

# Explain the forest's test-set predictions with SHAP values
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

# Select the contributions towards class 1 ("top 3").
# Depending on the shap version, a classifier yields either a list with one array per
# class or a single array with the classes on the last axis; indexing [1] on the
# latter would pick a sample rather than a class.
if isinstance(shap_values, list):
    top3_shap_values = shap_values[1]
else:
    shap_array = np.asarray(shap_values)
    top3_shap_values = shap_array[:, :, 1] if shap_array.ndim == 3 else shap_array

# X_test is a plain array after scaling, so pass the column names explicitly
shap.summary_plot(top3_shap_values, X_test, feature_names=features)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Fit a single decision tree capped at depth 10, with class weights balanced
tree_settings = {"max_depth": 10, "class_weight": "balanced", "random_state": 42}
dt_clf = DecisionTreeClassifier(**tree_settings)
dt_clf.fit(X_train, y_train)

In [None]:
# Draw the fitted tree with feature and class labels on its nodes
fig, ax = plt.subplots(figsize=(10, 10))
plot_tree(
    dt_clf,
    feature_names=features,
    class_names=["No Top 3", "Top 3"],
    filled=True,
    rounded=True,
    fontsize=10,
    ax=ax,
)
ax.set_title("Decision Tree for McLaren Top 3 Finish Prediction")
plt.show()

In [None]:
# Accuracy of the decision tree on the held-out rows
dt_predictions = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, dt_predictions)
print("Decision Tree Accuracy:", accuracy)

Predicting whether McLaren will finish in the top 3 in a 2025 race

X = df_mclaren[["GRID_POSITION", "QUALI_POSITION", "POINTS", "FASTEST_LAP_RANK",
    "FASTESTLAPSPEED", "DRIVER_POINTS_STANDINGS", "DRIVER_POSITION_STANDINGS",
    "CONSTRUCTOR_POINTS_STANDINGS", "CONSTRUCTOR_POSITION_STANDINGS", "WINS"
]]

In [None]:
# Example new race data: one hand-entered row, keyed by the model's feature names
# NOTE(review): several values look like placeholders (e.g. standings positions of
# 120 and 190) — replace with realistic pre-race figures before relying on the output.
new_race_values = {
    "GRID_POSITION": 0.945,
    "QUALI_POSITION": 8,
    "POINTS": 80,
    "FASTEST_LAP_RANK": 50,
    "FASTESTLAPSPEED": 130,
    "DRIVER_POINTS_STANDINGS": 1.065,
    "DRIVER_POSITION_STANDINGS": 120,
    "CONSTRUCTOR_POINTS_STANDINGS": 120,
    "CONSTRUCTOR_POSITION_STANDINGS": 190,
    "WINS": 40,
}
new_race = pd.DataFrame([new_race_values])

# Apply the fitted scaler, then ask the forest for its class prediction
new_race_scaled = scaler.transform(new_race)
prediction = rf.predict(new_race_scaled)
print("McLaren expected to finish in Top 3:", bool(prediction[0]))

PREVIOUS CODE

In [None]:
#All data from 2013 and previous already deleted using sql


# # Filter out rows where 'YEAR' is less than or equal to 2013
# new_df = race_df.filter(race_df["YEAR"] > 2013)

# # Show the resulting DataFrame
# new_df.show()

In [None]:
# Check for nulls: number of missing values in each column
race_pdf.isnull().sum()

In [None]:
# Subset of race_pdf containing every row with at least one missing value
has_missing_value = race_pdf.isna().any(axis=1)
race_pdf_missing_value = race_pdf[has_missing_value]

# Display all of those rows for inspection
race_pdf_missing_value.head(len(race_pdf_missing_value))

In [None]:
# Re-index the race data by constructor name; the exploratory cells below group and
# filter on this frame.
constructor_pdf= race_pdf.set_index('CONSTRUCTOR_NAME')
constructor_pdf.head()

In [None]:
# Show rows whose column values repeat an earlier row in full
# (duplicated() compares column values only, not the index)
duplicate_rows = constructor_pdf.duplicated()
constructor_pdf[duplicate_rows]

In [None]:
# Total laps completed per driver, shown as a bar chart
st.write("Number of laps by Driver Name")
rows_by_driver = constructor_pdf.groupby("DRIVER_NAME")
laps_by_winner = rows_by_driver["LAPS_COMPLETED"].sum()
st.bar_chart(laps_by_winner)

In [None]:
# Sum of the WINS column per constructor, shown as a bar chart
st.write("Number of wins by Constructor Name")
rows_by_constructor = constructor_pdf.groupby("CONSTRUCTOR_NAME")
wins_by_constructor = rows_by_constructor["WINS"].sum()
st.bar_chart(wins_by_constructor)

In [None]:
# Relationship between laps completed and wins: sum of WINS for each lap count
st.write("Number of laps by wins")
rows_by_lap_count = constructor_pdf.groupby("LAPS_COMPLETED")
laps_by_winner = rows_by_lap_count["WINS"].sum()
st.bar_chart(laps_by_winner)

In [None]:
# Keep the podium finishers of each race: finishing order 1 through 3.
# (Despite the name, winners_df is not restricted to the race winner.)
# An explicit copy is taken because later cells add columns to this frame; without it
# those assignments target a slice of constructor_pdf.
winners_df = constructor_pdf[
    (constructor_pdf["RACE_POSITION_ORDER"] >= 1)
    & (constructor_pdf["RACE_POSITION_ORDER"] <= 3)
].copy()
print(winners_df.head())

In [None]:
# Rich display of the first podium rows
winners_df.head()

In [None]:
# winners_pdf = winners_df.to_pandas()
# winners_pdf= winners_pdf.set_index('l_0006_CONSTRUCTOR_NAME')
# winners_pdf.head(20)

In [None]:
# Same laps-versus-wins chart, restricted to the podium rows
st.write("Number of laps by wins")
podium_by_lap_count = winners_df.groupby("LAPS_COMPLETED")
laps_by_winner = podium_by_lap_count["WINS"].sum()
st.bar_chart(laps_by_winner)

In [None]:
# First rows of the podium frame
winners_df.head()

In [None]:
# Column names available on the podium frame
winners_df.columns

In [None]:
# Sum of the WINS column for each starting grid position, over the podium rows
st.write("Number of wins by grid position")
podium_by_grid = winners_df.groupby("GRID_POSITION")
win_by_gridposition = podium_by_grid["WINS"].sum()
st.bar_chart(win_by_gridposition)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from snowflake.snowpark.functions import col

In [None]:
# Column names available on the podium frame
winners_df.columns

In [None]:
# #renaming columns for clarity
# winners_pdf.rename(columns={
#     'r_0007_CONSTRUCTOR_NAME':'CONSTRUCTOR_NAME',
#     # add others if needed
# }, inplace=True)

In [None]:
# winners_pdf.head()

In [None]:
# Print columns to debug
print(winners_df.columns.tolist())

# Binary target: 1 where the row finished first and CONSTRUCTORID is 1, else 0
# (presumably id 1 is McLaren in this dataset — verify against the constructor table)
finished_first = winners_df["RACE_POSITION_ORDER"].eq(1)
constructor_is_1 = winners_df["CONSTRUCTORID"].eq(1)
winners_df = winners_df.assign(MCLAREN_WIN=(finished_first & constructor_is_1).astype(int))

In [None]:
# Inspect the win flag column
winners_df["MCLAREN_WIN"]

In [None]:
# Narrow the podium frame to the constructor id and the McLaren win flag
win_flag_columns = ['CONSTRUCTORID', 'MCLAREN_WIN']
mclaren_df = winners_df[win_flag_columns]
mclaren_df.head()

In [None]:
# McLaren win total per constructor id (kept for inspection; the plot below does not use it)
mclaren_wins = mclaren_df.groupby(['CONSTRUCTORID'], as_index=False)['MCLAREN_WIN'].sum()

plt.figure(figsize=(12, 8))
# Name the column with the `x=` keyword: seaborn's plotting functions take their data
# variables as keywords, and a positional column name is rejected on current releases.
plot_mg = sns.countplot(x='MCLAREN_WIN', data=mclaren_df)

plt.title("Number of wins by McLaren")
plt.ylabel("Win Count")
plt.show()

In [None]:
# Column names on the podium frame after the win flag was added
winners_df.columns

In [None]:
# Histogram (with a KDE overlay) for each key feature of the podium rows
key_features = ["GRID_POSITION", "FASTESTLAP", "FASTEST_LAP_RANK", "FASTESTLAPSPEED"]

for col_name in key_features:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.histplot(winners_df[col_name], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {col_name}")
    fig.tight_layout()
    plt.show()

In [None]:
# Count of podium rows with and without the McLaren win flag
fig, ax = plt.subplots(figsize=(5, 3))
sns.countplot(x="MCLAREN_WIN", data=winners_df, ax=ax)
ax.set_title("McLaren Wins vs Non-Wins")
ax.set_xticks([0, 1])
ax.set_xticklabels(["No Win", "Win"])
fig.tight_layout()
plt.show()

In [None]:
#Compare features vs. target (McLaren Win)
import seaborn as sns
import matplotlib.pyplot as plt

# One violin plot per key feature, split by the McLaren win flag, over the podium rows
for col_name in key_features:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.violinplot(x="MCLAREN_WIN", y=col_name, data=winners_df, ax=ax)
    ax.set_title(f"{col_name} Distribution vs. McLaren Win")
    ax.set_xticks([0, 1])
    ax.set_xticklabels(["No", "Yes"])
    fig.tight_layout()
    plt.show()

In [None]:
# Compare the mean of each key feature for McLaren win vs non-win rows.
# Columns are relabelled by class value rather than by position, so the labels stay
# correct — and the cell does not raise a length mismatch — if only one class is present.
group_stats = winners_df.groupby("MCLAREN_WIN")[key_features].mean().T
group_stats = (
    group_stats
    .rename(columns={0: "No Win", 1: "Win"})
    .rename_axis(columns=None)  # drop the "MCLAREN_WIN" header, as a plain relabel would
)
print("Feature averages by McLaren win:\n", group_stats)

In [None]:
# Which constructors win most? Count first-place rows per constructor id,
# with bars ordered from most to fewest wins
winners = winners_df[winners_df["RACE_POSITION_ORDER"] == 1]
constructors_by_wins = winners["CONSTRUCTORID"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(y="CONSTRUCTORID", data=winners, order=constructors_by_wins, ax=ax)
ax.set_title("Constructor Win Frequency")
fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/distribution grid for three features, coloured by the McLaren win flag
pairplot_columns = ["MCLAREN_WIN", "GRID_POSITION", "POINTS", "FASTEST_LAP_RANK"]
sns.pairplot(winners_df[pairplot_columns], hue="MCLAREN_WIN")
plt.show()

In [None]:
#Finding all unique values in Year column
# winners_df['YEAR'].unique()

In [None]:
# First two rows of the podium frame
winners_df.head(2)

In [None]:
# Rows where the McLaren win flag is set, printed with their race dates
has_mclaren_win = winners_df["MCLAREN_WIN"].ne(0)
mclaren_won_year = winners_df.loc[has_mclaren_win]

print(mclaren_won_year[["RACE_DATE", "MCLAREN_WIN"]])

In [None]:
# Binary Mercedes target for comparison with McLaren: 1 where the row finished first
# and CONSTRUCTORID is 131, else 0
# (presumably id 131 is Mercedes in this dataset — verify against the constructor table)
finished_first = winners_df["RACE_POSITION_ORDER"].eq(1)
constructor_is_131 = winners_df["CONSTRUCTORID"].eq(131)
winners_df = winners_df.assign(MERCEDES_WIN=(finished_first & constructor_is_131).astype(int))
# Print columns to debug
print(winners_df.columns.tolist())

In [None]:
# Find every row where the Mercedes win flag is set and print it with its race date
has_mercedes_win = winners_df["MERCEDES_WIN"].ne(0)
mercedes_won_year = winners_df.loc[has_mercedes_win]

print(mercedes_won_year[["RACE_DATE", "MERCEDES_WIN"]])

In [None]:
# Mercedes wins per starting grid position, shown as a bar chart
st.write("Mercedes: Number of wins by grid position")
podium_by_grid = winners_df.groupby("GRID_POSITION")
mercedes_win_by_gridposition = podium_by_grid["MERCEDES_WIN"].sum()
st.bar_chart(mercedes_win_by_gridposition)

In [None]:
# Feature matrix and target for the earlier McLaren-win experiment
# NOTE(review): RACE_POSITION_ORDER is also used to build MCLAREN_WIN, so keeping it as
# an input leaks the target — drop it before training anything on this matrix.
legacy_feature_columns = [
    "GRID_POSITION",
    "FASTESTLAP",
    "FASTEST_LAP_RANK",
    "FASTESTLAPSPEED",
    "LAPS_COMPLETED",
    "POINTS",
    "RACE_POSITION_ORDER",
]
X = winners_df[legacy_feature_columns]
y = winners_df["MCLAREN_WIN"]

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Standardise the feature matrix (optional; useful for EDA and the random forest)
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Split into training and testing
# X_train, X_test, y_train, y_test = train_test_split(
#     X_scaled, y, test_size=0.2, random_state=42
# )