In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,  accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib

# CSV that the notebook reads with pd.read_csv to build the modelling DataFrame.
CLEAN_FILE_PATH = "data/processed/cleaned_data.csv"
# Destination file the fitted Random Forest is written to with joblib.dump.
MODEL_PATH_RF = "models/random_forest_model.joblib"
 

In [None]:
# Load the cleaned dataset into `df`.
#
# Fail early, with a message naming the expected location, when the input CSV
# is absent. The guard must test the data file itself: the "models" output
# folder is created later by the training cell and says nothing about whether
# the input data is available.
if not os.path.exists(CLEAN_FILE_PATH):
    raise FileNotFoundError(f"Cleaned data not found at {CLEAN_FILE_PATH} ")

df = pd.read_csv(CLEAN_FILE_PATH)
print(f"Loaded dataset with {len(df)} rows and columns: {list(df.columns)}")

# Last expression of the cell: rich preview of the first rows.
df.head()

Loaded dataset with 2063 rows and columns: ['time', 'team', 'event', 'start_x', 'start_y', 'end_x', 'end_y', 'possession_duration', 'zone', 'dist_to_goal', 'possession_streak', 'pass_distance', 'team_encoded', 'event_encoded']


In [None]:
# Build the feature matrix / target vector and split them into train and test sets.
#
# Each derived column is only created when it is not already present, so values
# that were computed upstream and stored in the CSV are left untouched.

# Scaled end coordinates.
# NOTE(review): 50.0 and 25.0 are presumably pitch scaling factors for the x and
# y axes - confirm against the data-cleaning step.
if "x_norm" not in df.columns or "y_norm" not in df.columns:
    if "end_x" in df.columns and "end_y" in df.columns:
        df["x_norm"] = df["end_x"] / 50.0
        df["y_norm"] = df["end_y"] / 25.0

# Integer codes for the team label (pandas category codes).
if "team_encoded" not in df.columns and "team" in df.columns:
    df["team_encoded"] = df["team"].astype('category').cat.codes

# Integer codes for the event label. The guard has to look for the same column
# name that is written below; checking a different name would make the branch
# run every time and silently overwrite an existing "event_encoded" column.
if "event_encoded" not in df.columns and "event" in df.columns:
    df["event_encoded"] = df["event"].astype('category').cat.codes

features = ["x_norm", "y_norm", "team_encoded"]
target = "event_encoded"

# Surface a precise error when a required column could not be found or derived,
# instead of the less specific KeyError raised by the column selection below.
missing_columns = [col for col in features + [target] if col not in df.columns]
if missing_columns:
    raise KeyError(f"Required columns missing from dataset: {missing_columns}")

x = df[features]
y = df[target]

# 80/20 hold-out split; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [None]:
# Fit the Random Forest classifier, evaluate it on the held-out split, and persist it.

# fit() returns the fitted estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42,
).fit(x_train, y_train)

y_pred = rf_model.predict(x_test)

# Headline accuracy followed by the per-class precision / recall / F1 report.
print("Random Forest accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion matrix of the fitted model on the test split.
ConfusionMatrixDisplay.from_estimator(rf_model, x_test, y_test)
plt.show()

# Make sure the destination folder exists, then serialize the fitted model.
model_dir = os.path.dirname(MODEL_PATH_RF)
os.makedirs(model_dir, exist_ok=True)
joblib.dump(rf_model, MODEL_PATH_RF)

print(f"Random Forest model saved at {MODEL_PATH_RF}")


Random Forest accuracy:  0.6150121065375302
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00         9
           2       0.68      0.71      0.70       240
           3       0.31      0.32      0.32        68
           4       0.56      0.46      0.50        81

    accuracy                           0.62       413
   macro avg       0.71      0.70      0.70       413
weighted avg       0.61      0.62      0.61       413

Random Forest model saved at models/random_forest_model.joblib
