# Setup

In [6]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Cleaning


In [7]:
csv_file_path = 'clean_merged_playerdata_with_weather.csv'

df = pd.read_csv(csv_file_path)

# drop columns we definitely don't need
df = df.drop(["bet_type", "tie_rule", "open_time", "close_time",
              "p1_outcome_text", "p2_outcome_text", "p3_outcome_text",
              "book", "event_completed", "event_name", "odds",

             'p1_player_name', 'p2_player_name', 'p3_player_name',

             'dg_id_p1', 'fin_text_p1', 'fin_text_p2', 'fin_text_p3',
             'course_name_p1', 'teetime_p2', 'teetime_p3', 'wx_teetime',
             'wx_datetime_hour',
             'wx_date_from_close', 'wx_conditions', 'wx_icon', 'wx_datetimeEpoch',
             'tour_p1', 'season'], axis=1)

# rename columns we'd like to keep
df = df.rename(columns={'teetime_p1':'teetime'})

df = df.drop(["teetime"], axis=1)


# preciptype can either only be nan or 'rain'
df['wx_preciptype'] = df['wx_preciptype'].fillna(0)
df['wx_preciptype'] = df['wx_preciptype'].apply(lambda x: 1 if x != 0 else x)



# Create one outcome column
df['outcome'] = (
    df[['p1_outcome', 'p2_outcome', 'p3_outcome']]
    .fillna(0) # turn all na's or NaNs to 0
    .idxmax(axis=1)
    .str.extract(r'p(\d+)_outcome') # pull out 1, 2, 3
    .astype(float) # convert to floats
)
# Then remove the other outcome column
df = df.drop(['p1_outcome', 'p2_outcome', 'p3_outcome'], axis=1)

X = df.select_dtypes(include=["number"]).drop(columns=["outcome"])
y = df["outcome"]

# drop any rows where the target is NaN
mask = y.notna()
X = X[mask]
y = y[mask]

# Trainâ€“test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=0,
    stratify=y
)

# impute NaNs in features with the mean of each column
imputer = SimpleImputer(strategy='mean')
X_train_imp = imputer.fit_transform(X_train)
X_test_imp  = imputer.transform(X_test)

#scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imp)
X_test_scaled  = scaler.transform(X_test_imp)

  df = pd.read_csv(csv_file_path)


# Logistic Regression

In [8]:
log_reg = LogisticRegression(
    max_iter=1000,
    multi_class="multinomial",
    solver="lbfgs"
)

log_reg.fit(X_train_scaled, y_train)

acc = log_reg.score(X_test_scaled, y_test)
print("\nLogistic regression accuracy:", acc)

y_pred = log_reg.predict(X_test_scaled)

print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification report:")
print(classification_report(y_test, y_pred))




Logistic Regression Accuracy: 0.8897490905699095

Confusion Matrix:
[[4764  123  119]
 [ 478 3957  108]
 [ 226  128  818]]

Classification Report:
              precision    recall  f1-score   support

         1.0       0.87      0.95      0.91      5006
         2.0       0.94      0.87      0.90      4543
         3.0       0.78      0.70      0.74      1172

    accuracy                           0.89     10721
   macro avg       0.86      0.84      0.85     10721
weighted avg       0.89      0.89      0.89     10721

