In [1]:
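# One-time setup, kept commented out so a normal run does not repeat it;
# uncomment and run manually if archive.zip has not been extracted yet.
# !unzip archive.zip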

In [2]:
import pandas as pd
import preprocessing
import model

In [3]:
# Input CSV locations, relative to the notebook's working directory.
ginf_file_path = './data/ginf.csv'      # game information data
events_file_path = './data/events.csv'  # events data

In [4]:
# Read the events and game-information CSVs and combine them into one merged
# table (the helper logs each load and the merge step as it goes).
df = preprocessing.load_and_merge_data(events_file_path, ginf_file_path)

2024-04-18 20:31:55,683:INFO:Starting to load datasets.
2024-04-18 20:31:59,842:INFO:Loaded events data from ./data/events.csv.
2024-04-18 20:31:59,876:INFO:Loaded game information data from ./data/ginf.csv.
2024-04-18 20:32:00,361:INFO:Merged events and game information datasets.


In [5]:
# Peek at the first three merged rows to sanity-check the join: per-event
# columns and per-game columns (e.g. fthg, ftag, odd_*) should sit side by side.
df.head(3)

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,at,fthg,ftag,odd_h,odd_d,odd_a,odd_over,odd_under,odd_bts,odd_bts_n
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,1,12.0,2,Hamburg SV,Borussia Dortmund,...,Hamburg SV,3,1,1.56,4.41,7.42,,,,
1,UFot0hit/,UFot0hit2,2,4,"Corner, Borussia Dortmund. Conceded by Dennis...",2,,1,Borussia Dortmund,Hamburg SV,...,Hamburg SV,3,1,1.56,4.41,7.42,,,,
2,UFot0hit/,UFot0hit3,3,4,"Corner, Borussia Dortmund. Conceded by Heiko ...",2,,1,Borussia Dortmund,Hamburg SV,...,Hamburg SV,3,1,1.56,4.41,7.42,,,,


In [6]:
# Build the train/test features and targets from the merged data. Per the
# helper's own log it removes duplicate records, fills missing categorical
# values, one-hot encodes the selected features, and splits so that both sets
# have a similar target distribution. scale=False returns the data unscaled.
X_train, X_test, y_train, y_test = preprocessing.preprocess_data(df, scale=False)

2024-04-18 20:32:00,442:INFO:Starting preprocessing of data.
2024-04-18 20:32:03,265:INFO:Removed duplicate records.
2024-04-18 20:32:03,326:INFO:Filled missing values for categorical data.
2024-04-18 20:32:04,353:INFO:Selected features and target. Applied meaningful one-hot encoding.
2024-04-18 20:32:04,767:INFO:Split data into training and testing sets with similar distribution for the target variable.
2024-04-18 20:32:04,768:INFO:Returned data without scaling.


In [7]:
# Report the dimensions of every train/test split, one "<name> shape: <shape>"
# line per split. A single print over a generator keeps the cell free of
# leftover loop variables in the notebook namespace.
print(
    *(
        f"{split_name} shape: {split.shape}"
        for split_name, split in (
            ("X_train", X_train),
            ("X_test", X_test),
            ("y_train", y_train),
            ("y_test", y_test),
        )
    ),
    sep="\n",
)

X_train shape: (752807, 32)
X_test shape: (188202, 32)
y_train shape: (752807,)
y_test shape: (188202,)


In [8]:
# List the feature columns produced by preprocessing: fast_break plus the
# one-hot indicators for location, bodypart, assist_method and situation.
print(X_train.columns)

Index(['fast_break', 'location_Attacking_half', 'location_Centre_of_the_box',
       'location_Defensive_half', 'location_Difficult_angle_and_long_range',
       'location_Difficult_angle_on_the_left',
       'location_Difficult_angle_on_the_right',
       'location_Left_side_of_the_box',
       'location_Left_side_of_the_six_yard_box', 'location_Left_wing',
       'location_Long_range', 'location_More_than_35_yards',
       'location_More_than_40_yards', 'location_Not_recorded',
       'location_Outside_the_box', 'location_Penalty_spot',
       'location_Right_side_of_the_box',
       'location_Right_side_of_the_six_yard_box', 'location_Right_wing',
       'location_Very_close_range', 'bodypart_head', 'bodypart_left_foot',
       'bodypart_right_foot', 'assist_method_Cross',
       'assist_method_Headed_pass', 'assist_method_None', 'assist_method_Pass',
       'assist_method_Through_ball', 'situation_Corner', 'situation_Free_kick',
       'situation_Open_play', 'situation_Set_piece'],

In [9]:
# Debug aid, disabled by default: uncomment to inspect the training target.
# print(y_train)

In [10]:
# Fit the model through the project's training helper.
# NOTE(review): the result is presumably the best estimator from the helper's
# parameter search (it logs "Best parameters") — confirm in model.train_model.
# Slow cell: the recorded run performed 50 fits (5 folds x 10 candidates) and
# took about 478 s, so avoid re-running it unnecessarily.
best_model = model.train_model(X_train, y_train)

2024-04-18 20:32:04,854:INFO:Starting model training.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


2024-04-18 20:40:02,997:INFO:Model training completed in 478.14 seconds.
2024-04-18 20:40:02,998:INFO:Best parameters: {'C': 10, 'solver': 'liblinear'}. Best score: 0.978241435240778.


In [11]:
# Score the fitted model on the held-out test split; the helper logs the
# accuracy and a per-class classification report.
# NOTE(review): in the recorded run the positive class is only ~2.6% of the
# test set (4889 of 188202) and its recall is 0.25, so the ~0.98 accuracy
# mostly reflects the majority class. Judge the model on the class-1
# precision/recall/F1 rather than on accuracy alone.
model.evaluate_model(best_model, X_test, y_test)

2024-04-18 20:40:03,302:INFO:Model evaluation completed. Accuracy: 0.9779226575700577.
2024-04-18 20:40:03,304:INFO:Classification Report: 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    183313
           1       0.71      0.25      0.37      4889

    accuracy                           0.98    188202
   macro avg       0.85      0.62      0.68    188202
weighted avg       0.97      0.98      0.97    188202



In [12]:
# Persist the fitted model to disk so it can be reused without retraining.
# NOTE(review): assumes the ./models directory already exists — confirm
# whether model.save_model creates it.
model.save_model(best_model, filename='./models/LR_base.pkl')

2024-04-18 20:40:03,315:INFO:Model saved to ./models/LR_base.pkl.
