In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import seaborn as sns

warnings.filterwarnings('ignore')

## Feature engineering: team data

In [None]:
# Per-team statistics for the home and away side of every training match.
train_home_team_statistics_df = pd.read_csv('Train_Data/train_home_team_statistics_df.csv', index_col=0)
train_away_team_statistics_df = pd.read_csv('Train_Data/train_away_team_statistics_df.csv', index_col=0)

# Targets: the HOME_WINS / DRAW / AWAY_WINS flags are read from Y_train further
# down; the supplementary file carries GOAL_DIFF_HOME_AWAY.
train_scores = pd.read_csv('Train_Data/Y_train.csv', index_col=0)
train_scores_supp = pd.read_csv('benchmark_and_extras/Y_train_supp.csv', index_col=0)


# Drop the first two columns of the training frames.
# NOTE(review): presumably these are the non-numeric league / team-name columns - confirm.
train_home = train_home_team_statistics_df.iloc[:,2:]
train_away = train_away_team_statistics_df.iloc[:,2:]

# Prefix the columns so both sides can live in one frame without name clashes.
train_home.columns = 'HOME_' + train_home.columns
train_away.columns = 'AWAY_' + train_away.columns

# Keep only matches present on both sides and align the targets to them.
train_data =  pd.concat([train_home,train_away],join='inner',axis=1)
train_scores = train_scores.loc[train_data.index]

# Treat +/-inf as missing, then impute every column with its median.
train_data = train_data.replace({np.inf:np.nan,-np.inf:np.nan})

train_data = train_data.fillna(train_data.median())


# The test frames are used as-is (no column slicing) before prefixing.
test_home = pd.read_csv('Test_Data/test_home_team_statistics_df.csv', index_col=0)
test_away = pd.read_csv('Test_Data/test_away_team_statistics_df.csv', index_col=0)

test_home.columns = 'HOME_' + test_home.columns
test_away.columns = 'AWAY_' + test_away.columns

test_data =  pd.concat([test_home,test_away],join='inner',axis=1)

# Same cleaning as the training set: +/-inf must become NaN *before* the median
# fill, otherwise infinite values survive imputation and leak into the ratio
# features built later.
test_data = test_data.replace({np.inf:np.nan,-np.inf:np.nan})

test_data = test_data.fillna(test_data.median())



In [None]:
def combine_outcomes(row):
    """Collapse the three outcome flags of one match into a single class label.

    Args:
        row: Mapping-like object indexable by 'HOME_WINS', 'DRAW' and 'AWAY_WINS'.

    Returns:
        1 for a home win, 0 for a draw, 2 for an away win; None when none of
        the flags equals 1.
    """
    # Flags are checked in this order and the first one set wins; later flags
    # are not even read once a match is found.
    for flag, label in (('HOME_WINS', 1), ('DRAW', 0), ('AWAY_WINS', 2)):
        if row[flag] == 1:
            return label
    return None

# Encode each match as one label (1 = home win, 0 = draw, 2 = away win) and keep
# that column as the multiclass target.
train_scores['Outcome'] = train_scores.apply(combine_outcomes, axis=1)
train_new_y = train_scores['Outcome']

### Viz

In [None]:

def summarize_dataframe(df):
    """Build a one-row-per-column overview of ``df``.

    Args:
        df: DataFrame to summarise.

    Returns:
        DataFrame indexed by the column names of ``df`` with the columns
        'dtypes', 'missing#' (number of NaN cells), 'missing%' (share of NaN
        cells in percent), 'uniques', 'first_value', 'last_value', 'count'
        (non-null cells), 'min', 'max' and 'mean'. The last three are NaN for
        columns that ``DataFrame.describe`` does not report them for.
    """
    missing_counts = df.isna().sum().values

    summary_df = pd.DataFrame(df.dtypes, columns=['dtypes'])
    # Raw count of missing cells (the percentage lives in 'missing%').
    summary_df['missing#'] = missing_counts
    summary_df['missing%'] = (missing_counts*100)/len(df)
    summary_df['uniques'] = df.nunique().values
    if len(df):
        summary_df['first_value'] = df.iloc[0].values
        summary_df['last_value'] = df.iloc[len(df)-1].values
    else:
        # An empty frame has no first/last row to show.
        summary_df['first_value'] = float('nan')
        summary_df['last_value'] = float('nan')
    summary_df['count'] = df.count().values

    # describe() only emits min/max/mean when it has numeric-like columns to
    # describe; fall back to NaN instead of raising KeyError otherwise.
    desc = pd.DataFrame(df.describe().T)
    for stat in ('min', 'max', 'mean'):
        summary_df[stat] = desc[stat] if stat in desc.columns else float('nan')
    return summary_df

# Display the per-column summary of the home-team features, shaded with a blue gradient.
summarize_dataframe(train_home).style.background_gradient(cmap='Blues')


In [None]:
# NOTE(review): imported here rather than in the top import cell.
import missingno as msno
# Per-column bar chart of the home-team features produced by missingno.
msno.bar(train_home, color=(0.3,0.3,0.5))

In [None]:
# NOTE(review): imported here rather than in the top import cell.
import sweetviz
# Home-team features plus the combined outcome label, joined on the index.
train_data_all = pd.concat([train_home, train_scores['Outcome']], axis=1)

# Train-vs-test comparison with "Outcome" passed as the target feature.
# NOTE(review): the report object is built but not rendered or saved in this cell.
my_report = sweetviz.compare([train_data_all, "Train"], [test_home, "Test"], "Outcome")

In [None]:
# Histogram of one training feature; the trailing ';' suppresses the cell's text output.
plt.hist(train_data['HOME_TEAM_SHOTS_TOTAL_season_sum'], label='train');
plt.legend();
plt.title('Distribution of HOME_TEAM_SHOTS_TOTAL_season_sum');

In [None]:
# Same feature on the test set, for a visual comparison with the training histogram.
# NOTE(review): a label is passed but no legend or title is drawn in this cell.
plt.hist(test_data['HOME_TEAM_SHOTS_TOTAL_season_sum'], label='test');


In [None]:
# Peek at feature names at positions 140-159.
train_data.columns[140:160]

In [None]:
# NOTE(review): `pythagorean_train_small` is not defined anywhere above this cell in the
# visible notebook (only `pythagorean_train_small_home` / `_away` are created, and later),
# so this cell appears to depend on leftover kernel state - confirm or reorder.
train_data_all = pd.concat([pythagorean_train_small, train_scores['HOME_WINS']], axis=1)
train_data_all

In [None]:
# List the columns currently held in the visualisation frame.
train_data_all.columns

In [None]:
# Hand-picked exponent for a one-off Pythagorean-style ratio: won^p / (won^p + lost^p).
# No epsilon is added here, so rows where both counts are 0 evaluate to 0/0 (NaN).
p=12.4
train_data_all['PYTH_HOME'] = (train_data_all['HOME_TEAM_GAME_WON_5_last_match_sum'] ** p) / ((train_data_all['HOME_TEAM_GAME_WON_5_last_match_sum'] ** p) + (train_data_all['HOME_TEAM_GAME_LOST_5_last_match_sum'] ** p))

In [None]:
# NOTE(review): seaborn is already imported in the first cell; this re-import is redundant.
import seaborn as sns

plt.figure(figsize=(12, 8))
# Feature to inspect against the HOME_WINS flag.
feat = "PYTHAGOREAN_season_sum"
# Left panel: distribution of the feature per HOME_WINS value.
plt.subplot(1,2,1)
sns.violinplot(x = 'HOME_WINS', y = feat, data = train_data_all[0:])

# Right panel: overlaid distributions, one per HOME_WINS value.
# NOTE(review): labels are passed but no legend is drawn; HOME_WINS is used as a 0/1 flag
# elsewhere in this notebook, so the `== 2.0` subset is presumably empty - confirm.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases.
plt.subplot(1,2,2)
sns.distplot(train_data_all[train_data_all['HOME_WINS'] == 1.0][feat][0:] , label = "1", color = 'red')
sns.distplot(train_data_all[train_data_all['HOME_WINS'] == 0.0][feat][0:] , label = "0" , color = 'blue' )
sns.distplot(train_data_all[train_data_all['HOME_WINS'] == 2.0][feat][0:] , label = "2" , color = 'yellow' )

plt.show()

In [None]:
# n is the full row count, so the `[0:n]` slice below keeps every row.
n = train_data_all.shape[0]
# Pairwise scatter/density grid of the four Pythagorean features, coloured by HOME_WINS.
sns.pairplot(train_data_all[['PYTHAGOREAN_5_last_match_sum', 'PYTHAGOREAN_5_last_match_average',
       'PYTHAGOREAN_season_sum', 'PYTHAGOREAN_season_average', 'HOME_WINS']][0:n], hue='HOME_WINS', vars=['PYTHAGOREAN_5_last_match_sum', 'PYTHAGOREAN_5_last_match_average',
       'PYTHAGOREAN_season_sum', 'PYTHAGOREAN_season_average'])
plt.show()

### Preprocessing

In [None]:
def add_missing_indicators(df, missing_threshold=0.01, consider_columns=None):
    """Append 0/1 "<column>_is_nan" flags for columns with many missing values.

    Args:
        df: DataFrame to extend; it is modified in place.
        missing_threshold: A column is flagged when its fraction of NaN cells
            is strictly greater than this value.
        consider_columns: Columns to inspect; all columns of ``df`` when None.

    Returns:
        The same ``df`` object, with one integer indicator column added per
        flagged column. The flagged column names are also printed.
    """
    candidates = df.columns if consider_columns is None else consider_columns
    nan_share = df[candidates].isna().mean()
    features_with_lots_of_nans = nan_share.loc[nan_share > missing_threshold].index.tolist()
    print(features_with_lots_of_nans)

    for column in features_with_lots_of_nans:
        df[f"{column}_is_nan"] = df[column].isna().astype(int)
    return df

def process_dataframe(df):
    """Count and impute missing values in a team-statistics frame, in place.

    Every column from position 2 onwards is treated as a feature. A
    'Selected_NaN_Count' column (NaN cells per row across those features) is
    added first; then each feature - including the new count column - is filled
    per 'TEAM_NAME' group with the group's first mode, and anything still
    missing is filled with the column mean.

    Args:
        df: DataFrame with a 'TEAM_NAME' column; modified in place.

    Returns:
        The same ``df`` object.
    """
    def _fill_with_group_mode(values):
        # A group with no non-null value has an empty mode; fall back to its
        # mean (which is then NaN, leaving the gap for the global fill below).
        modes = values.mode()
        replacement = values.mean() if modes.empty else modes.iloc[0]
        return values.fillna(replacement)

    # The count is taken over the features that exist before it is appended.
    df['Selected_NaN_Count'] = df[df.columns[2:]].isna().sum(axis=1)

    for column in list(df.columns[2:]):
        df[column] = df.groupby('TEAM_NAME')[column].transform(_fill_with_group_mode)

    feature_part = df.iloc[:, 2:]
    df.iloc[:, 2:] = feature_part.fillna(feature_part.mean())

    return df

In [None]:
# Flag columns with more than 10% missing values, then impute per team (home side).
# NOTE(review): `train_home` / `train_data` were derived from these frames in an earlier
# cell, so it looks like this preprocessing does not feed the later features - confirm.
train_home_team_statistics_df = add_missing_indicators(train_home_team_statistics_df, missing_threshold=0.1)
train_home_team_statistics_df = process_dataframe(train_home_team_statistics_df)


# Same two steps for the away side.
train_away_team_statistics_df = add_missing_indicators(train_away_team_statistics_df, missing_threshold=0.1)
train_away_team_statistics_df = process_dataframe(train_away_team_statistics_df)

## Pythagorean Feat

### Home WON vs LOST

In [None]:
def optimize_p(home_values, away_values, target, p_values):
    """Grid-search the exponent whose Pythagorean feature correlates best with ``target``.

    Args:
        home_values: Values used as the numerator term of the feature.
        away_values: Values used as the opposing term of the feature.
        target: Values to correlate the feature with (Pearson).
        p_values: Iterable of candidate exponents, tried in order.

    Returns:
        Tuple ``(best_p, best_corr)`` where ``best_corr`` is the absolute
        Pearson correlation of the winning exponent. ``best_p`` stays None
        (and ``best_corr`` 0) when no candidate beats a correlation of 0.
    """
    top_exponent, top_strength = None, 0
    for exponent in p_values:
        candidate = calculate_pythagorean_feature(home_values, away_values, exponent)
        corr, _ = pearsonr(candidate, target)
        strength = abs(corr)
        # Strict comparison: on ties the earliest exponent is kept.
        if strength > abs(top_strength):
            top_exponent, top_strength = exponent, strength
    return top_exponent, top_strength

def calculate_pythagorean_feature(home_values, away_values, p):
    """Compute home^p / (home^p + away^p) with a small offset on both inputs.

    Args:
        home_values: Numerator-side values (scalar, array or Series).
        away_values: Opposing-side values, same shape as ``home_values``.
        p: Exponent applied to both sides.

    Returns:
        The ratio passed through ``np.nan_to_num``; NaN and +/-inf results are
        replaced by 0.0.
    """
    offset = 1e-5  # keeps 0 / 0 from occurring when both inputs are zero
    own_term = (home_values + offset) ** p
    opposing_term = (away_values + offset) ** p
    share = own_term / (own_term + opposing_term)
    return np.nan_to_num(share, nan=0.0, posinf=0.0, neginf=0.0)


# (won, lost) column pairs of the home team, for both look-back windows and both aggregations.
feature_pairs = [
    ('HOME_TEAM_GAME_WON_5_last_match_sum', 'HOME_TEAM_GAME_LOST_5_last_match_sum'),
    ('HOME_TEAM_GAME_WON_5_last_match_average', 'HOME_TEAM_GAME_LOST_5_last_match_average'),
    ('HOME_TEAM_GAME_WON_season_sum', 'HOME_TEAM_GAME_LOST_season_sum'),
    ('HOME_TEAM_GAME_WON_season_average', 'HOME_TEAM_GAME_LOST_season_average')
]

# Exponents are tuned against the HOME_WINS flag over a grid starting at 1.01 with step 0.01.
target = train_scores['HOME_WINS']
p_values = np.arange(1.01, 6.01, 0.01)

pythagorean_train_small_home = pd.DataFrame(index=train_data.index)

# Best exponent per pair, keyed by the "won" column name; reused for the test set.
optimal_p_values = {}

# Despite the loop variable names, both columns of a pair belong to the home team
# (its games won vs. its games lost).
for home_feat, away_feat in feature_pairs:
    home_values = train_data[home_feat]
    away_values = train_data[away_feat]

    best_p, best_corr = optimize_p(home_values, away_values, target, p_values)
    optimal_p_values[home_feat] = best_p

    pythagorean_feature = calculate_pythagorean_feature(home_values, away_values, best_p)

    # e.g. HOME_TEAM_GAME_WON_season_sum -> PYTHAGOREAN_HOME_season_sum
    new_feature_name = f"PYTHAGOREAN_HOME_{home_feat.replace('HOME_TEAM_GAME_WON_', '')}"
    pythagorean_train_small_home[new_feature_name] = pythagorean_feature

    # The reported correlation is the absolute value returned by optimize_p.
    print(f"Best p for {home_feat} vs. {away_feat}: {best_p} (Correlation: {best_corr})")


In [None]:
# Build the same home won/lost features on the test set, reusing the exponents
# selected on the training data (nothing is re-fitted here).
pythagorean_test_small_home = pd.DataFrame(index=test_data.index)

for home_feat, away_feat in feature_pairs:
    home_values = test_data[home_feat]
    away_values = test_data[away_feat]
    best_p = optimal_p_values[home_feat]

    pythagorean_feature = calculate_pythagorean_feature(home_values, away_values, best_p)

    new_feature_name = f"PYTHAGOREAN_HOME_{home_feat.replace('HOME_TEAM_GAME_WON_', '')}"
    pythagorean_test_small_home[new_feature_name] = pythagorean_feature

In [None]:
# Inspect the resulting test-set features.
pythagorean_test_small_home

### AWAY WON vs LOST

In [None]:
# (won, lost) column pairs of the away team; this rebinds `feature_pairs` from the home section.
feature_pairs = [
    ('AWAY_TEAM_GAME_WON_5_last_match_sum', 'AWAY_TEAM_GAME_LOST_5_last_match_sum'),
    ('AWAY_TEAM_GAME_WON_5_last_match_average', 'AWAY_TEAM_GAME_LOST_5_last_match_average'),
    ('AWAY_TEAM_GAME_WON_season_sum', 'AWAY_TEAM_GAME_LOST_season_sum'),
    ('AWAY_TEAM_GAME_WON_season_average', 'AWAY_TEAM_GAME_LOST_season_average')
]

# Exponents are tuned against the AWAY_WINS flag this time.
target = train_scores['AWAY_WINS']
p_values = np.arange(1.01, 6.01, 0.01)

pythagorean_train_small_away = pd.DataFrame(index=train_data.index)

# NOTE(review): this resets `optimal_p_values`, discarding the home-section exponents;
# the home test features must already have been built before this cell runs.
optimal_p_values = {}

# Here `home_feat` / `away_feat` hold the away team's WON / LOST columns respectively.
for home_feat, away_feat in feature_pairs:
    home_values = train_data[home_feat]
    away_values = train_data[away_feat]

    best_p, best_corr = optimize_p(home_values, away_values, target, p_values)
    optimal_p_values[home_feat] = best_p

    pythagorean_feature = calculate_pythagorean_feature(home_values, away_values, best_p)

    # e.g. AWAY_TEAM_GAME_WON_season_sum -> PYTHAGOREAN_AWAY_season_sum
    new_feature_name = f"PYTHAGOREAN_AWAY_{home_feat.replace('AWAY_TEAM_GAME_WON_', '')}"
    pythagorean_train_small_away[new_feature_name] = pythagorean_feature

    print(f"Best p for {home_feat} vs. {away_feat}: {best_p} (Correlation: {best_corr})")

In [None]:
# Away won/lost features on the test set, using the exponents fitted on the training data.
pythagorean_test_small_away = pd.DataFrame(index=test_data.index)

for home_feat, away_feat in feature_pairs:
    home_values = test_data[home_feat]
    away_values = test_data[away_feat]
    best_p = optimal_p_values[home_feat]

    pythagorean_feature = calculate_pythagorean_feature(home_values, away_values, best_p)

    new_feature_name = f"PYTHAGOREAN_AWAY_{home_feat.replace('AWAY_TEAM_GAME_WON_', '')}"
    pythagorean_test_small_away[new_feature_name] = pythagorean_feature

### Pyth HOME WINS

In [None]:
def optimize_p(home_feat, away_feat, target, p_values):
    """Grid-search the exponent of a home-vs-away Pythagorean ratio.

    For each candidate ``p`` the feature ``home^p / (home^p + away^p)`` is
    built from two columns of the module-level ``train_data`` frame (each
    shifted by a small epsilon) and correlated with ``target``.

    Note: this redefinition takes column *names*, unlike the earlier
    ``optimize_p`` in this notebook which takes the value arrays.

    Args:
        home_feat: Name of the home-side column in ``train_data``.
        away_feat: Name of the away-side column in ``train_data``.
        target: Values to correlate the feature with (Pearson).
        p_values: Iterable of candidate exponents, tried in order.

    Returns:
        Tuple ``(best_p, best_corr)`` with the absolute Pearson correlation of
        the selected exponent; ``(None, 0)`` when no candidate yields a
        correlation above 0 (e.g. every correlation is NaN).
    """
    epsilon = 1e-5
    # The shifted columns do not depend on p, so look them up once instead of
    # on every grid step.
    home_values = train_data[home_feat] + epsilon
    away_values = train_data[away_feat] + epsilon

    best_p = None
    best_corr = 0

    for p in p_values:
        home_power = home_values ** p
        pythagorean_feature = home_power / (home_power + (away_values ** p))

        # Negative bases or overflow produce NaN / inf; neutralise them to 0.
        pythagorean_feature = np.nan_to_num(pythagorean_feature, nan=0.0, posinf=0.0, neginf=0.0)

        corr, _ = pearsonr(pythagorean_feature, target)

        # Strict comparison: on ties the earliest exponent is kept.
        if abs(corr) > abs(best_corr):
            best_corr = abs(corr)
            best_p = p

    return best_p, best_corr

p_values = np.arange(1.01, 6.01, 0.01)
# Best exponent per feature, keyed by the HOME_-prefixed column name.
optimal_p_values_home_wins = {}
# Running sum of the best absolute correlations across all feature pairs.
optimal_corr = 0
# Home and away columns are paired purely by position, so both frames must list
# the same statistics in the same order.
for home_feat, away_feat in zip(train_home.columns, train_away.columns):
    best_p, best_corr = optimize_p(home_feat, away_feat, train_scores['HOME_WINS'], p_values)
    print(f"Best p for {home_feat} vs. {away_feat}: {best_p} (Correlation: {best_corr})")
    optimal_p_values_home_wins[home_feat] = best_p
    optimal_corr += best_corr

In [None]:
def calculate_pythagorean_feature(home_values, away_values, p):
    """Return the home share ``home^p / (home^p + away^p)`` of two value arrays.

    Both inputs are shifted by a small epsilon before exponentiation, and any
    NaN or infinite result is mapped to 0.0 by ``np.nan_to_num``.

    Args:
        home_values: Home-side values (scalar, array or Series).
        away_values: Away-side values, same shape as ``home_values``.
        p: Exponent applied to both sides.
    """
    shift = 1e-5
    home_power = (home_values + shift) ** p
    total_power = home_power + (away_values + shift) ** p
    return np.nan_to_num(home_power / total_power, nan=0.0, posinf=0.0, neginf=0.0)

# One home-vs-away Pythagorean feature per statistic, using the exponent tuned for it.
pythagorean_train_df_home_wins = pd.DataFrame(index=train_data.index)

for home_feat, away_feat in zip(train_home.columns, train_away.columns):
    home_values = train_data[home_feat]
    away_values = train_data[away_feat]
    best_p = optimal_p_values_home_wins[home_feat]

    pythagorean_feature = calculate_pythagorean_feature(home_values, away_values, best_p)

    # str.replace removes every occurrence of 'HOME_' from the column name.
    new_feature_name = f"PYTHAG_HOME_WINS_{home_feat.replace('HOME_', '')}"
    pythagorean_train_df_home_wins[new_feature_name] = pythagorean_feature





```
# This text is formatted as code
```



In [None]:
# Test-set counterpart, reusing the exponents fitted on the training data.
# The lookup below raises KeyError for any test column that was not a training home column.
pythagorean_test_df_home_wins = pd.DataFrame(index=test_data.index)

for home_feat, away_feat in zip(test_home.columns, test_away.columns):
    home_values = test_data[home_feat]
    away_values = test_data[away_feat]
    best_p = optimal_p_values_home_wins[home_feat]

    pythagorean_feature = calculate_pythagorean_feature(home_values, away_values, best_p)

    new_feature_name = f"PYTHAG_HOME_WINS_{home_feat.replace('HOME_', '')}"
    pythagorean_test_df_home_wins[new_feature_name] = pythagorean_feature



### Pyth AWAY WINS

In [None]:
def optimize_p(home_feat, away_feat, target, p_values):
    """Grid-search the exponent of an away-vs-home Pythagorean ratio.

    For each candidate ``p`` the feature ``away^p / (away^p + home^p)`` is
    built from two columns of the module-level ``train_data`` frame (each
    shifted by a small epsilon) and correlated with ``target``.

    Note: this redefinition replaces the earlier ``optimize_p`` versions in
    this notebook; it differs from the previous one only in which side is the
    numerator.

    Args:
        home_feat: Name of the home-side column in ``train_data``.
        away_feat: Name of the away-side column in ``train_data``.
        target: Values to correlate the feature with (Pearson).
        p_values: Iterable of candidate exponents, tried in order.

    Returns:
        Tuple ``(best_p, best_corr)`` with the absolute Pearson correlation of
        the selected exponent; ``(None, 0)`` when no candidate yields a
        correlation above 0 (e.g. every correlation is NaN).
    """
    epsilon = 1e-5
    # The shifted columns do not depend on p, so look them up once instead of
    # on every grid step.
    home_values = train_data[home_feat] + epsilon
    away_values = train_data[away_feat] + epsilon

    best_p = None
    best_corr = 0

    for p in p_values:
        away_power = away_values ** p
        pythagorean_feature = away_power / (away_power + (home_values ** p))

        # Negative bases or overflow produce NaN / inf; neutralise them to 0.
        pythagorean_feature = np.nan_to_num(pythagorean_feature, nan=0.0, posinf=0.0, neginf=0.0)

        corr, _ = pearsonr(pythagorean_feature, target)

        # Strict comparison: on ties the earliest exponent is kept.
        if abs(corr) > abs(best_corr):
            best_corr = abs(corr)
            best_p = p

    return best_p, best_corr

p_values = np.arange(1.01, 6.01, 0.01)
# NOTE(review): despite its name this dict now holds the exponents tuned against AWAY_WINS;
# it overwrites the home-wins exponents, so the home-wins features must be built beforehand.
optimal_p_values_home_wins = {}
# Running sum of the best absolute correlations across all feature pairs.
optimal_corr = 0
# Columns are paired by position; keys remain the HOME_-prefixed names.
for home_feat, away_feat in zip(train_home.columns, train_away.columns):
    best_p, best_corr = optimize_p(home_feat, away_feat, train_scores['AWAY_WINS'], p_values)
    print(f"Best p for {home_feat} vs. {away_feat}: {best_p} (Correlation: {best_corr})")
    optimal_p_values_home_wins[home_feat] = best_p
    optimal_corr += best_corr

In [None]:
def calculate_pythagorean_feature(home_values, away_values, p):
    """Return the away share ``away^p / (away^p + home^p)`` of two value arrays.

    This redefinition swaps the roles of the two sides compared with the
    earlier definitions in this notebook: the away values form the numerator.
    Both inputs are shifted by a small epsilon before exponentiation, and any
    NaN or infinite result is mapped to 0.0 by ``np.nan_to_num``.

    Args:
        home_values: Home-side values (scalar, array or Series).
        away_values: Away-side values, same shape as ``home_values``.
        p: Exponent applied to both sides.
    """
    shift = 1e-5
    away_power = (away_values + shift) ** p
    total_power = away_power + (home_values + shift) ** p
    return np.nan_to_num(away_power / total_power, nan=0.0, posinf=0.0, neginf=0.0)

# Away-share Pythagorean feature per statistic; relies on the away-numerator
# `calculate_pythagorean_feature` defined just above in this cell.
pythagorean_train_df_away_wins = pd.DataFrame(index=train_data.index)

for home_feat, away_feat in zip(train_home.columns, train_away.columns):
    home_values = train_data[home_feat]
    away_values = train_data[away_feat]
    # Holds the AWAY_WINS-tuned exponents at this point, despite the name.
    best_p = optimal_p_values_home_wins[home_feat]

    pythagorean_feature = calculate_pythagorean_feature(home_values, away_values, best_p)

    new_feature_name = f"PYTHAG_AWAY_WINS_{home_feat.replace('HOME_', '')}"
    pythagorean_train_df_away_wins[new_feature_name] = pythagorean_feature



In [None]:
# Test-set counterpart of the away-share features, reusing the training exponents.
# Uses whichever `calculate_pythagorean_feature` definition was executed last.
pythagorean_test_df_away_wins = pd.DataFrame(index=test_data.index)

for home_feat, away_feat in zip(test_home.columns, test_away.columns):
    home_values = test_data[home_feat]
    away_values = test_data[away_feat]
    best_p = optimal_p_values_home_wins[home_feat]

    pythagorean_feature = calculate_pythagorean_feature(home_values, away_values, best_p)

    new_feature_name = f"PYTHAG_AWAY_WINS_{home_feat.replace('HOME_', '')}"
    pythagorean_test_df_away_wins[new_feature_name] = pythagorean_feature



In [None]:
# Sanity check: (rows, features) of the away-share test frame.
pythagorean_test_df_away_wins.shape

### Concat pyth with base data

In [None]:
# Earlier variant that also included the away-share (`*_df_away_wins`) frames; kept for reference.
# train_data = pd.concat([train_data, pythagorean_train_df_away_wins, pythagorean_train_df_home_wins, pythagorean_train_small_away, pythagorean_train_small_home], axis=1)
# test_data = pd.concat([test_data, pythagorean_test_df_away_wins, pythagorean_test_df_home_wins, pythagorean_test_small_away, pythagorean_test_small_home], axis=1)

# Active variant: append the home-share features and both won/lost feature sets column-wise.
# Re-running this cell appends the same columns again, because train_data / test_data are rebound.
train_data = pd.concat([train_data, pythagorean_train_df_home_wins, pythagorean_train_small_away, pythagorean_train_small_home], axis=1)
test_data = pd.concat([test_data, pythagorean_test_df_home_wins, pythagorean_test_small_away, pythagorean_test_small_home], axis=1)


#test_data = pd.concat([test_data, pythagorean_test_df], axis=1)

In [None]:
# Tag every row with its origin so the two sets can be separated again after
# the shared feature engineering.
train_data['is_train'] = 1
test_data['is_train'] = 0

# Stack train on top of test while keeping the original index. The index is
# what later cells align on when they concatenate these features with the
# targets and the player-level features (axis=1); resetting it to 0..N-1 only
# works if the original IDs happen to equal the row positions.
combined_data = pd.concat([train_data, test_data], axis=0)

In [None]:
# Sanity check: (rows, columns) of the test frame after adding the Pythagorean features and flag.
test_data.shape

### FE

#### Offensive metrics

In [None]:
# Ratio features are built once per look-back window. Every denominator gets +1 so a
# zero count cannot cause a division by zero.
timeframes = ['season', '5_last_match']
for timeframe in timeframes:

    # Defensive and Offensive balance
    # saves / (goals + 1)
    combined_data[f'defensive_offensive_balance_home_{timeframe}'] = combined_data[f'HOME_TEAM_SAVES_{timeframe}_sum'] / (combined_data[f'HOME_TEAM_GOALS_{timeframe}_sum'] + 1)
    combined_data[f'defensive_offensive_balance_away_{timeframe}'] = combined_data[f'AWAY_TEAM_SAVES_{timeframe}_sum'] / (combined_data[f'AWAY_TEAM_GOALS_{timeframe}_sum'] + 1)
    # Attack Efficiency
    # goals / (shots on target + 1)
    combined_data[f'HOME_attack_efficiency_{timeframe}'] = combined_data[f'HOME_TEAM_GOALS_{timeframe}_sum'] / (combined_data[f'HOME_TEAM_SHOTS_ON_TARGET_{timeframe}_sum'] + 1)
    combined_data[f'AWAY_attack_efficiency_{timeframe}'] = combined_data[f'AWAY_TEAM_GOALS_{timeframe}_sum'] / (combined_data[f'AWAY_TEAM_SHOTS_ON_TARGET_{timeframe}_sum'] + 1)
    #Shot Accuracy
    # shots on target / (total shots + 1)
    combined_data[f'HOME_shot_accuracy_{timeframe}'] = combined_data[f'HOME_TEAM_SHOTS_ON_TARGET_{timeframe}_sum'] / (combined_data[f'HOME_TEAM_SHOTS_TOTAL_{timeframe}_sum'] + 1)
    combined_data[f'AWAY_shot_accuracy_{timeframe}'] = combined_data[f'AWAY_TEAM_SHOTS_ON_TARGET_{timeframe}_sum'] / (combined_data[f'AWAY_TEAM_SHOTS_TOTAL_{timeframe}_sum'] + 1)
    #penalty_success_rate
    # NOTE(review): computed as penalties / (goals + 1), i.e. penalties relative to goals,
    # not a conversion rate - confirm the intended meaning of the name.
    combined_data[f'HOME_penalty_success_rate_{timeframe}'] = combined_data[f'HOME_TEAM_PENALTIES_{timeframe}_sum'] / (combined_data[f'HOME_TEAM_GOALS_{timeframe}_sum'] + 1)
    combined_data[f'AWAY_penalty_success_rate_{timeframe}'] = combined_data[f'AWAY_TEAM_PENALTIES_{timeframe}_sum'] / (combined_data[f'AWAY_TEAM_GOALS_{timeframe}_sum'] + 1)
    #possession_goal_ratio
    # average possession / (goals + 1)
    combined_data[f'HOME_possession_goal_ratio_{timeframe}'] = combined_data[f'HOME_TEAM_BALL_POSSESSION_{timeframe}_average'] / (combined_data[f'HOME_TEAM_GOALS_{timeframe}_sum'] + 1)
    combined_data[f'AWAY_possession_goal_ratio_{timeframe}'] = combined_data[f'AWAY_TEAM_BALL_POSSESSION_{timeframe}_average'] / (combined_data[f'AWAY_TEAM_GOALS_{timeframe}_sum'] + 1)
    #corners
    # goals / (corners + 1)
    combined_data[f'set_piece_efficiency_home_{timeframe}'] = combined_data[f'HOME_TEAM_GOALS_{timeframe}_sum'] / (combined_data[f'HOME_TEAM_CORNERS_{timeframe}_sum'] + 1)
    combined_data[f'set_piece_efficiency_away_{timeframe}'] = combined_data[f'AWAY_TEAM_GOALS_{timeframe}_sum'] / (combined_data[f'AWAY_TEAM_CORNERS_{timeframe}_sum'] + 1)
    #possession to create dangerous attacks
    # dangerous attacks / (average possession + 1)
    combined_data[f'possession_utilization_home_{timeframe}'] = combined_data[f'HOME_TEAM_DANGEROUS_ATTACKS_{timeframe}_sum'] / (combined_data[f'HOME_TEAM_BALL_POSSESSION_{timeframe}_average'] + 1)
    combined_data[f'possession_utilization_away_{timeframe}'] = combined_data[f'AWAY_TEAM_DANGEROUS_ATTACKS_{timeframe}_sum'] / (combined_data[f'AWAY_TEAM_BALL_POSSESSION_{timeframe}_average'] + 1)



#### Defensive metrics

In [None]:
timeframes = ['season', '5_last_match']
for timeframe in timeframes:
    # Defensive Strength
    # Own saves relative to the *opponent's* shots on target (+1 to avoid dividing by zero).
    combined_data[f'HOME_defensive_strength_{timeframe}'] = combined_data[f'HOME_TEAM_SAVES_{timeframe}_sum'] / (combined_data[f'AWAY_TEAM_SHOTS_ON_TARGET_{timeframe}_sum'] + 1)
    combined_data[f'AWAY_defensive_strength_{timeframe}'] = combined_data[f'AWAY_TEAM_SAVES_{timeframe}_sum'] / (combined_data[f'HOME_TEAM_SHOTS_ON_TARGET_{timeframe}_sum'] + 1)

    # Fouls to Yellow Card Ratio
    # fouls / (yellow cards + 1)
    combined_data[f'HOME_fouls_to_yellow_ratio_{timeframe}'] = combined_data[f'HOME_TEAM_FOULS_{timeframe}_sum'] / (combined_data[f'HOME_TEAM_YELLOWCARDS_{timeframe}_sum'] + 1)
    combined_data[f'AWAY_fouls_to_yellow_ratio_{timeframe}'] = combined_data[f'AWAY_TEAM_FOULS_{timeframe}_sum'] / (combined_data[f'AWAY_TEAM_YELLOWCARDS_{timeframe}_sum'] + 1)



#### Engagement & Discipline Features

In [None]:
for timeframe in ['season', '5_last_match']:
    #Substitution
    # substitutions / (games won + 1)
    combined_data[f'HOME_substitution_utilization_{timeframe}'] = combined_data[f'HOME_TEAM_SUBSTITUTIONS_{timeframe}_sum'] / (combined_data[f'HOME_TEAM_GAME_WON_{timeframe}_sum']  + 1)
    combined_data[f'AWAY_substitution_utilization_{timeframe}'] = combined_data[f'AWAY_TEAM_SUBSTITUTIONS_{timeframe}_sum'] / (combined_data[f'AWAY_TEAM_GAME_WON_{timeframe}_sum']  + 1)

    # fouls / (yellow cards + red cards + 1)
    combined_data[f'HOME_fouls_to_cards_ratio_{timeframe}'] = combined_data[f'HOME_TEAM_FOULS_{timeframe}_sum'] / (combined_data[f'HOME_TEAM_YELLOWCARDS_{timeframe}_sum'] + combined_data[f'HOME_TEAM_REDCARDS_{timeframe}_sum'] + 1)
    combined_data[f'AWAY_fouls_to_cards_ratio_{timeframe}'] = combined_data[f'AWAY_TEAM_FOULS_{timeframe}_sum'] / (combined_data[f'AWAY_TEAM_YELLOWCARDS_{timeframe}_sum'] + combined_data[f'AWAY_TEAM_REDCARDS_{timeframe}_sum'] + 1)

    # (yellow cards + 2 * red cards) / (fouls + 1): red cards weigh twice as much as yellows.
    combined_data[f'HOME_card_severity_index_{timeframe}'] = (combined_data[f'HOME_TEAM_YELLOWCARDS_{timeframe}_sum'] + 2 * combined_data[f'HOME_TEAM_REDCARDS_{timeframe}_sum']) / (combined_data[f'HOME_TEAM_FOULS_{timeframe}_sum'] + 1)
    combined_data[f'AWAY_card_severity_index_{timeframe}'] = (combined_data[f'AWAY_TEAM_YELLOWCARDS_{timeframe}_sum'] + 2 * combined_data[f'AWAY_TEAM_REDCARDS_{timeframe}_sum']) / (combined_data[f'AWAY_TEAM_FOULS_{timeframe}_sum'] + 1)

In [None]:
# Split the engineered frame back into train and test using the origin flag, then drop the flag.
train_team_data_fe = combined_data[combined_data['is_train'] == 1].drop('is_train', axis=1)
test_team_data_fe = combined_data[combined_data['is_train'] == 0].drop('is_train', axis=1)

In [None]:
# Sanity check: (rows, features) of the engineered test frame.
test_team_data_fe.shape

## Correlation

In [None]:
def top_correlated_features(features_data, target_data, target_name):
    """Rank numeric features by absolute Pearson correlation with one target column.

    Args:
        features_data: DataFrame of candidate features; non-numeric columns
            are ignored.
        target_data: DataFrame containing the target column.
        target_name: Name of the column of ``target_data`` to correlate with.

    Returns:
        DataFrame with columns 'Feature' and 'Correlation' (absolute value),
        sorted from strongest to weakest. The row index still reflects the
        original column order of the numeric features.
    """
    y = target_data[target_name]
    numeric_part = features_data.select_dtypes(include=[np.number])

    # pearsonr returns (statistic, p-value); only the statistic is kept.
    scored = [(name, pearsonr(numeric_part[name], y)[0]) for name in numeric_part.columns]

    ranking = pd.DataFrame(scored, columns=['Feature', 'Correlation'])
    ranking['Correlation'] = abs(ranking['Correlation'])
    return ranking.sort_values(by='Correlation', ascending=False)

# Keep only the first occurrence of any repeated column name, force float dtype and
# name the index 'ID'.
train_team_data_fe = train_team_data_fe.loc[:, ~train_team_data_fe.columns.duplicated()]
train_team_data_fe = train_team_data_fe.astype(float)
train_team_data_fe.index.name = 'ID'

# Absolute Pearson correlation of every feature with each target column.
correlated_HOME_WINS = top_correlated_features(train_team_data_fe, train_scores, 'HOME_WINS')
correlated_AWAY_WINS = top_correlated_features(train_team_data_fe, train_scores, 'AWAY_WINS')
correlated_DRAW = top_correlated_features(train_team_data_fe, train_scores, 'DRAW')
correlated_Outcome = top_correlated_features(train_team_data_fe, train_scores, 'Outcome')


# Give each result a distinct column name so they can be merged side by side.
correlated_HOME_WINS = correlated_HOME_WINS.rename(columns={'Correlation': 'Correlation_HOME'})
correlated_AWAY_WINS = correlated_AWAY_WINS.rename(columns={'Correlation': 'Correlation_AWAY'})
correlated_DRAW = correlated_DRAW.rename(columns={'Correlation': 'Correlation_DRAW'})
correlated_Outcome = correlated_Outcome.rename(columns={'Correlation': 'Correlation_Outcome'})

# One row per feature with its four correlations; row order follows the HOME_WINS ranking.
corr_home_away = correlated_HOME_WINS.merge(correlated_AWAY_WINS, on='Feature', how ='left')
corr_data = corr_home_away.merge(correlated_DRAW, on='Feature', how ='left')
corr_data_team = corr_data.merge(correlated_Outcome, on='Feature', how ='left')

corr_data_team


In [None]:
# Aggregate the three per-class correlations (the 'Outcome' correlation is not included).
corr_data_team['mean_Correlation'] = (corr_data_team['Correlation_HOME'] + corr_data_team['Correlation_AWAY'] + corr_data_team['Correlation_DRAW'])/3
corr_data_team['max_Correlation'] = corr_data_team[['Correlation_HOME', 'Correlation_AWAY', 'Correlation_DRAW']].max(axis=1)
# Show the 20 features most correlated with the combined outcome label.
corr_data_team.sort_values("Correlation_Outcome", ascending=False).head(20)

In [None]:
# Heatmap of all correlation columns per feature; features with any NaN correlation are dropped.
heatmap_data = corr_data_team.set_index('Feature').dropna()
plt.figure(figsize=(7, 7))
sns.heatmap(heatmap_data, cmap='coolwarm')
plt.title('Feature Correlations')
plt.show()

In [None]:
# Number of team features whose strongest per-class correlation exceeds 0.01.
len(corr_data_team[(corr_data_team['max_Correlation'] > 0.01)])

In [None]:
# Shapes before the selection, for comparison with the shapes printed at the end.
print(len(corr_data_team[(corr_data_team['max_Correlation'] > 0.01)]))
print(train_team_data_fe.shape)
print(test_team_data_fe.shape)

# Keep the features whose best per-class absolute correlation exceeds 0.01; features with a
# NaN correlation fail the comparison and are dropped. The same list is applied to the test set.
selected_features = corr_data_team[(corr_data_team['max_Correlation'] > 0.01)]['Feature'].to_list()
train_team_data_fe = train_team_data_fe[selected_features]
test_team_data_fe = test_team_data_fe[selected_features]

print(train_team_data_fe.shape)
print(test_team_data_fe.shape)

## VIZ

In [None]:
# Selected team features plus the HOME_WINS flag; pd.concat(axis=1) aligns the two on their index.
train_data_all = pd.concat([train_team_data_fe, train_scores['HOME_WINS']], axis=1)

In [None]:
plt.figure(figsize=(12, 8))
# Feature to inspect against the HOME_WINS flag.
# NOTE(review): no cell in the visible notebook creates a `*_diff_*` column - confirm that
# this feature exists in train_data_all.
feat = "TEAM_SHOTS_INSIDEBOX_diff_season_average"
# Left panel: distribution of the feature per HOME_WINS value.
plt.subplot(1,2,1)
sns.violinplot(x = 'HOME_WINS', y = feat, data = train_data_all[0:])

# Right panel: overlaid distributions, one per HOME_WINS value (labels passed, no legend drawn).
# NOTE(review): HOME_WINS is used as a 0/1 flag elsewhere in this notebook, so the `== 2.0`
# subset is presumably empty - confirm. `sns.distplot` is deprecated in recent seaborn releases.
plt.subplot(1,2,2)
sns.distplot(train_data_all[train_data_all['HOME_WINS'] == 1.0][feat][0:] , label = "1", color = 'red')
sns.distplot(train_data_all[train_data_all['HOME_WINS'] == 0.0][feat][0:] , label = "0" , color = 'blue' )
sns.distplot(train_data_all[train_data_all['HOME_WINS'] == 2.0][feat][0:] , label = "2" , color = 'yellow' )

plt.show()

In [None]:
# Feature names in the order of the merged correlation table.
corr_data.Feature

In [None]:
# n is the full row count, so the `[0:n]` slice below keeps every row.
n = train_data_all.shape[0]
# Pairwise grid of four `*_diff_*` features, coloured by HOME_WINS.
sns.pairplot(train_data_all[['TEAM_SHOTS_INSIDEBOX_diff_season_average', 'TEAM_SHOTS_INSIDEBOX_diff_season_sum',
       'TEAM_GAME_WON_diff_season_average', 'TEAM_SHOTS_ON_TARGET_diff_season_average', 'HOME_WINS']][0:n], hue='HOME_WINS', vars=['TEAM_SHOTS_INSIDEBOX_diff_season_average', 'TEAM_SHOTS_INSIDEBOX_diff_season_sum',
       'TEAM_GAME_WON_diff_season_average', 'TEAM_SHOTS_ON_TARGET_diff_season_average'])
plt.show()

## Feature engineering: player data

In [None]:
def count_positions(group):
    """Count how many entries of ``group['POSITION']`` fall into each known role.

    Args:
        group: Mapping-like object (e.g. one groupby chunk) with a 'POSITION'
            entry that can be iterated.

    Returns:
        pd.Series indexed by 'midfielder', 'defender', 'goalkeeper' and
        'attacker' (in that order) holding the counts. Any other value,
        including missing ones, is ignored.
    """
    tally = dict.fromkeys(('midfielder', 'defender', 'goalkeeper', 'attacker'), 0)
    for role in group['POSITION']:
        if role not in tally:
            continue
        tally[role] += 1
    return pd.Series(tally)

def data_groupby(df):
    """Aggregate player rows to one row per 'ID'.

    Args:
        df: Player-level DataFrame that can be grouped by 'ID' and has a
            'POSITION' column.

    Returns:
        DataFrame with the per-'ID' sum of every column except 'POSITION',
        left-merged on 'ID' with the per-role player counts and their total
        ('players_count').
    """
    roles = ['midfielder', 'defender', 'goalkeeper', 'attacker']

    role_counts = df.groupby('ID').apply(count_positions)
    # Only players with one of the four known roles are counted.
    role_counts['players_count'] = role_counts[roles].sum(axis=1)

    metric_totals = df.drop(columns=['POSITION']).groupby('ID').sum()
    return metric_totals.merge(role_counts, on='ID', how='left')

def add_missing_indicators(df, missing_threshold=0.01, consider_columns=None):
    """Add a 0/1 "<column>_is_nan" column for every column with many NaNs.

    This repeats the definition given earlier in the notebook with the same
    behaviour.

    Args:
        df: DataFrame to extend; modified in place.
        missing_threshold: Columns whose NaN fraction is strictly above this
            value receive an indicator.
        consider_columns: Columns to inspect; defaults to every column.

    Returns:
        The same ``df`` object. The names of the flagged columns are printed.
    """
    if consider_columns is None:
        consider_columns = df.columns

    nan_fraction = df[consider_columns].isna().mean()
    features_with_lots_of_nans = [
        name for name, fraction in nan_fraction.items() if fraction > missing_threshold
    ]
    print(features_with_lots_of_nans)

    for name in features_with_lots_of_nans:
        flag_column = f"{name}_is_nan"
        df[flag_column] = df[name].isna().astype(int)
    return df

def process_dataframe(df):
    """Count and impute missing values in a player-statistics frame, in place.

    Every column from position 4 onwards is treated as a feature. A
    'Selected_NaN_Count' column (NaN cells per row across those features) is
    added first; then each feature - including the new count column - is filled
    per ('TEAM_NAME', 'POSITION') group with the group's first mode, and
    anything still missing is filled with the column mean.

    This replaces the team-level ``process_dataframe`` defined earlier in the
    notebook (which starts at column 2 and groups by 'TEAM_NAME' only).

    Args:
        df: DataFrame with 'TEAM_NAME' and 'POSITION' columns; modified in place.

    Returns:
        The same ``df`` object.
    """
    def _fill_with_group_mode(values):
        # A group with no non-null value has an empty mode; fall back to its
        # mean (which is then NaN, leaving the gap for the global fill below).
        modes = values.mode()
        replacement = values.mean() if modes.empty else modes.iloc[0]
        return values.fillna(replacement)

    # The count is taken over the features that exist before it is appended.
    df['Selected_NaN_Count'] = df[df.columns[4:]].isna().sum(axis=1)

    for column in list(df.columns[4:]):
        df[column] = df.groupby(['TEAM_NAME', 'POSITION'])[column].transform(_fill_with_group_mode)

    feature_part = df.iloc[:, 4:]
    df.iloc[:, 4:] = feature_part.fillna(feature_part.mean())

    return df

In [None]:

# Player-level statistics for the home and away squads of every training match.
train_home_player_statistics_df = pd.read_csv('Train_Data/train_home_player_statistics_df.csv', index_col=0)
train_away_player_statistics_df = pd.read_csv('Train_Data/train_away_player_statistics_df.csv', index_col=0)

# Optional player-level imputation, currently disabled.
# train_home_player_statistics_df = add_missing_indicators(train_home_player_statistics_df, missing_threshold=0.2)
# train_home_player_statistics_df = process_dataframe(train_home_player_statistics_df)

# train_away_player_statistics_df = add_missing_indicators(train_away_player_statistics_df, missing_threshold=0.2)
# train_away_player_statistics_df = process_dataframe(train_away_player_statistics_df)

# Drop the first two columns and the player name so that only POSITION and the
# statistics remain before aggregating.
train_player_home = train_home_player_statistics_df.iloc[:,2:]
train_player_away = train_away_player_statistics_df.iloc[:,2:]

train_player_home = train_player_home.drop(columns=['PLAYER_NAME'])
train_player_away = train_player_away.drop(columns=['PLAYER_NAME'])

# One row per match ID: summed statistics plus per-position player counts.
train_player_home = data_groupby(train_player_home)
train_player_away = data_groupby(train_player_away)

train_player_home.columns = 'HOME_' + train_player_home.columns
train_player_away.columns = 'AWAY_' + train_player_away.columns

# Turn +/-inf into NaN *before* the median fill. Doing it afterwards would
# leave NaNs in the final frame, and the Pearson correlations computed on
# these features cannot handle NaN input.
train_player_home = train_player_home.replace({np.inf:np.nan,-np.inf:np.nan})
train_player_away = train_player_away.replace({np.inf:np.nan,-np.inf:np.nan})

train_player_home = train_player_home.fillna(train_player_home.median())
train_player_away = train_player_away.fillna(train_player_away.median())

train_player_data =  pd.concat([train_player_home,train_player_away],join='inner',axis=1)


# The test files are aggregated as-is (no column slicing / name drop).
# NOTE(review): presumably they lack the columns removed from the training
# files above - confirm.
test_player_home = pd.read_csv('Test_Data/test_home_player_statistics_df.csv', index_col=0)
test_player_away = pd.read_csv('Test_Data/test_away_player_statistics_df.csv', index_col=0)

# test_player_home = add_missing_indicators(test_player_home, missing_threshold=0.2)
# test_player_away = add_missing_indicators(test_player_away, missing_threshold=0.2)

test_player_home = data_groupby(test_player_home)
test_player_away = data_groupby(test_player_away)

test_player_home.columns = 'HOME_' + test_player_home.columns
test_player_away.columns = 'AWAY_' + test_player_away.columns

# Same cleaning order as for the training set.
test_player_home = test_player_home.replace({np.inf:np.nan,-np.inf:np.nan})
test_player_away = test_player_away.replace({np.inf:np.nan,-np.inf:np.nan})

test_player_home = test_player_home.fillna(test_player_home.median())
test_player_away = test_player_away.fillna(test_player_away.median())

test_player_data =  pd.concat([test_player_home,test_player_away],join='inner',axis=1)


In [None]:
# Sanity check: (matches, player-derived features) of the aggregated test frame.
test_player_data.shape

In [None]:
# Absolute Pearson correlation of every aggregated player feature with each target column.
# These names are reused from the team-level correlation cell and overwrite its results.
correlated_HOME_WINS = top_correlated_features(train_player_data, train_scores, 'HOME_WINS')
correlated_AWAY_WINS = top_correlated_features(train_player_data, train_scores, 'AWAY_WINS')
correlated_DRAW = top_correlated_features(train_player_data, train_scores, 'DRAW')
correlated_Outcome = top_correlated_features(train_player_data, train_scores, 'Outcome')


# Give each result a distinct column name so they can be merged side by side.
correlated_HOME_WINS = correlated_HOME_WINS.rename(columns={'Correlation': 'Correlation_HOME'})
correlated_AWAY_WINS = correlated_AWAY_WINS.rename(columns={'Correlation': 'Correlation_AWAY'})
correlated_DRAW = correlated_DRAW.rename(columns={'Correlation': 'Correlation_DRAW'})
correlated_Outcome = correlated_Outcome.rename(columns={'Correlation': 'Correlation_Outcome'})

# One row per feature with its four correlations; row order follows the HOME_WINS ranking.
corr_home_away = correlated_HOME_WINS.merge(correlated_AWAY_WINS, on='Feature', how ='left')
corr_data = corr_home_away.merge(correlated_DRAW, on='Feature', how ='left')
corr_data_player = corr_data.merge(correlated_Outcome, on='Feature', how ='left')

corr_data_player

In [None]:
# Undefined (NaN) correlations are treated as 0 before aggregating.
corr_data_player = corr_data_player.fillna(0)
# Aggregate the three per-class correlations (the 'Outcome' correlation is not included).
corr_data_player['mean_Correlation'] = (corr_data_player['Correlation_HOME'] + corr_data_player['Correlation_AWAY'] + corr_data_player['Correlation_DRAW'])/3
corr_data_player['max_Correlation'] = corr_data_player[['Correlation_HOME', 'Correlation_AWAY', 'Correlation_DRAW']].max(axis=1)

# Show the 20 player features with the strongest per-class correlation.
corr_data_player.sort_values("max_Correlation", ascending=False).head(20)

In [None]:
# Heatmap of all correlation columns per player feature.
heatmap_data = corr_data_player.set_index('Feature').dropna()
plt.figure(figsize=(7, 7))
sns.heatmap(heatmap_data, cmap='coolwarm')
plt.title('Feature Correlations')
plt.show()

In [None]:
# Number of player features whose strongest per-class correlation exceeds 0.01.
print(len(corr_data_player[(corr_data_player['max_Correlation'] > 0.01)]))

In [None]:
# Shapes before the selection, for comparison with the shapes printed at the end.
print(train_player_data.shape)
print(test_player_data.shape)

# Keep player features whose best per-class absolute correlation exceeds 0.00001.
# NOTE(review): this threshold differs from the 0.01 used in the count printed in the
# previous cell and in the team-feature selection - confirm which one is intended.
selected_features = corr_data_player[(corr_data_player['max_Correlation'] > 0.00001)]['Feature'].to_list()
train_player_data = train_player_data[selected_features]
test_player_data = test_player_data[selected_features]

print(train_player_data.shape)
print(test_player_data.shape)

## Save FE data

In [None]:
# Join team-level and player-level features side by side; pd.concat(axis=1) aligns on the index.
train_data_to_modeling = pd.concat([train_team_data_fe, train_player_data], axis = 1)
test_data_to_modeling = pd.concat([test_team_data_fe, test_player_data], axis = 1)
# Recompute the combined outcome label (1 = home win, 0 = draw, 2 = away win) as the target.
train_scores['Outcome'] = train_scores.apply(combine_outcomes, axis=1)
train_new_y = train_scores['Outcome']
# Goal difference as a numeric series; values that cannot be parsed become NaN.
train_scores_supp['GOAL_DIFF_HOME_AWAY'] = pd.to_numeric(train_scores_supp['GOAL_DIFF_HOME_AWAY'], errors='coerce')

# NOTE(review): this supplementary target is built but not written to disk in this cell.
train_new_y_supp = train_scores_supp['GOAL_DIFF_HOME_AWAY']

# test_data_to_modeling.shape
# Persist the engineered data so that modelling can start from these files.
train_data_to_modeling.to_csv('FE_Data/train_data_to_modeling.csv')
test_data_to_modeling.to_csv('FE_Data/test_data_to_modeling.csv')
train_new_y.to_csv('FE_Data/train_new_y.csv')



In [None]:
# Reload the saved artefacts so the modelling section can run without redoing the feature engineering.
# Note that read_csv returns a DataFrame, so train_new_y is a one-column frame here rather than
# the Series that was saved.
train_data_to_modeling = pd.read_csv('FE_Data/train_data_to_modeling.csv', index_col=0)
test_data_to_modeling = pd.read_csv('FE_Data/test_data_to_modeling.csv', index_col=0)
train_new_y = pd.read_csv('FE_Data/train_new_y.csv', index_col=0)


# Modeling

## LGBM

In [None]:
import optuna
import lightgbm as lgb
from sklearn import metrics, model_selection

def tune_lgbm_params(train_data_fe, train_new_y, n_trials=50):
    """Search LightGBM multiclass hyper-parameters with Optuna.

    Every trial samples one configuration and scores it by the mean validation
    accuracy over a repeated stratified K-fold split (5 folds, 3 repeats,
    ``random_state=42``). Each fold trains with early stopping (200 rounds) on
    its validation part and predicts at the model's best iteration.

    Parameters
    ----------
    train_data_fe : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    train_new_y : pandas.Series or pandas.DataFrame
        Class labels for the 3-class problem, row-aligned with ``train_data_fe``;
        also indexed positionally with ``.iloc``.
    n_trials : int, default 50
        Number of Optuna trials to run.

    Returns
    -------
    dict
        The sampled hyper-parameters of the best trial. Only the tuned keys are
        included; fixed settings (objective, boosting type, number of classes,
        verbosity, early stopping) are not part of the returned dict.
    """

    def objective(trial):
        """Return the mean cross-validated accuracy of one sampled configuration."""
        # Sample once per trial so every fold is scored with the same configuration.
        param = {
            'objective': 'multiclass',
            'boosting_type': 'goss',
            'num_class': 3,
            'verbosity': -1,
            'early_stopping_rounds': 200,
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 0.3),
            'n_estimators': trial.suggest_int('n_estimators', 2000, 5000),
            'num_leaves': trial.suggest_int('num_leaves', 200, 5000),
            'max_depth': trial.suggest_int('max_depth', 3, 30),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
            'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 15),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 300),
        }

        kf = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
        fold_accuracies = []

        for train_index, valid_index in kf.split(train_data_fe, train_new_y):
            X_train, X_valid = train_data_fe.iloc[train_index], train_data_fe.iloc[valid_index]
            y_train, y_valid = train_new_y.iloc[train_index], train_new_y.iloc[valid_index]

            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

            # Hand each fold its own copy so nothing a training call might do to
            # its params argument can leak into the next fold.
            model = lgb.train(dict(param), lgb_train, valid_sets=[lgb_eval])

            # predict() yields one probability row per sample; argmax picks the class.
            probabilities = model.predict(X_valid, num_iteration=model.best_iteration)
            preds = np.argmax(probabilities, axis=1)
            fold_accuracies.append(metrics.accuracy_score(y_valid, preds))

        return np.mean(fold_accuracies)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    # `best_trial` (singular) is the single best trial of this single-objective
    # study; `best_trials` is a list and has no `.params` attribute.
    best_params_lgbm = dict(study.best_trial.params)

    print('Best trial:', best_params_lgbm)

    return best_params_lgbm


In [None]:
# Run the hyper-parameter search on the full modeling matrix and show the parameters it returns.
best_params_lgbm = tune_lgbm_params(train_data_to_modeling, train_new_y)
print(best_params_lgbm)

In [None]:
def test_predict(train, test, y, best_params_lgbm):
  best_params_lgbm['boosting_type'] = 'goss'
  best_params_lgbm['objective'] = 'multiclass'
  # best_params_lgbm['metric'] = 'multi_logloss'
  best_params_lgbm['num_class'] = 3
  best_params_lgbm['verbosity'] = -1
  lgb_full_train = lgb.Dataset(train, y)
  final_model = lgb.train(best_params_lgbm, lgb_full_train)
  test_predictions = final_model.predict(test)
  test_predictions = np.argmax(test_predictions, axis=1)
  return test_predictions


In [None]:
# Fit on all training rows with the tuned parameters and predict one class label per test row.
predictions = test_predict(train_data_to_modeling, test_data_to_modeling, train_new_y, best_params_lgbm)

In [None]:
def generate_predictions(test, predictions):
    """Turn predicted class labels into the one-hot submission table.

    Label encoding: 1 -> HOME_WINS, 0 -> DRAW, 2 -> AWAY_WINS. Any other label
    leaves all three columns at 0 for that row.

    Parameters
    ----------
    test : pandas.DataFrame
        Only ``test.index`` is used; it becomes the ``ID`` column.
    predictions : array-like of int
        One class label per row of ``test`` (NumPy array, list, or Series).

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``HOME_WINS``, ``DRAW``, ``AWAY_WINS`` on a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in ``test``.
    """
    # Coerce to an array so the comparisons below are element-wise even when a
    # plain Python list is passed (list == 1 would be a single False).
    labels = np.asarray(predictions)

    if len(labels) != len(test.index):
        raise ValueError(
            f'Got {len(labels)} predictions for {len(test.index)} test rows'
        )

    submission_df = pd.DataFrame(
        {
            'HOME_WINS': (labels == 1).astype('int64'),
            'DRAW': (labels == 0).astype('int64'),
            'AWAY_WINS': (labels == 2).astype('int64'),
        },
        index=test.index,
    )

    # Move the test index into a regular column and name it ID.
    submission_df = submission_df.reset_index()
    submission_df.columns = ['ID', 'HOME_WINS', 'DRAW', 'AWAY_WINS']

    return submission_df


In [None]:
# Build the submission table; the ID column is taken from test_home's index.
# NOTE(review): predictions were made on test_data_to_modeling, so this assumes test_home has
# the same rows in the same order -- confirm, or pass the predicted matrix's index instead.
submission_df = generate_predictions(test_home, predictions)

In [None]:
# Share of each one-hot outcome combination among the submitted rows.
submission_df[['HOME_WINS', 'DRAW', 'AWAY_WINS']].value_counts().div(len(submission_df))

In [None]:
# Display the submission table as the cell output for a visual check.
submission_df

In [None]:
# Write the submission file; index=False leaves the DataFrame's row index out of the CSV.
submission_df.to_csv('Submissions/submission.csv', index=False)