In [None]:
import pandas as pd
import numpy as np

import requests

import os
import time
from datetime import datetime
import sys
import yaml
import gc


from data_loader import *
from utils import *
from nn import *
from models import *
from llm import *
from llm_hf import *
import folder_manager

import seaborn as sns
sns.set_style("darkgrid")
plt.rcParams.update({
    'axes.facecolor': '#1e1e1e',
    'figure.facecolor': '#1e1e1e',
    'axes.edgecolor': 'white',
    'axes.labelcolor': 'white',
    'xtick.color': 'white',
    'ytick.color': 'white',
    'text.color': 'white',
    'axes.grid': True,
    'grid.color': 'gray'
})

pd.set_option("display.max_column",None)
print(os.getcwd())


%load_ext autoreload
%reload_ext autoreload
%autoreload 2

pd.options.display.max_rows = 100


def auto_reload():
    """Load (or reload) the IPython ``autoreload`` extension and switch it to mode 2.

    Runs the same three magics that are executed directly above in this cell,
    so it can be called from a later cell to re-arm autoreload.
    """
    %load_ext autoreload
    %reload_ext autoreload
    %autoreload 2

In [None]:
# Load the project configuration from config.yaml in the current working directory.
# Later cells read column groups from it (e.g. config['OUTCOME_COLS'], config['PASSING_COLS']).
with open("config.yaml",'r') as f:
    config = yaml.safe_load(f)

# Root directory that the data paths below are built from.
home_dir = config['HOME_DIRECTORY']
home_dir

In [None]:
create_submodel("llm_new")

# API Football:
https://www.api-football.com

![API-Football architecture](https://www.api-football.com/public/img/news/archi-beta.jpg)

In [None]:
leagues_dat = get_leagues(home_dir +"/data/Leagues/leagues.parquet")
leagues_dat[['league_id','league_name','country_name']]

# Leagues subset:

In [None]:
# Configs: league / country names used to build leagues_subset below
major_leagues = ["Premier League","La Liga","Serie A","Bundesliga","Eredivisie","Ligue 1"]
major_countries = ["England","Spain","Italy","Germany","Netherlands","France","Brazil"]
teams = ["Liverpool","Wolves"] # teams to pull players data of
seasons = [2022,2021,2023,2024] # seasons to pull players and teams stats of



# Keep the rows of leagues_dat whose league_name is in major_leagues AND whose
# country_name is in major_countries (both conditions must hold for a row).
leagues_subset = leagues_dat[leagues_dat.league_name.isin(major_leagues) & leagues_dat.country_name.isin(major_countries)] # filtered view of leagues_dat

In [None]:
leagues_subset

# Read All fixtures data

In [None]:
teams_dat = pd.read_parquet(home_dir + "/data/Teams/team_league.parquet")

In [None]:
fixtures_dir = home_dir + "/data/Fixtures"

# Read every fixture file first and concatenate once. Calling pd.concat inside
# the loop re-copies the accumulated frame on every iteration (quadratic cost).
fixture_frames = [
    pd.read_parquet(os.path.join(fixtures_dir, file))
    for file in os.listdir(fixtures_dir)
]

# ignore_index=True produces the same fresh 0..n-1 index that the previous
# reset_index() + drop(columns=['index']) pair produced.
if fixture_frames:
    complete_data = pd.concat(fixture_frames, axis=0, ignore_index=True)
else:
    # Empty directory: pd.concat([]) raises, so fall back to an empty frame.
    complete_data = pd.DataFrame()

In [None]:
complete_data.columns

In [None]:
# Data checks
# NOTE(review): this cell mutates complete_data in place and is not re-runnable on
# its own — after the rename below, 'passes_accuracy' no longer exists, so a second
# run of the first line raises KeyError. Re-run the fixtures-loading cell first.
complete_data['passes_accuracy'] = complete_data['passes_accuracy'].astype("float64")
complete_data.rename(columns= {'passes_accuracy':'passes_accurate'},inplace =True)
# Parse the fixture timestamp and keep a date-only copy (datetime.date objects).
complete_data['fixture_date'] = pd.to_datetime(complete_data.fixture_date)
complete_data['fixture_date_dt'] = complete_data['fixture_date'].dt.date
# Project helper; presumably adds derived date-part columns such as year_e / month_e
# that later cells read — TODO confirm against its definition.
complete_data = create_datetime_columns(complete_data,'fixture_date')
complete_data['games_rating'] = pd.to_numeric(complete_data['games_rating'])

# TODO: create season column here


# Targets
# Integer code per distinct outcome value (pandas assigns -1 to missing values).
complete_data['outcome_num'] = pd.Categorical(complete_data.outcome).codes

# Binary target: 1 when outcome (case-insensitive) equals 'win', else 0.
complete_data['win'] = np.where(complete_data.outcome.str.lower() == 'win', 1,0)

# Joins:
# Left-join team metadata on team name. NOTE(review): if teams_dat still has more
# than one row per team_name after drop_duplicates(), this join multiplies rows —
# confirm team_name is unique in teams_dat.
complete_data = complete_data.merge(teams_dat.drop_duplicates(),how = 'left', left_on= 'team',right_on = 'team_name').drop(columns = ['team_name'])


In [None]:
complete_data.head()

In [None]:
complete_data.shape

In [None]:
# Feature dictionary: one row per feature with its detected type.
# find_data_types is a project helper; its result is consumed via .items(), so it is
# dict-like (feature -> type).
dat_dict = find_data_types(complete_data,config['OUTCOME_COLS'] + ['outcome_num','outcome'])
dat_dict = pd.DataFrame(list(dat_dict.items()),columns =['feature','type'])

# differentiate modeling features
# Anything listed here is flagged modeling_feature = 0 (identifiers, outcomes, dates, ...).
non_modeling_features = config['FIXTURE_COLS'] + config['OUTCOME_COLS'] + config['MISC_COLS'] + ['outcome_num','league','win','fixture_date','fixture_date_dt','major_position']
dat_dict['modeling_feature'] = np.where(dat_dict['feature'].isin(non_modeling_features),0,1)
dat_dict['encoded'] = 0

print(dat_dict['type'].value_counts())
# NOTE(review): the result of this reset_index call is discarded, so it has no effect.
dat_dict.reset_index(drop= True)

## Encode Features
# Project helper; writes/reads encodings under folder_manager.encoding_path and
# returns the updated dictionary — exact semantics not visible here, TODO confirm.
dat_dict = create_data_index(complete_data,dat_dict,'target',folder_manager.encoding_path)
dat_dict[dat_dict.modeling_feature ==1]

In [None]:
# Match-day calendar per league: one row per (league, date) that has fixtures.
# `val` is the number of distinct dates inside each (league, date) group.
season_dt = (
    complete_data
    .groupby(['league', 'fixture_date_dt'], as_index=False)
    .agg(val=('fixture_date_dt', 'nunique'))
)

# fixture_date_dt holds datetime.date objects; asfreq("D") needs a DatetimeIndex
# to insert the missing calendar days reliably.
season_dt['fixture_date_dt'] = pd.to_datetime(season_dt['fixture_date_dt'])

# Expand each league to a daily frequency, marking days without fixtures as 0.
# An explicit loop is used instead of groupby.apply so the result does not depend
# on whether apply passes the grouping column to the function.
daily_frames = []
for league_name, league_days in season_dt.groupby('league'):
    daily = (
        league_days.set_index('fixture_date_dt')[['val']]
        .asfreq('D')
        .fillna({'val': 0})
    )
    daily.insert(0, 'league', league_name)
    daily_frames.append(daily)
season_dt = pd.concat(daily_frames).reset_index()

# squeeze=False always returns a 2-D axes array, so indexing also works when
# there is a single league (plt.subplots(1, 1) otherwise returns a bare Axes).
league_names = season_dt['league'].unique()
fig, axes = plt.subplots(len(league_names), 1, squeeze=False)

for ix, league in enumerate(league_names):
    sns.lineplot(data=season_dt[season_dt.league == league], x="fixture_date_dt", y="val", ax=axes[ix, 0])
    axes[ix, 0].set_title(str(league))


In [None]:
# primary position map:
# Appearances per (player, position).
player_position = complete_data.groupby(["player_id","games_position"],as_index = False).agg(games_played = ("player_id","size"))

# Total appearances per player (the former cumsum followed by max equals the sum).
player_position['multiple_records'] = player_position.groupby('player_id')['games_played'].transform("sum")

# A position is "major" when it accounts for at least half of the player's appearances.
player_position['major_position'] = np.where(player_position.games_played/player_position.multiple_records >= .5, player_position.games_position,None)

# Exactly one row per player: an exact 50/50 split qualifies two positions, which
# would otherwise duplicate that player's rows in the merge below. Ties keep the
# first row after sorting by games played (descending).
player_position_map = (
    player_position
    .dropna(subset=['major_position'])
    .sort_values(['player_id', 'games_played'], ascending=[True, False])
    .drop_duplicates(subset='player_id', keep='first')
    [['player_id', 'major_position']]
)
player_position_map

# Join back to complete_data
# Drop any major_position left by a previous run so re-running this cell does not
# create major_position_x / major_position_y; validate guards the one-row-per-player
# invariant of the map.
complete_data = (
    complete_data
    .drop(columns=['major_position'], errors='ignore')
    .merge(player_position_map, on='player_id', how='left', validate='many_to_one')
)

In [None]:
# Run Player Comparison from LLm 
#player_compare  = compare_players_from_llm(complete_data,["Giovanni Leoni","Ibrahima Konaté"],years = [2025],normalize=True)

In [None]:
fixture_dat = calculate_fixture_stats(complete_data)

In [None]:
fixture_dat.head()

In [None]:
# team classification
from sklearn.tree import DecisionTreeClassifier, plot_tree
from category_encoders import OrdinalEncoder
# One row per (fixture, team): explicit copy so the column assignments below do not
# operate on a slice of fixture_dat.
team_class_dat = fixture_dat[['fixture_id','team','opponent','year_e','win']].drop_duplicates().copy()

# Encode team names as integers; opponents go through the SAME fitted encoder so
# both columns share one code space.
oe = OrdinalEncoder()
team_class_dat['team_encoded'] = oe.fit_transform(team_class_dat['team'])
team_class_dat['opponent_encoded'] = oe.transform(team_class_dat['opponent'].rename('team')).astype("int")

# Float column from the start: it receives predict_proba values below.
team_class_dat['team_cluster'] = 0.0
tree_features = ['team_encoded','opponent_encoded']

# Fit one shallow tree per year; the "cluster" is the winning-class probability of
# the leaf each row falls into. dropna() already removes missing years (the former
# `year != np.nan` check was always True and filtered nothing).
for year in team_class_dat.year_e.dropna().unique():
    print(f"for year {year}")
    year_mask = team_class_dat.year_e == year
    year_features = team_class_dat.loc[year_mask, tree_features]

    # Fixed seed so the fitted tree is reproducible between runs.
    dtc = DecisionTreeClassifier(max_depth=4, random_state=33)
    model = dtc.fit(year_features, team_class_dat.loc[year_mask, 'win'].values)
    y_pred = model.predict(year_features)
    team_class_dat.loc[year_mask, 'team_cluster'] = model.predict_proba(year_features).max(axis = 1)

team_cluster_map = team_class_dat[['year_e','team','team_cluster']].drop_duplicates().reset_index(drop= True)
# NOTE(review): team_cluster is a per-row leaf probability, so team_cluster_map can
# hold several rows per (year_e, team). The merge key is the row's own team, so the
# column renamed to 'opponent_cluster' is looked up by team, not by opponent, and the
# join can multiply rows — confirm this is the intended definition.
team_class_dat = team_class_dat.merge(team_cluster_map,on = ['year_e','team'],how = 'left').rename(columns = {'team_cluster_y':'opponent_cluster',
                                                                                                              'team_cluster_x':'team_cluster'})
team_class_dat['year_e'] = team_class_dat['year_e'].astype('Int64')
oe_cluster = OrdinalEncoder()
team_class_dat['opponent_cluster_encoded'] = oe_cluster.fit_transform(team_class_dat['opponent_cluster'].astype("str"))

# Bayesian Team Ability Estimation

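 - team ability: $\alpha_{t,y} \sim \mathrm{Normal}(\mu_\alpha, \sigma_\alpha^2)$
 - opposition difficulty: $\beta_{c,y} \sim \mathrm{Normal}(\mu_\beta, \sigma_\beta^2)$

 - wins: $w \sim \mathrm{Binomial}\big(N,\ \mathrm{sigmoid}(\alpha_{t,y} - \beta_{c,y})\big)$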

In [None]:
team_class_dat.head()

In [None]:
# Data for Binomial Model

team_class_dat_binom = team_class_dat.groupby(['year_e','team','team_encoded','opponent_cluster_encoded'],as_index = False).agg(wins = ('win','sum'), total_games = ('win','size'))
team_class_dat_binom.head()

In [None]:
team_idx = team_class_dat_binom['team_encoded']
cluster_idx = team_class_dat_binom['opponent_cluster_encoded']
# Plain int64 years (groupby keys contain no missing values) so the coordinate
# labels are ordinary integers.
year_idx = team_class_dat_binom['year_e'].astype('int64')

# Zero-based positions for every model dimension. The sorted unique values define
# the position -> label mapping; the SAME arrays are used as coords below so that
# posterior labels line up with the indices used in the likelihood.

# Teams
teams_unique = np.sort(np.unique(team_idx))
team_map = {t:i for i,t in enumerate(teams_unique)}
team_idx_zero = np.array([team_map[t] for t in team_idx])

# Clusters
clusters_unique = np.sort(np.unique(cluster_idx))
cluster_map = {c:i for i,c in enumerate(clusters_unique)}
cluster_idx_zero = np.array([cluster_map[c] for c in cluster_idx])

# Years
years_unique = np.sort(np.unique(year_idx))
year_map = {y:i for i,y in enumerate(years_unique)}
year_idx_zero = np.array([year_map[y] for y in year_idx])

# Team names ordered by encoded id. Using .unique() (appearance order) here would
# attach names to the wrong positions whenever appearance order differs from the
# sorted code order used for team_idx_zero.
team_label_by_code = (
    team_class_dat_binom
    .drop_duplicates('team_encoded')
    .set_index('team_encoded')['team']
)
coords = {"teams": team_label_by_code.loc[teams_unique].to_numpy(),
          "year": years_unique,
          "cluster": clusters_unique
          }

# Bound to its own name so the sklearn tree held in `model` is not overwritten.
with pm.Model(coords=coords) as team_ability_model:

    mu_team = pm.Normal("mu_team",0,1)
    sigma_team = pm.HalfNormal('sigma_team',2)

    mu_opponent = pm.Normal("mu_opponent",0,1)
    sigma_opponent = pm.HalfNormal('sigma_opponent',3)

    # Standard-normal raw effects, shifted and scaled by the group-level parameters,
    # then centred across teams within each year.
    # NOTE(review): centring over axis 0 cancels mu_team (and mu_opponent below), so
    # those two parameters do not enter the likelihood — confirm this is intended.
    theta_team_raw = pm.Normal("theta_team_year",0,1,dims= ('teams','year'))
    theta_team = mu_team + theta_team_raw * sigma_team
    theta = pm.Deterministic('theta', theta_team - theta_team.mean(axis = 0,keepdims = True),dims = ('teams','year'))

    beta_opponent_raw = pm.Normal("beta_opponent_year",0,1,dims= ('cluster','year'))
    beta_opponent = mu_opponent + beta_opponent_raw * sigma_opponent
    beta = pm.Deterministic('beta', beta_opponent - beta_opponent.mean(axis = 0,keepdims = True),dims = ('cluster','year'))

    # Win probability: team ability minus opposition difficulty on the logit scale.
    logit = theta[team_idx_zero,year_idx_zero] - beta[cluster_idx_zero,year_idx_zero]
    p = pm.Deterministic('p',pm.math.sigmoid(logit))
    n = team_class_dat_binom.total_games.values
    outcome = pm.Binomial("outcome", n = n,p = p, observed = team_class_dat_binom.wins.values)

    # Fixed seed so the posterior draws are reproducible between runs.
    trace = pm.sample(random_seed=33)

   


In [None]:
pm.summary(trace)

In [None]:
[val for val in pm.summary(trace).index if 'Liverpool' in val]

In [None]:
liverpool_post = trace.posterior['theta'].sel(teams = "Liverpool")
liverpool_post.coords
#mancity_post = trace.posterior['theta_alpha'].sel(teams = "Manchester City")

In [None]:
# The deterministic is registered as 'theta' (no variable named 'theta_team' exists
# in the model), so the summary row label starts with "theta[".
pm.summary(trace, var_names=["theta"]).loc["theta[Liverpool, 2024]"]

In [None]:
liverpool_post.coords

In [None]:
# One density per season. liverpool_post has dims (chain, draw, year), so positional
# indexing like liverpool_post[3] selects a chain, not a year; select by the year
# coordinate and pool all chains and draws instead.
post_years = liverpool_post.coords['year'].values
fig, axes = plt.subplots(len(post_years), 1, figsize = (12,8), squeeze=False)
for ax, post_year in zip(axes[:, 0], post_years):
    sns.kdeplot(liverpool_post.sel(year=post_year).values.ravel(), ax=ax)
    ax.set_title(f"theta, year {post_year}")
fig.tight_layout()

In [None]:
year_map

In [None]:
team_class_dat[team_class_dat.team == 'Liverpool']

In [None]:
team_class_dat['team_cluster'] = team_class_dat['team_cluster'].round(5)

In [None]:
# team_cluster value of the first 2025 row for the selected team.
# NOTE(review): the variable is named liverpool_25_cluster but the filter selects
# "Real Madrid" — confirm which team is intended.
# .values[0] raises IndexError when no row matches the filter.
liverpool_25_cluster = team_class_dat[(team_class_dat.year_e == 2025) & (team_class_dat.team == "Real Madrid")]['team_cluster'].values[0]
liverpool_25_cluster

In [None]:
team_class_dat[(team_class_dat.team_cluster.round(5) == liverpool_25_cluster) & (team_class_dat.year_e == 2025)]['team'].value_counts()

In [None]:
# Draw a fitted decision tree with the two features labelled team / opponent.
# NOTE(review): `model` must be a fitted sklearn tree when this cell runs; the name
# `model` is reassigned by several other cells in this notebook (e.g. the
# run_model_with_fs_tune and neural-network cells), so confirm what it is bound to.
plt.figure(figsize=(20,10))
plot_tree(model, feature_names=['team','opponent'], class_names=['Lose','Win'], filled=True)
plt.show()

In [None]:
model.decision_path(team_class_dat[team_class_dat.year_e == year][['team']])

In [None]:
pd.Series(y_pred_proba.max(axis=1)).value_counts()

In [None]:
# Correlation of every numeric fixture-level column with the target, for the
# selected positions, shown as a single-column heatmap sorted by correlation.
# NOTE(review): `filter` shadows the Python builtin of the same name; kept because
# other cells assign and read this name.
filter = 'games_position.isin(["M","D"]) '
target = 'team_goals_scored'

col_subset = [['win','games_rating','shots_total','shots_on','goals_total','goals_saves','duels_won']]

# Keep only the target's column, then remove the target's own row (self-correlation).
cor_dat = fixture_dat.query(filter).corr(numeric_only=True)[[target]]
cor_dat = cor_dat.drop(index=target)

sorted_cols = cor_dat.sort_values(target,ascending = False).index.to_list()

fig, ax = plt.subplots(1,1,figsize = (15,10))
sns.heatmap(cor_dat.loc[sorted_cols],cmap = 'coolwarm',ax=ax)
# Rotate the existing tick labels in place instead of re-setting them.
plt.setp(ax.get_xticklabels(), rotation=75)
# plt.show() rather than fig.show(): the latter only warns on non-GUI (inline) backends.
plt.show()

In [None]:
find_player(complete_data,player_name="Leoni")

In [None]:
all_defenders_2025 = complete_data[(complete_data.major_position == 'D') & (complete_data.year_e == 2025)]['player_name'].unique()
all_defenders_2025

In [None]:
defenders_compare = compare_players(complete_data,all_defenders_2025,years = [2025],transpose = False)

In [None]:
per_90_cols = [col for col in defenders_compare.columns if "per_90" in col]
attack_per_90_cols  = ['total_shots_per_90','shots_on_target_per_90','goals_scored_per_90','assists_per_90',
                       'fouls_drawn_per_90','attempted_dribbles_per_90','successful_dribbles_per_90',
                       'dribble_success_rate_per_90','duels_contested_per_90','duels_won_per_90','duels_won_percentage_per_90']
defense_per_90_cols = ['yellow_cards_per_90','red_cards_per_90','fouls_drawn_per_90','fouls_committed_per_90',
                       'dribbled_past_per_90', 'total_tackles_per_90','blocks_per_90','interceptions_per_90', 
                       'duels_contested_per_90','duels_won_per_90','duels_won_percentage_per_90','penalties_committed_per_90']
pass_per_90_cols = [ 'total_passes_per_90','key_passes_per_90', 'average_passes_accurate_per_90','average_pass_accuracy_per_90']

In [None]:
defenders_compare.head()

In [None]:
# Calculate Clusters:
# Two k-means passes over defenders with enough minutes: one on defensive per-90
# stats, one on passing per-90 stats. Each pass stores its labels under the column
# name given as the last argument (fit_kmeans is a project helper).
filter = 'total_minutes_played  > 1000'
defense_cluster = 'defense_cluster'
pass_cluster = 'pass_cluster'
# The first pass must use `defense_cluster`; the previously passed `cluster_name`
# is not defined anywhere in this notebook and raised NameError.
defenders_compare_w_cluster = fit_kmeans(defenders_compare.query(filter),defense_per_90_cols,None,defense_cluster)
defenders_compare_w_cluster = fit_kmeans(defenders_compare_w_cluster.query(filter),pass_per_90_cols,None,pass_cluster)

In [None]:
find_player(complete_data,player_name="Virgil van")

In [None]:
defenders_compare_w_cluster[defenders_compare_w_cluster.player_name.str.contains("William Saliba")]

In [None]:
defense_cluster

In [None]:
clusters = defenders_compare_w_cluster[defenders_compare_w_cluster.player_name.str.contains('William Saliba')][[defense_cluster,pass_cluster]].values
clusters
#defenders_compare_w_cluster[defenders_compare_w_cluster[cluster_name].isin(defenders_compare_w_cluster[condition][cluster_name])].sort_values("average_rating",ascending = False)

In [None]:
folder_manager.llm_code_path

In [None]:
question = "How are you doing?"
# Strip sentence punctuation, then tokenise on single spaces.
question_no_spec = re.sub(r"[?.,;:]","",question)
split_words = list(question_no_spec.split(" "))
split_words

In [None]:
complete_data.columns

In [None]:
defenders_compare_w_cluster[(defenders_compare_w_cluster.player_name.isin(["Mike Eerdhuijzen","Giovanni Leoni","Nikola Milenković","Marc Guéhi","Ladislav Krejčí"]))][['player_name'] + [col for col in defenders_compare.columns if "per_90" in col]].T

In [None]:
# Lightweight description of complete_data: column names, row count and dtype names.
schema = dict(
    columns=complete_data.columns.tolist(),
    nrows=[len(complete_data)],
    dtypes={name: str(complete_data[name].dtype) for name in complete_data.columns},
)


In [None]:
complete_data.columns

In [None]:
plot_from_llm(complete_data[complete_data.player_name == 'Olivier Boscagli'],"Plot Average games_rating with error cloud by month_e faceted by team")

In [None]:
plot_continuous_trend(complete_data[complete_data.player_name == 'Emmanuel Agbadou'],"month_e","games_rating")

In [None]:
filter_query = 'major_position.isin(["M"])'


# Stat to look at:
stat = 'target_shot_conversion_perc'
agg_fun = "mean"
rank_cutoff = 20

# configs 
min_appearance = 40

# Evaluate the position filter once and reuse it for both the ranking and the plot.
position_dat = complete_data.query(filter_query)

# Per-player appearance count and aggregated stat (missing values counted as 0).
dribble_dat_g = position_dat.reset_index().fillna(0).groupby("player_name").agg(n_apps = ("player_name","size"),stat = (stat,agg_fun)).reset_index()
# .copy() so the rank column is added to an independent frame, not a filtered slice.
dribble_dat_g = dribble_dat_g[dribble_dat_g.n_apps >= min_appearance].copy()
dribble_dat_g['rank'] = dribble_dat_g["stat"].fillna(0).rank(ascending= False,method = 'dense')
dribble_dat_g = dribble_dat_g.sort_values("rank")

# Players whose dense rank is below the cutoff, best first; also the x-axis order.
top_players = dribble_dat_g.loc[dribble_dat_g['rank'] < rank_cutoff, 'player_name']

fig, ax = plt.subplots(figsize=(13, 8))

sns.boxplot(
    data=position_dat[position_dat.player_name.isin(top_players)],
    x="player_name",
    y=stat,
    order=top_players,
    ax=ax,
)

# Rotate the existing tick labels in place instead of re-setting them.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title(f"Stat: {stat}")
plt.tight_layout()
plt.show()


In [None]:
# Correlation heatmap of the passing columns against the team goal columns.
# plt.subplots returns (figure, axes): unpack both and draw on the axes explicitly
# instead of binding the tuple to `fig` and then overwriting it with the heatmap's Axes.
corr_cols = config['PASSING_COLS']  + ['team_goals_scored','team_non_penalty_goals_scored','team_goals_conceded']
fig, ax = plt.subplots(nrows=1, ncols = 1, figsize = (20,10))
sns.heatmap(complete_data.query(filter_query)[corr_cols].corr(),cmap = 'coolwarm',ax=ax)
plt.setp(ax.get_xticklabels(), rotation=60)
plt.show()

In [None]:
sns.pairplot(complete_data.query(filter_query)[config['PASSING_COLS']  + ['team_goals_scored','team_non_penalty_goals_scored','team_goals_conceded']])

In [None]:
complete_data.columns

In [None]:
config['PASSING_COLS'] + config['DEFENSE_COLS']

In [None]:
# trial multiclass model:
from sklearn.model_selection import train_test_split

# Evaluate the filter once; features, target and stratification all come from the
# same frame so their rows are guaranteed to line up.
split_dat = complete_data.query(filter_query)

# sorted(set(...)) removes duplicate column names AND fixes the column order;
# list(set(...)) of strings can change order from one kernel session to the next.
split_features = sorted(set(config['PASSING_COLS'] + config['DEFENSE_COLS']))

X_train, X_test, y_train, y_test = train_test_split(split_dat[split_features],
                                                    split_dat['win'],
                                                    stratify=split_dat['win'],
                                                    random_state=33)


In [None]:
create_submodel("catboost")

In [None]:
output_path

In [None]:
model = run_model_with_fs_tune(X_train, X_test, y_train, y_test,dat_dict,'catboost',output_path=folder_manager.output_path)

In [None]:
# y_train comes out of train_test_split as a Series (the 'win' column), which has no
# .outcome_num attribute. Accept both shapes: a DataFrame carrying an 'outcome_num'
# column, or an already-selected target Series.
y_train_target = y_train["outcome_num"] if isinstance(y_train, pd.DataFrame) else y_train

# NNDataFromPd is a project dataset wrapper; missing feature values are filled with 0.
dat = NNDataFromPd(X_train.fillna(0), y_train_target, dat_dict)
train_loader = DataLoader(dat, batch_size = 128,shuffle= True)

In [None]:
train_loader.dataset.X_numeric_tensor.shape

In [None]:
# model params
n_features = X_train.shape[1]
# y_train may be a Series (one target column) or a one-column DataFrame;
# .iloc[:, 0] only works on the latter.
y_train_labels = y_train.iloc[:, 0] if isinstance(y_train, pd.DataFrame) else y_train
n_classes = y_train_labels.nunique()
model = MultiClassModel(n_features,n_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr = 0.001)

In [None]:
epochs = 500

# Make sure the network is in training mode even if an evaluation cell ran before.
model.train()

for epoch in range(epochs):

    # Sum of the per-batch losses for this epoch.
    epoch_loss = 0

    for X_numeric_batch, X_categoric_batch, y_batch in train_loader:

        # Call the module itself rather than .forward() so registered hooks run.
        pred = model(X_numeric_batch)

        loss = criterion(pred,y_batch)

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch: {epoch}, Loss: {epoch_loss}")

In [None]:
model.eval()

In [None]:
# NOTE(review): the data-check cell casts 'passes_accuracy' to float64 and then
# renames it to 'passes_accurate' on complete_data, so this column may no longer
# exist in X_test — confirm the column name, or whether this cast is still needed.
X_test['passes_accuracy'] = X_test['passes_accuracy'].astype("float64")

In [None]:
# test sets

test_dat = NNDataFromPd(X_test,y_test,dat_dict)
# No shuffling for evaluation: predictions must stay in the same row order as
# X_test / y_test so they can be compared element-wise.
test_loader = DataLoader(test_dat,batch_size= X_test.shape[0],shuffle=False)

In [None]:
# Inference pass over the test loader with gradient tracking disabled.
# `output` and `pred_class` are overwritten on every batch, so after the loop they
# hold the values of the last batch only.
model.eval()
with torch.no_grad():
    for X_numeric_batch, X_categoric_batch, y_batch in test_loader:
        output = model(X_numeric_batch)
        # Predicted class = index of the largest score along dim 1.
        pred_class = torch.argmax(output, dim = 1)

In [None]:
# Logistic Model:
# Forwards only. Features, target and stratification labels all come from the same
# filtered frame; stratifying on the unfiltered complete_data['win'] has a different
# length from the filtered features and makes train_test_split fail.
forward_dat = complete_data[complete_data.games_position == 'F']

# Unique feature names in a fixed order, with the target excluded.
forward_features = sorted(set(config['DEFENSE_COLS'] + config['PASSING_COLS'] + config['ATTACK_COLS']) - {'win'})

X_train, X_test, y_train, y_test = train_test_split(forward_dat[forward_features],
                                                    forward_dat['win'],
                                                    stratify=forward_dat['win'],
                                                    random_state=33)

In [None]:
train_dat = NNDataFromPd(X_train,y_train,dat_dict)
train_loader = DataLoader(train_dat,batch_size= 128,shuffle = True)

In [None]:
n_features = X_train.shape[1]
model = LogisticNNModelComplex(n_features)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(),lr = 0.005)

In [None]:
epochs = 500

# Make sure the network is in training mode even if an evaluation cell ran before.
model.train()

for epoch in range(epochs):
    # Sum of the per-batch losses for this epoch.
    epoch_loss = 0

    for X_numeric, X_categoric, y in train_loader:

        pred = model(X_numeric)

        # Targets are given a trailing dimension so their shape matches `pred`.
        loss = criterion(pred,y.unsqueeze(1))

        optimizer.zero_grad()

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the accumulated epoch loss; previously epoch_loss was never updated and
    # the (tensor) loss of the last batch alone was printed.
    print(f"Epoch {epoch},  Loss: {epoch_loss}")


In [None]:
pred_proba.squeeze(1)

In [None]:
from validations import *


test_dat = NNDataFromPd(X_test,y_test,dat_dict)
# No shuffling: predictions are compared against y_test below, so the loader must
# yield rows in the same order as X_test / y_test.
test_loader = DataLoader(test_dat,batch_size= X_test.shape[0],shuffle=False)

model.eval()
with torch.no_grad():
    for X_numeric_batch, X_categoric_batch, y_batch in test_loader:
        output = model(X_numeric_batch)
        # The model is trained with BCEWithLogitsLoss against y.unsqueeze(1), i.e. it
        # emits one logit per row. Softmax over that size-1 dimension is always 1.0
        # and argmax is always 0, so use the sigmoid and a 0.5 threshold instead.
        pred_proba = torch.sigmoid(output)
        pred_class = (pred_proba >= 0.5).long().squeeze(1)


discrete_evaluations(y_test,pred_class,pred_proba.squeeze(1),classification_type="Binary",model_path= folder_manager.output_path)

In [None]:
test_fixtures = get_team_fixtures("Liverpool",2)

In [None]:
test_fixtures

In [None]:
# Fetch per-player statistics for a single fixture from the API-Football endpoint.
# NOTE(review): this rebinds `fixture_dat`, which an earlier cell uses for the frame
# returned by calculate_fixture_stats — cells that expect that frame must be run
# before this one.
player_stat_url = "https://v3.football.api-sports.io/fixtures/players?fixture={}".format(1035045)
# timeout: requests waits indefinitely by default; raise_for_status: fail here on an
# HTTP error instead of later when the payload is parsed.
fixture_dat = requests.get(player_stat_url,headers=headers_api_sport,timeout=30)
fixture_dat.raise_for_status()

In [None]:
# Flatten the first response entry's 'players' list into one row per player.
# The outer json_normalize must receive that list (as in the next cell), not the
# DataFrame produced by the inner call.
pd.json_normalize(pd.json_normalize(fixture_dat.json()['response'])['players'][0])

In [None]:
# Flatten the first response entry's 'players' list: one row per player.
players_flat = pd.json_normalize(pd.json_normalize(fixture_dat.json()['response'])['players'][0])

# Identity columns for each player.
player_identity = players_flat[['player.id','player.name']]

# Each player's 'statistics' entry is flattened; its first element (column 0) is
# renamed to player_stats and flattened again into one column per stat.
player_stats_raw = pd.json_normalize(players_flat['statistics']).rename(columns = {0:"player_stats"})['player_stats']
player_stats_flat = pd.json_normalize(player_stats_raw)

# Identity and stats side by side, aligned on the row index.
fixture_dat_expanded = pd.concat([player_identity, player_stats_flat],axis = 1)

In [None]:
# Per (fixture, team): number of non-null opponent entries and total passes,
# newest fixture ids first.
fixtures_stat = (
    complete_data
    .groupby(['fixture_id', 'team'], as_index=False)
    .agg(
        n_opponent=('opponent', 'count'),
        total_passes=('passes_total', 'sum'),
    )
    .sort_values('fixture_id', ascending=False)
)

In [None]:
fixtures_stat

In [None]:
complete_data[complete_data.fixture_id == 1376437][['team','opponent']]

In [None]:
teams_dat[teams_dat.team_name.str.contains("Tels")]

In [None]:
angers = pd.read_parquet(home_dir + "/data/Fixtures/angers_2024.parquet")

In [None]:
angers['fixture_date'] = pd.to_datetime(angers['fixture_date'])

In [None]:
angers['fixture_date']