In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.append('/files/Projet_Barca/')


from Team_Data_Loader import load_team_data

def explanatory_analysis(df):
    """Print an exploratory summary of the team's matches and return the key tables.

    The report covers descriptive statistics, correlations with victory,
    a home vs away comparison, xG efficiency and a short list of key figures.

    Note:
        ``df`` is modified in place: the columns ``Opponent_tier``,
        ``Result_3class``, ``xG_efficiency`` and ``xGA_efficiency`` are added
        (and ``Victory`` when the caller did not provide it).

    Args:
        df: Match-level DataFrame. The code reads the columns ``Opponent``,
            ``Result`` ('W'/'D'/'L'), ``Venue`` (compared against 1 for home
            and 0 for away), ``Poss``, ``GF``, ``GA``, ``xG``, ``xGA`` and
            ``Equipe_type``. The statistics assume these metric columns are
            numeric -- TODO confirm the loader converts them.

    Returns:
        dict with three entries:
            ``'correlation_with_win'``: Series of correlations with
                ``Victory``, sorted in descending order.
            ``'descriptive_stats'``: output of ``describe()`` on the metrics.
            ``'processed_data'``: copy of ``df`` with NaN replaced by 0 and
                numeric values rounded to 2 decimals.
    """
    # Analysis of opponent level
    df['Opponent_tier'] = df['Opponent'].apply(assign_opponent_tier)

    # The rest of the analysis needs a binary win flag; derive it from the
    # result letter when the caller did not already provide one.
    if 'Victory' not in df.columns:
        df['Victory'] = (df['Result'] == 'W').astype(int)

    # Descriptive analysis for the team
    print("Descriptive Analysis")
    descriptive_stats = df[
        ['Poss', 'GF', 'GA', 'xG', 'xGA', 'Equipe_type', 'Opponent_tier']
    ].describe()
    print(descriptive_stats.round(2))

    # Correlations
    print("Correlation with victory")
    correlation_with_win = (
        df[['Poss', 'GF', 'GA', 'xG', 'xGA', 'Equipe_type', 'Victory', 'Opponent_tier']]
        .corr()['Victory']
        .sort_values(ascending=False)
    )
    print(correlation_with_win.round(2))

    df['Result_3class'] = df['Result'].map({'W': 2, 'D': 1, 'L': 0})

    # Performances of home games vs away games
    print("\n" + "-" * 40)
    print("Home vs Away games")
    print("-" * 40)

    home_match = df[df['Venue'] == 1]
    away_match = df[df['Venue'] == 0]

    print(f"Number of home matches:{len(home_match)}")
    print(f"Home Win Rate: {(home_match['Victory'].mean()*100):.1f}%")
    print(f"Home Goals by game: {(home_match['GF'].mean()):.2f}")

    print(f"Number of Away matches:{len(away_match)}")
    print(f"Away Win Rate: {(away_match['Victory'].mean()*100):.1f}%")
    print(f"Away Goals by game: {(away_match['GF'].mean()):.2f}")

    # xG efficiency analysis
    print("\n" + "-" * 40)
    print("xG efficiency analysis")

    # Matches with zero expected goals get an efficiency of 0 instead of inf.
    df['xG_efficiency'] = np.where(df['xG'] > 0, df['GF'] / df['xG'], 0)
    df['xGA_efficiency'] = np.where(df['xGA'] > 0, df['GA'] / df['xGA'], 0)

    df['xG_efficiency'] = df['xG_efficiency'].round(2)
    df['xGA_efficiency'] = df['xGA_efficiency'].round(2)

    print(f"Overall xG Efficiency: {df['xG_efficiency'].mean():.2f}")
    print(f"Overall xGA Efficiency: {df['xGA_efficiency'].mean():.2f}")

    # Key insights
    print("\n" + "-" * 40)
    print("key of the analysis")
    print("-" * 40)

    # Exclude 'Victory' by label (not by position) and rank by magnitude, so a
    # strongly negative correlation can also be reported as the best predictor.
    predictors = correlation_with_win.drop(labels='Victory').dropna()
    home_advantage = home_match['Victory'].mean() - away_match['Victory'].mean()

    if predictors.empty:
        print("Strongest Victory Predictor: n/a")
    else:
        strongest_predictor = predictors.abs().idxmax()
        print(
            f"Strongest Victory Predictor: {strongest_predictor} "
            f"(r={predictors.loc[strongest_predictor]:.3f})"
        )
    # The '+' format flag prints the real sign, so a negative home advantage
    # is no longer rendered as "+-x.x%".
    print(f"Home Advantage: {(home_advantage*100):+.1f}% win rate")
    print(f"Best xG Efficiency: {df['xG_efficiency'].max():.3f}")

    print("\n" + "=" * 70)
    print("Analysis finished")
    print("=" * 70)

    result_df = df.copy()
    result_df = result_df.fillna(0).round(2)

    return {
        'correlation_with_win': correlation_with_win,
        'descriptive_stats': descriptive_stats,
        'processed_data': result_df,
    }
def assign_opponent_tier(opponent_name):
    """
    Map an opponent name to a hand-assigned strength tier.

    Tier 1: Elite teams
    Tier 2: Good teams
    Tier 3: Mid-table teams
    Tier 4: Relegation teams

    Returns the tier number (1-4), or None when the name is not listed
    in any tier.
    """
    # (tier, members) pairs, checked in order from strongest to weakest.
    tier_table = (
        (1, ['Real Madrid', 'Atlético Madrid']),
        (2, ['Rayo Vallecano', 'Celta Vigo', 'Betis', 'Villarreal',
             'Athletic Club', 'Osasuna', 'Real Sociedad']),
        (3, ['Mallorca', 'Girona', 'Getafe', 'Valencia', 'Alavés',
             'Sevilla', 'Espanyol']),
        (4, ['Leganés', 'Las Palmas', 'Valladolid']),
    )

    for tier, members in tier_table:
        if opponent_name in members:
            return tier

    # Unlisted opponents have no tier.
    return None
    
    


# Fetch the match data from the project loader, then run the exploratory
# analysis on it; `results` holds the dict returned by explanatory_analysis.
team_df = load_team_data()
results = explanatory_analysis(team_df)





    
    
    


Dataset Shape: 38 matches, 13 columns
Columns: ['Date', 'Time', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss', 'Equipe_type']

First 5 matches preview:
         Date   Time        Round  Day  Venue Result  GF  GA        Opponent  \
0  17.08.2024  21:30  Matchweek 1  Sat      1      W   2   1        Valencia   
1  24.08.2024  19:00  Matchweek 2  Sat      0      W   2   1   Athletic Club   
2  27.08.2024  21:30  Matchweek 3  Tue      1      W   2   1  Rayo Vallecano   
3  31.08.2024  17:00  Matchweek 4  Sat      0      W   7   0      Valladolid   
4  15.09.2024  16:15  Matchweek 5  Sun      1      W   4   1          Girona   

    xG  xGA  Poss  Equipe_type  
0  3,2    1    63            0  
1  1,8    1    64            1  
2  1,4  0,4    64            1  
3  4,7  0,5    70            1  
4  1,9  1,3    55            1  

----------------------------------------
Results distribution
----------------------------------------
W    28
L     6
D     4
Name: Re