In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

def load_team_data(csv_path='/files/Projet_Barca/Data_set/ProjetBarca.csv'):
    """Load the team match dataset and prepare it for analysis.

    The file is read as a semicolon-separated, latin1-encoded CSV. The
    ``Date`` column is parsed as ``day.month.year``; ``xG``, ``xGA`` and
    ``Poss`` are converted from comma-decimal text to floats; and three 0/1
    indicator columns (``Victory``, ``Defeat``, ``Draw``) are derived from
    ``Result``. A short report is printed along the way.

    Args:
        csv_path: Location of the CSV file. Defaults to the project dataset,
            so existing calls without arguments keep working.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    # 1. Load the data
    df = pd.read_csv(csv_path, sep=';', encoding='latin1')

    # 2. Report what was loaded (sanity check that the dataset looks right)
    print("=" * 70)

    print(f"Dataset Shape: {df.shape[0]} matches, {df.shape[1]} columns")
    print(f"Columns: {df.columns.tolist()}")

    print("\nFirst 5 matches preview:")
    print(df.head())

    # 3. Clean the dataset
    # Parse the date text (day.month.year) into real datetime values
    df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y')

    # Convert decimal commas to dots so the values can be parsed as floats.
    # regex=False makes the replacement a plain literal substitution.
    numeric_columns = ['xG', 'xGA', 'Poss']
    for col in numeric_columns:
        df[col] = df[col].astype(str).str.replace(',', '.', regex=False).astype(float)

    # Create the target variables (one 0/1 indicator per possible result)
    df['Victory'] = (df['Result'] == 'W').astype(int)
    df['Defeat'] = (df['Result'] == 'L').astype(int)
    df['Draw'] = (df['Result'] == 'D').astype(int)

    print("\n" + "-" * 40)
    print("Results distribution")
    print("-" * 40)
    print(df['Result'].value_counts())
    print(f"Win rate: {(df['Victory'].mean()*100):.1f}%")

    return df


def load_raw_key_players_data(csv_path='/files/Projet_Barca/Data_set/Key_players.csv'):
    """Load the key players dataset and standardise the player name column.

    The file is read as a semicolon-separated, latin1-encoded CSV. The first
    header is renamed to ``Players`` and the mis-decoded spelling of
    "Martinez" is replaced by plain ASCII. A short report is printed.

    Args:
        csv_path: Location of the CSV file. Defaults to the project dataset,
            so existing calls without arguments keep working.

    Returns:
        The ``pandas.DataFrame`` with a ``Players`` column.
    """
    players_df = pd.read_csv(csv_path, sep=';', encoding='latin1')

    # The first header arrives with three stray characters in front of
    # "Player" (displayed as "ï»¿"), which is what a UTF-8 byte-order mark
    # looks like once decoded as latin1. Written with escapes so that the
    # exact characters are unambiguous.
    bom_prefixed_header = '\u00ef\u00bb\u00bfPlayer'
    players_df = players_df.rename(columns={bom_prefixed_header: 'Players'})

    # If the file has no such prefix the header is plain "Player"; rename it
    # too so the lookups below do not fail with a KeyError.
    if 'Players' not in players_df.columns and 'Player' in players_df.columns:
        players_df = players_df.rename(columns={'Player': 'Players'})

    # Same decoding issue for the accented "i" of Martinez: it shows up as
    # "Ã" followed by an invisible soft hyphen (U+00AD).
    players_df['Players'] = players_df['Players'].replace('Mart\u00c3\u00adnez', 'Martinez')

    print("\n" + "=" * 70)
    print("Key Players Data analysis")
    print("=" * 70)

    print(f"Raw dataset shape: {players_df.shape[0]} players, {players_df.shape[1]} metrics")
    print(f"Available columns: {players_df.columns.tolist()}")

    print("\nFirst rows of cleaned data:")
    print(players_df.head())

    print("\n" + "-" * 40)

    print("Column 'Players' successfully standardized")
    print(f"Players identified: {players_df['Players'].tolist()}")

    print(f"Initial dataset shape: {players_df.shape}")

    return players_df



def validate_players_data(players_df):
    """Convert comma-decimal text columns to floats and bound the percentages.

    The frame is modified in place and also returned.

    Args:
        players_df: Players ``DataFrame``. Any of the known metric columns
            that are present and not already numeric are converted.

    Returns:
        The same ``DataFrame`` object, with numeric metric columns and the
        percentage columns limited to the 0-100 range.

    Raises:
        ValueError: If a value cannot be parsed as a float after the comma
            has been replaced by a dot.
    """
    # Converting commas into points
    numeric_columns = ['Succes_P%', 'Sot%', 'TO%', 'xG', 'PrgC', 'PrgP', 'Pass', 'Shots', 'Take_On', 'Tkl', 'TklW', 'Int', 'Recov']

    for col in numeric_columns:
        if col not in players_df.columns:
            continue
        # Test for "not numeric" rather than "is object": text columns can
        # also carry a pandas string dtype, which an object check would skip.
        if pd.api.types.is_numeric_dtype(players_df[col]):
            continue
        players_df[col] = (
            players_df[col].astype(str).str.replace(',', '.', regex=False).astype(float)
        )

    # Validation of the different percentages: keep them within 0-100
    percentage_cols = ['Succes_P%', 'Sot%', 'TO%']
    for col in percentage_cols:
        if col in players_df.columns:
            players_df[col] = players_df[col].clip(0, 100)

    print("Data validation: Numeric conversion completed")

    return players_df
    
def calculate_impact_score(players_df):
    """Compute a composite impact score on a 0-100 scale (like a FIFA card).

    Each available metric is min-max normalised to 0-1 and combined as a
    weighted average, goals and assists counting the most. A metric that is
    missing from the frame, or that has no spread (all values equal, or not
    comparable), is left out and the remaining weights are rescaled so the
    result stays on the 0-100 scale.

    Args:
        players_df: Players ``DataFrame`` holding the per-90 metrics.

    Returns:
        A ``Series`` of scores rounded to one decimal, or the scalar ``0``
        when no metric could be used.
    """
    # Every metric carries its own weight, so dropping one metric can never
    # shift the weights of the others (the weights sum to 1.0).
    metric_weights = {
        'Goals_p90': 0.3,
        'Assists_p90': 0.3,
        'Progressive_Passes_p90': 0.15,
        'Progressive_Carries_p90': 0.15,
        'Successful_Passes_p90': 0.1,
    }

    weighted_total = 0
    weight_used = 0
    for metric, weight in metric_weights.items():
        if metric not in players_df.columns:
            continue

        # Min-max normalization
        min_val = players_df[metric].min()
        max_val = players_df[metric].max()
        if not max_val > min_val:
            # No spread: the metric cannot discriminate between players
            continue

        normalized = (players_df[metric] - min_val) / (max_val - min_val)
        weighted_total = weighted_total + normalized * weight
        weight_used = weight_used + weight

    if weight_used == 0:
        return 0

    # Dividing by the weight actually used keeps the best possible score at
    # 100 even when some metrics were skipped.
    return (weighted_total / weight_used * 100).round(1)
    
    

def classify_player_role(row):
    """Return the role label of a player within the team.

    Args:
        row: Mapping-like record with a ``Pos`` entry; ``Goals_p90`` and
            ``Assists_p90`` are read for midfielders and forwards only.

    Returns:
        ``'Defender'`` for ``DF``; for ``MF`` either ``'Creative Midfielder'``
        (more assists than goals per 90) or ``'Box-to-Box Midfielder'``; for
        ``FW`` one of ``'Goalscorer'``, ``'Playmaker'`` or ``'Winger'``; and
        ``'Utility Player'`` for any other position.
    """
    position = row['Pos']

    if position == 'DF':
        return 'Defender'

    if position == 'MF':
        assists_lead = row['Assists_p90'] > row['Goals_p90']
        return 'Creative Midfielder' if assists_lead else 'Box-to-Box Midfielder'

    if position == 'FW':
        # Goals are checked first; assists only matter below the goal threshold
        if row['Goals_p90'] > 0.4:
            return 'Goalscorer'
        return 'Playmaker' if row['Assists_p90'] > 0.3 else 'Winger'

    return 'Utility Player'

def create_advanced_measures(players_df):
    """Add per-90, efficiency, role and impact-score columns to the players.

    The frame is modified in place and also returned. Rates are expressed per
    90 minutes (the length of a match) so players with different playing
    time can be compared. Divisions are not guarded: a player with zero
    minutes, shots or xG yields ``inf``/``NaN`` in the derived columns.

    Args:
        players_df: Players ``DataFrame`` with the raw totals (``Goals``,
            ``Assists``, ``Min``, ``xG``, ``Shots``, ...) and the percentage
            columns ``Sot%``, ``Succes_P%`` and ``TO%``.

    Returns:
        The same ``DataFrame`` object with the new columns appended.
    """
    # Make sure every column used below is numeric (decimal commas -> dots).
    # The percentage columns are included because they are multiplied below;
    # columns that are already numeric are left untouched.
    numeric_cols_to_convert = [
        'xG', 'PrgC', 'PrgP', 'Pass', 'Shots', 'Take_On', 'Tkl', 'TklW', 'Int', 'Recov',
        'Succes_P%', 'Sot%', 'TO%',
    ]

    for col in numeric_cols_to_convert:
        if col in players_df.columns and not pd.api.types.is_numeric_dtype(players_df[col]):
            players_df[col] = (
                players_df[col].astype(str).str.replace(',', '.', regex=False).astype(float)
            )

    minutes = players_df['Min']

    def per_90(values):
        # Scale a season total to a per-90-minute rate
        return (values / minutes) * 90

    # Per 90 minutes statistics
    players_df['Goals_p90'] = per_90(players_df['Goals'])
    players_df['Assists_p90'] = per_90(players_df['Assists'])
    players_df['xG_p90'] = per_90(players_df['xG'])
    players_df['Shots_p90'] = per_90(players_df['Shots'])
    players_df['Shots_on_target_p90'] = per_90(players_df['Shots'] * players_df['Sot%'] / 100)

    # Offensive efficiency measures
    players_df['Conversion_Rate'] = (players_df['Goals'] / players_df['Shots'] * 100).round(1)
    players_df['Goal_Contribution_p90'] = players_df['Goals_p90'] + players_df['Assists_p90']
    players_df['xG_efficiency'] = (players_df['Goals'] / players_df['xG']).round(2)

    # Creative impact measures
    players_df['Progressive_Passes_p90'] = per_90(players_df['PrgP'])
    players_df['Progressive_Carries_p90'] = per_90(players_df['PrgC'])
    players_df['Total_Progressive_Actions_p90'] = players_df['Progressive_Passes_p90'] + players_df['Progressive_Carries_p90']

    # Involvement measures
    players_df['Passes_p90'] = per_90(players_df['Pass'])
    players_df['Successful_Passes_p90'] = per_90(players_df['Pass'] * players_df['Succes_P%'] / 100)

    # Defensive measures per 90 min
    players_df['Tackles_p90'] = per_90(players_df['Tkl'])
    players_df['Successful_Tackles_p90'] = per_90(players_df['TklW'])
    players_df['Interceptions_p90'] = per_90(players_df['Int'])
    players_df['Ball_Recoveries_p90'] = per_90(players_df['Recov'])

    # Dribbling and 1v1 measures
    players_df['Take_Ons_p90'] = per_90(players_df['Take_On'])
    players_df['Successful_Take_Ons_p90'] = per_90(players_df['Take_On'] * players_df['TO%'] / 100)

    # Player role classification (row by row)
    players_df['Player_Role'] = players_df.apply(classify_player_role, axis=1)

    # Overall impact score (composite metric)
    players_df['Overall_Impact_Score'] = calculate_impact_score(players_df)

    print("Advanced metrics creation completed")

    return players_df





def final_data_formatting(players_df):
    """Round the metrics, neutralise infinite values and order the players.

    Rounding and the ``xG_efficiency`` clean-up are written back to the frame
    that was passed in; the sorted, re-indexed result is a new frame.

    Args:
        players_df: Players ``DataFrame`` containing at least ``Pos``,
            ``Overall_Impact_Score`` and ``xG_efficiency``.

    Returns:
        The players sorted by position (ascending) then impact score
        (descending), with a fresh 0..n-1 index.
    """
    # Number of decimals kept per column, for readability
    decimals_by_column = {
        'Goals_p90': 2,
        'Assists_p90': 2,
        'xG_p90': 2,
        'Shots_p90': 1,
        'Conversion_Rate': 1,
        'Goal_Contribution_p90': 2,
        'Progressive_Passes_p90': 1,
        'Progressive_Carries_p90': 1,
        'Total_Progressive_Actions_p90': 1,
        'Tackles_p90': 2,
        'Interceptions_p90': 2,
        'Overall_Impact_Score': 1
    }

    # Only round the columns that actually exist in this frame
    available = [name for name in decimals_by_column if name in players_df.columns]
    for name in available:
        players_df[name] = players_df[name].round(decimals_by_column[name])

    # A division by zero leaves +/- infinity in the ratio; store 0 instead
    infinite_values = [np.inf, -np.inf]
    players_df['xG_efficiency'] = players_df['xG_efficiency'].replace(infinite_values, 0)

    # Position first, then best impact score first, with a clean index
    ordered = players_df.sort_values(
        ['Pos', 'Overall_Impact_Score'],
        ascending=[True, False],
    )
    return ordered.reset_index(drop=True)

def get_player_statistics_summary(players_df):
    """Build a summary of the players' statistics.

    Args:
        players_df: Players ``DataFrame`` with the ``Players`` and ``Pos``
            columns plus the per-90, impact and efficiency metrics read below.

    Returns:
        A dict with three entries: ``by_position`` (mean goals, assists and
        impact score per position, rounded to 2 decimals), ``top_performers``
        (name of the player with the highest value for each listed metric)
        and ``efficiency_stats`` (rounded averages over all players).
    """
    def leader(metric):
        # Name of the player holding the maximum of the given metric
        return players_df.loc[players_df[metric].idxmax(), 'Players']

    summary = {
        'by_position': players_df.groupby('Pos').agg({
            'Goals_p90': 'mean',
            'Assists_p90': 'mean',
            'Overall_Impact_Score': 'mean'
        }).round(2),

        'top_performers': {
            'most_goals': leader('Goals_p90'),
            'most_assists': leader('Assists_p90'),
            'most_creative': leader('Progressive_Passes_p90'),
            'highest_impact': leader('Overall_Impact_Score')
        },

        'efficiency_stats': {
            'avg_conversion_rate': players_df['Conversion_Rate'].mean().round(1),
            'avg_passing_accuracy': players_df['Succes_P%'].mean().round(1),
            'avg_goal_contribution': players_df['Goal_Contribution_p90'].mean().round(2)
        }
    }

    return summary


# All the processing steps combined in a single function

def clear_key_player_data(players_df):
    """Run the whole players pipeline: validation, metrics, formatting.

    Prints a short report of the result.

    Args:
        players_df: Raw players ``DataFrame`` with a ``Players`` column.

    Returns:
        The processed ``DataFrame`` produced by the formatting step.
    """
    # Data validation
    players_df = validate_players_data(players_df)

    # Create advanced metrics
    players_df = create_advanced_measures(players_df)

    # Final formatting
    players_df = final_data_formatting(players_df)

    print("\n" + "-" * 40)
    print("Data processing completed")
    print("-" * 40)

    print(f"Final dataset shape: {players_df.shape}")
    # Counts every column except the two identifying ones
    advanced_metrics_count = len([col for col in players_df.columns if col not in ['Players', 'Pos']])
    print(f"Total advanced metrics created: {advanced_metrics_count}")

    print("\nFinal players data:")
    display_cols = ['Players', 'Pos', 'Goals_p90', 'Assists_p90', 'Overall_Impact_Score', 'Player_Role']
    print(players_df[display_cols].to_string(index=False))

    print("\n" + "=" * 70)

    # Hand the processed frame back so the caller can actually use it
    return players_df






# Main execution
print("Fc Barcelona 2024-2025 season stats")
print("=" * 70)

team_df = load_team_data()
players_raw_df = load_raw_key_players_data()

# Validate the frame that was just loaded. The previous code passed
# `players_df`, a name this cell never defines, so it only worked when an
# earlier cell happened to have created it.
players_df = validate_players_data(players_raw_df)
# Keep the former result name available for any code that relies on it
player_clear_df = players_df

print("\n" + "=" * 70)

print(f"Team data: {team_df.shape[0]} matches ready for analysis")
print(f"Players data: {players_df.shape[0]} players with {players_df.shape[1]} metrics ready for analysis")
print("=" * 70)



Fc Barcelona 2024-2025 season stats
Dataset Shape: 38 matches, 13 columns
Columns: ['Date', 'Time', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss', 'Equipe_type']

First 5 matches preview:
         Date   Time        Round  Day  Venue Result  GF  GA        Opponent  \
0  17.08.2024  21:30  Matchweek 1  Sat      1      W   2   1        Valencia   
1  24.08.2024  19:00  Matchweek 2  Sat      0      W   2   1   Athletic Club   
2  27.08.2024  21:30  Matchweek 3  Tue      1      W   2   1  Rayo Vallecano   
3  31.08.2024  17:00  Matchweek 4  Sat      0      W   7   0      Valladolid   
4  15.09.2024  16:15  Matchweek 5  Sun      1      W   4   1          Girona   

    xG  xGA  Poss  Equipe_type  
0  3,2    1    63            0  
1  1,8    1    64            1  
2  1,4  0,4    64            1  
3  4,7  0,5    70            1  
4  1,9  1,3    55            1  

----------------------------------------
Results distribution
-------------------------------------