## Soccer_Performance_Score

## 1.1 | Combine 2 seasons

In [2]:
import pandas as pd
import os
from pathlib import Path

# Set the directory path
data_dir = "/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid"

# Get all CSV files in the directory
csv_files = []
for file in os.listdir(data_dir):
    if file.endswith('.csv'):
        csv_files.append(os.path.join(data_dir, file))

print(f"Found CSV files: {csv_files}")

# Read and combine all CSV files
dataframes = []
for file in csv_files:
    df = pd.read_csv(file)
    print(f"\nFile: {file}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    dataframes.append(df)

# Combine all dataframes
if len(dataframes) == 2:
    # If the CSV files have the same structure, use concat
    combined_df = pd.concat(dataframes, ignore_index=True)
    
    # Alternative: if you want to merge on a common column instead
    # combined_df = pd.merge(dataframes[0], dataframes[1], on='common_column', how='outer')
    
elif len(dataframes) > 2:
    # For more than 2 files
    combined_df = pd.concat(dataframes, ignore_index=True)
else:
    combined_df = dataframes[0] if dataframes else pd.DataFrame()

print(f"\nCombined DataFrame Shape (before removing duplicates): {combined_df.shape}")

# Remove duplicates
initial_shape = combined_df.shape[0]
combined_df = combined_df.drop_duplicates()
final_shape = combined_df.shape[0]

print(f"Removed {initial_shape - final_shape} duplicate rows")
print(f"Final DataFrame Shape: {combined_df.shape}")
print(f"Combined DataFrame Columns: {list(combined_df.columns)}")

# Display the head of the combined dataframe
print("\nHead of combined DataFrame:")
print(combined_df.head())

# Create output directory and save the combined dataframe
output_dir = "/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid_combined"
os.makedirs(output_dir, exist_ok=True)

# Save the combined dataframe to the new folder
output_file = os.path.join(output_dir, 'combined_real_madrid.csv')
combined_df.to_csv(output_file, index=False)

print(f"\nCombined CSV saved to: {output_file}")

Found CSV files: ['/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid/real_madrid_23_24.csv', '/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid/real_madrid_24_25.csv']

File: /Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid/real_madrid_23_24.csv
Shape: (774, 73)
Columns: ['Date', 'Competition', 'Opponent', 'Player', '#', 'Nation', 'Pos', 'Age', 'Min', ' Gls', ' Ast', ' PK', ' PKatt', ' Sh', ' SoT', ' CrdY', ' CrdR', ' Touches', ' Tkl', ' Int', ' Blocks', 'Expected xG', 'Expected npxG', 'Expected xAG', 'SCA', 'GCA', 'Passes Cmp', 'Passes Att', 'Passes Cmp%', 'Passes PrgP', 'Carries Carries', 'Carries PrgC', 'Take-Ons Att', 'Take-Ons Succ', 'Tackles Tkl', 'Tackles TklW', 'Tackles Def 3rd', 'Tackles Mid 3rd', 'Tackles Att 3rd', 'Challenges Tkl', 'Challenges Att', 'Challenges Tkl%', 'Challenges Lost', 'Blocks Blocks', 'Blocks Sh', 'Blocks Pass', 'Int', 

## Create weighted metric

In [23]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load your combined Real Madrid data
df = pd.read_csv('/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid_combined/combined_real_madrid.csv')

# Check for and handle duplicates
print("=== HANDLING DUPLICATES ===")
print(f"Initial dataset shape: {df.shape}")
initial_count = len(df)

# Remove exact duplicate rows
df_no_duplicates = df.drop_duplicates()
duplicates_removed = initial_count - len(df_no_duplicates)
print(f"Removed {duplicates_removed} exact duplicate rows")

# Reset index to avoid duplicate index issues
df = df_no_duplicates.reset_index(drop=True)
print(f"Final dataset shape: {df.shape}")

# Convert Date to datetime and add Season
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y')
df['Season'] = df['Date'].apply(lambda x: 
    f"{x.year}-{str(x.year + 1)[-2:]}" if x.month >= 8 
    else f"{x.year - 1}-{str(x.year)[-2:]}"
)

# Position grouping
position_mapping = {
    'GK': 'Goalkeeper',
    'CB': 'Defense', 'LB': 'Defense', 'RB': 'Defense',
    'DM': 'Midfield', 'CM': 'Midfield', 'LM': 'Midfield', 'RM': 'Midfield', 'AM': 'Midfield',
    'FW': 'Forward'
}
df['Position_Group'] = df['Pos'].map(position_mapping)

# First, let's check what columns we actually have
print("=== DATASET COLUMN ANALYSIS ===")
print(f"Total columns: {len(df.columns)}")
print(f"Column names: {list(df.columns)}")

# Handle missing values and convert to numeric - check if columns exist first
potential_numeric_columns = ['Tkl', 'Int', 'Blocks', 'Clr', 'Total Cmp%', 'Long Cmp%', 'Passes PrgP', 
                            'Carries PrgC', 'SCA', 'KP', 'Ast', 'Gls', ' Gls', 'Sh', ' Sh', 'SoT', ' SoT',
                            'Take-Ons Att', 'Take-Ons Succ', 'Expected xG', 'Total Att', 'Min', ' Min',
                            'Total Cmp', 'Total TotDist', 'Total PrgDist']

# Find which columns actually exist (handle spaces in column names)
existing_numeric_columns = []
for col in potential_numeric_columns:
    if col in df.columns:
        existing_numeric_columns.append(col)

print(f"\nFound these numeric columns: {existing_numeric_columns}")

# Convert existing columns to numeric
for col in existing_numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Create standardized column names (remove leading spaces)
column_mapping = {}
for col in df.columns:
    clean_col = col.strip()
    if col != clean_col:
        column_mapping[col] = clean_col

if column_mapping:
    df.rename(columns=column_mapping, inplace=True)
    print(f"Renamed columns: {column_mapping}")

# Now check for the key columns we need
required_columns = ['Gls', 'Ast', 'Tkl', 'Int', 'Blocks', 'SCA', 'KP', 'Sh', 'SoT', 'Min']
missing_columns = []
for col in required_columns:
    if col not in df.columns:
        missing_columns.append(col)
        df[col] = 0  # Create missing columns with zeros

if missing_columns:
    print(f"Created missing columns with zeros: {missing_columns}")

# Calculate Take-on Success Rate - check if columns exist
if 'Take-Ons Att' in df.columns and 'Take-Ons Succ' in df.columns:
    df['Take-Ons Succ%'] = np.where(df['Take-Ons Att'] > 0, 
                                    (df['Take-Ons Succ'] / df['Take-Ons Att'] * 100), 0)
else:
    df['Take-Ons Succ%'] = 0
    print("Take-on columns not found, setting Take-Ons Succ% to 0")

# Check for alternate column names that might exist
alternate_names = {
    'Gls': [' Gls', 'Goals', 'G'],
    'Ast': [' Ast', 'Assists', 'A'], 
    'Sh': [' Sh', 'Shots', 'Shot'],
    'SoT': [' SoT', 'Shots on Target', 'ShotsonTarget'],
    'Min': [' Min', 'Minutes', 'Playing Time'],
    'Tkl': [' Tkl', 'Tackles'],
    'Int': [' Int', 'Interceptions'],
    'SCA': [' SCA', 'Shot Creating Actions'],
    'KP': [' KP', 'Key Passes']
}

# Map alternate column names
for standard_name, alternates in alternate_names.items():
    if standard_name not in df.columns:
        for alt_name in alternates:
            if alt_name in df.columns:
                df[standard_name] = df[alt_name]
                print(f"Mapped {alt_name} to {standard_name}")
                break

print(f"\nFinal check - Key columns available:")
for col in ['Gls', 'Ast', 'Sh', 'SoT', 'Min', 'Tkl', 'Int']:
    status = "✓" if col in df.columns else "✗"
    print(f"{status} {col}")

# GOALKEEPER PERFORMANCE SCORE
def calculate_gk_score(df_gk):
    if df_gk.empty:
        return df_gk
    
    # Reset index to ensure no duplicates
    df_gk = df_gk.reset_index(drop=True).copy()
    
    # Normalize metrics (0-100 scale)
    df_gk['GK_Distribution'] = np.where(df_gk['Total Cmp%'] > 0, df_gk['Total Cmp%'], 0)
    df_gk['GK_LongBall'] = np.where(df_gk['Long Cmp%'] > 0, df_gk['Long Cmp%'], 0)
    
    # Weighted score
    df_gk['Performance_Score'] = (
        df_gk['GK_Distribution'] * 0.6 +  # 60% - Distribution accuracy
        df_gk['GK_LongBall'] * 0.4       # 40% - Long ball accuracy
    )
    
    return df_gk

# DEFENSE PERFORMANCE SCORE
def calculate_def_score(df_def):
    if df_def.empty:
        return df_def
    
    # Reset index to ensure no duplicates
    df_def = df_def.reset_index(drop=True).copy()
    
    # Check if required columns exist, use 0 if not
    tkl_col = 'Tkl' if 'Tkl' in df_def.columns else None
    int_col = 'Int' if 'Int' in df_def.columns else None
    blocks_col = 'Blocks' if 'Blocks' in df_def.columns else None
    clr_col = 'Clr' if 'Clr' in df_def.columns else None
    pass_acc_col = 'Total Cmp%' if 'Total Cmp%' in df_def.columns else None
    
    # Convert to numpy arrays and ensure they're 1D
    min_values = np.array(df_def['Min']).flatten()
    
    # Normalize per 90 minutes using numpy operations
    if tkl_col is not None:
        tkl_values = np.array(df_def[tkl_col]).flatten()
        df_def['Tkl_per90'] = np.where(min_values > 0, tkl_values / min_values * 90, 0)
    else:
        df_def['Tkl_per90'] = 0
        
    if int_col is not None:
        int_values = np.array(df_def[int_col]).flatten()
        df_def['Int_per90'] = np.where(min_values > 0, int_values / min_values * 90, 0)
    else:
        df_def['Int_per90'] = 0
        
    if blocks_col is not None:
        blocks_values = np.array(df_def[blocks_col]).flatten()
        df_def['Blocks_per90'] = np.where(min_values > 0, blocks_values / min_values * 90, 0)
    else:
        df_def['Blocks_per90'] = 0
        
    if clr_col is not None:
        clr_values = np.array(df_def[clr_col]).flatten()
        df_def['Clr_per90'] = np.where(min_values > 0, clr_values / min_values * 90, 0)
    else:
        df_def['Clr_per90'] = 0
    
    # Scale to 0-100
    max_tkl = df_def['Tkl_per90'].max() if df_def['Tkl_per90'].max() > 0 else 1
    max_int = df_def['Int_per90'].max() if df_def['Int_per90'].max() > 0 else 1
    max_blocks = df_def['Blocks_per90'].max() if df_def['Blocks_per90'].max() > 0 else 1
    max_clr = df_def['Clr_per90'].max() if df_def['Clr_per90'].max() > 0 else 1
    
    df_def['DEF_Tackles'] = (df_def['Tkl_per90'] / max_tkl * 100)
    df_def['DEF_Interceptions'] = (df_def['Int_per90'] / max_int * 100)
    df_def['DEF_Blocks'] = (df_def['Blocks_per90'] / max_blocks * 100)
    df_def['DEF_Clearances'] = (df_def['Clr_per90'] / max_clr * 100)
    df_def['DEF_PassAccuracy'] = df_def[pass_acc_col] if pass_acc_col else 0
    
    # Weighted score
    df_def['Performance_Score'] = (
        df_def['DEF_Tackles'] * 0.25 +       # 25% - Tackles
        df_def['DEF_Interceptions'] * 0.25 + # 25% - Interceptions
        df_def['DEF_Blocks'] * 0.20 +        # 20% - Blocks
        df_def['DEF_Clearances'] * 0.15 +    # 15% - Clearances
        df_def['DEF_PassAccuracy'] * 0.15    # 15% - Pass accuracy
    )
    
    return df_def

# MIDFIELD PERFORMANCE SCORE
def calculate_mid_score(df_mid):
    if df_mid.empty:
        return df_mid
    
    # Reset index to ensure no duplicates
    df_mid = df_mid.reset_index(drop=True).copy()
    
    # Check if required columns exist
    total_att_col = 'Total Att' if 'Total Att' in df_mid.columns else None
    prog_passes_col = 'Passes PrgP' if 'Passes PrgP' in df_mid.columns else None
    sca_col = 'SCA' if 'SCA' in df_mid.columns else None
    kp_col = 'KP' if 'KP' in df_mid.columns else None
    ast_col = 'Ast' if 'Ast' in df_mid.columns else None
    pass_acc_col = 'Total Cmp%' if 'Total Cmp%' in df_mid.columns else None
    
    # Convert to numpy arrays and ensure they're 1D
    min_values = np.array(df_mid['Min']).flatten()
    
    # Normalize per 90 minutes using numpy operations
    if total_att_col is not None:
        passes_values = np.array(df_mid[total_att_col]).flatten()
        df_mid['Passes_per90'] = np.where(min_values > 0, passes_values / min_values * 90, 0)
    else:
        df_mid['Passes_per90'] = 0
        
    if prog_passes_col is not None:
        prog_values = np.array(df_mid[prog_passes_col]).flatten()
        df_mid['ProgPasses_per90'] = np.where(min_values > 0, prog_values / min_values * 90, 0)
    else:
        df_mid['ProgPasses_per90'] = 0
        
    if sca_col is not None:
        sca_values = np.array(df_mid[sca_col]).flatten()
        df_mid['SCA_per90'] = np.where(min_values > 0, sca_values / min_values * 90, 0)
    else:
        df_mid['SCA_per90'] = 0
        
    if kp_col is not None:
        kp_values = np.array(df_mid[kp_col]).flatten()
        df_mid['KP_per90'] = np.where(min_values > 0, kp_values / min_values * 90, 0)
    else:
        df_mid['KP_per90'] = 0
        
    if ast_col is not None:
        ast_values = np.array(df_mid[ast_col]).flatten()
        df_mid['Ast_per90'] = np.where(min_values > 0, ast_values / min_values * 90, 0)
    else:
        df_mid['Ast_per90'] = 0
    
    # Scale to 0-100
    max_passes = df_mid['Passes_per90'].max() if df_mid['Passes_per90'].max() > 0 else 1
    max_prog = df_mid['ProgPasses_per90'].max() if df_mid['ProgPasses_per90'].max() > 0 else 1
    max_sca = df_mid['SCA_per90'].max() if df_mid['SCA_per90'].max() > 0 else 1
    max_kp = df_mid['KP_per90'].max() if df_mid['KP_per90'].max() > 0 else 1
    
    df_mid['MID_PassVolume'] = (df_mid['Passes_per90'] / max_passes * 100)
    df_mid['MID_Progressive'] = (df_mid['ProgPasses_per90'] / max_prog * 100)
    df_mid['MID_Creativity'] = (df_mid['SCA_per90'] / max_sca * 100)
    df_mid['MID_KeyPasses'] = (df_mid['KP_per90'] / max_kp * 100)
    df_mid['MID_PassAccuracy'] = df_mid[pass_acc_col] if pass_acc_col else 80  # Default decent accuracy
    df_mid['MID_Assists'] = (df_mid['Ast_per90'] * 50)  # Assists bonus
    
    # Weighted score
    df_mid['Performance_Score'] = (
        df_mid['MID_PassAccuracy'] * 0.20 +   # 20% - Pass accuracy
        df_mid['MID_Progressive'] * 0.20 +    # 20% - Progressive passes
        df_mid['MID_Creativity'] * 0.20 +     # 20% - Shot creating actions
        df_mid['MID_KeyPasses'] * 0.15 +      # 15% - Key passes
        df_mid['MID_PassVolume'] * 0.15 +     # 15% - Pass volume
        df_mid['MID_Assists'] * 0.10          # 10% - Assists
    )
    
    return df_mid

# FORWARD PERFORMANCE SCORE
def calculate_fwd_score(df_fwd):
    if df_fwd.empty:
        print("  - Forward subset is empty")
        return df_fwd
    
    print(f"  - Processing {len(df_fwd)} forward records")
    
    # Ensure we have a clean copy with reset index
    df_fwd = df_fwd.copy().reset_index(drop=True)
    
    # Check if required columns exist
    gls_col = 'Gls' if 'Gls' in df_fwd.columns else None
    ast_col = 'Ast' if 'Ast' in df_fwd.columns else None
    sh_col = 'Sh' if 'Sh' in df_fwd.columns else None
    sot_col = 'SoT' if 'SoT' in df_fwd.columns else None
    xg_col = 'Expected xG' if 'Expected xG' in df_fwd.columns else None
    
    print(f"  - Available columns: Gls={gls_col is not None}, Ast={ast_col is not None}, Sh={sh_col is not None}")
    
    # Safely extract minutes as 1D array
    try:
        min_values = pd.to_numeric(df_fwd['Min'], errors='coerce').fillna(0).values
        min_values = np.array(min_values).flatten()
        print(f"  - Min values shape: {min_values.shape}")
    except Exception as e:
        print(f"  - Error extracting minutes: {e}")
        min_values = np.zeros(len(df_fwd))
    
    # Normalize per 90 minutes using safe operations
    if gls_col is not None:
        try:
            gls_values = pd.to_numeric(df_fwd[gls_col], errors='coerce').fillna(0).values
            gls_values = np.array(gls_values).flatten()
            print(f"  - Goals values shape: {gls_values.shape}")
            
            if len(gls_values) == len(min_values):
                df_fwd['Gls_per90'] = np.where(min_values > 0, gls_values / min_values * 90, 0)
            else:
                print(f"  - Shape mismatch for goals: {len(gls_values)} vs {len(min_values)}")
                df_fwd['Gls_per90'] = 0
        except Exception as e:
            print(f"  - Error processing goals: {e}")
            df_fwd['Gls_per90'] = 0
    else:
        df_fwd['Gls_per90'] = 0
        
    if ast_col is not None:
        try:
            ast_values = pd.to_numeric(df_fwd[ast_col], errors='coerce').fillna(0).values
            ast_values = np.array(ast_values).flatten()
            print(f"  - Assists values shape: {ast_values.shape}")
            
            if len(ast_values) == len(min_values):
                df_fwd['Ast_per90'] = np.where(min_values > 0, ast_values / min_values * 90, 0)
            else:
                print(f"  - Shape mismatch for assists: {len(ast_values)} vs {len(min_values)}")
                df_fwd['Ast_per90'] = 0
        except Exception as e:
            print(f"  - Error processing assists: {e}")
            df_fwd['Ast_per90'] = 0
    else:
        df_fwd['Ast_per90'] = 0
        
    if sh_col is not None:
        try:
            sh_values = pd.to_numeric(df_fwd[sh_col], errors='coerce').fillna(0).values
            sh_values = np.array(sh_values).flatten()
            if len(sh_values) == len(min_values):
                df_fwd['Shots_per90'] = np.where(min_values > 0, sh_values / min_values * 90, 0)
            else:
                df_fwd['Shots_per90'] = 0
        except Exception as e:
            print(f"  - Error processing shots: {e}")
            df_fwd['Shots_per90'] = 0
    else:
        df_fwd['Shots_per90'] = 0
        
    if sot_col is not None:
        try:
            sot_values = pd.to_numeric(df_fwd[sot_col], errors='coerce').fillna(0).values
            sot_values = np.array(sot_values).flatten()
            if len(sot_values) == len(min_values):
                df_fwd['SoT_per90'] = np.where(min_values > 0, sot_values / min_values * 90, 0)
            else:
                df_fwd['SoT_per90'] = 0
        except Exception as e:
            print(f"  - Error processing shots on target: {e}")
            df_fwd['SoT_per90'] = 0
    else:
        df_fwd['SoT_per90'] = 0
        
    if xg_col is not None:
        try:
            xg_values = pd.to_numeric(df_fwd[xg_col], errors='coerce').fillna(0).values
            xg_values = np.array(xg_values).flatten()
            if len(xg_values) == len(min_values):
                df_fwd['xG_per90'] = np.where(min_values > 0, xg_values / min_values * 90, 0)
            else:
                df_fwd['xG_per90'] = 0
        except Exception as e:
            print(f"  - Error processing xG: {e}")
            df_fwd['xG_per90'] = 0
    else:
        df_fwd['xG_per90'] = 0
    
    # Scale to 0-100
    max_goals = df_fwd['Gls_per90'].max() if df_fwd['Gls_per90'].max() > 0 else 1
    max_shots = df_fwd['Shots_per90'].max() if df_fwd['Shots_per90'].max() > 0 else 1
    max_sot = df_fwd['SoT_per90'].max() if df_fwd['SoT_per90'].max() > 0 else 1
    max_xg = df_fwd['xG_per90'].max() if df_fwd['xG_per90'].max() > 0 else 1
    
    df_fwd['FWD_Goals'] = (df_fwd['Gls_per90'] / max_goals * 100)
    df_fwd['FWD_Assists'] = (df_fwd['Ast_per90'] * 50)  # Assists bonus
    df_fwd['FWD_Shots'] = (df_fwd['Shots_per90'] / max_shots * 100)
    df_fwd['FWD_ShotsOnTarget'] = (df_fwd['SoT_per90'] / max_sot * 100)
    df_fwd['FWD_ExpectedGoals'] = (df_fwd['xG_per90'] / max_xg * 100)
    df_fwd['FWD_TakeOnSuccess'] = df_fwd['Take-Ons Succ%'] if 'Take-Ons Succ%' in df_fwd.columns else 0
    
    # Weighted score
    df_fwd['Performance_Score'] = (
        df_fwd['FWD_Goals'] * 0.35 +           # 35% - Goals
        df_fwd['FWD_Assists'] * 0.20 +         # 20% - Assists
        df_fwd['FWD_ShotsOnTarget'] * 0.15 +   # 15% - Shots on target
        df_fwd['FWD_ExpectedGoals'] * 0.15 +   # 15% - Expected goals
        df_fwd['FWD_TakeOnSuccess'] * 0.10 +   # 10% - Take-on success
        df_fwd['FWD_Shots'] * 0.05             # 5% - Shot volume
    )
    
    print(f"  - Completed forward scoring. Score range: {df_fwd['Performance_Score'].min():.1f} - {df_fwd['Performance_Score'].max():.1f}")
    
    return df_fwd

# Apply scoring by position group
df['Performance_Score'] = 0

# Process each position group separately to avoid index conflicts
for position_group in df['Position_Group'].unique():
    if pd.isna(position_group):
        continue
        
    print(f"\nProcessing {position_group} players...")
    
    # Create a clean subset with proper filtering
    mask = df['Position_Group'] == position_group
    subset = df[mask].copy()
    original_length = len(subset)
    
    # Reset index completely
    subset = subset.reset_index(drop=True)
    
    print(f"Found {len(subset)} {position_group} records")
    
    try:
        if position_group == 'Goalkeeper':
            subset = calculate_gk_score(subset)
        elif position_group == 'Defense':
            subset = calculate_def_score(subset)
        elif position_group == 'Midfield':
            subset = calculate_mid_score(subset)
        elif position_group == 'Forward':
            subset = calculate_fwd_score(subset)
        
        # Verify lengths match before updating
        if len(subset) == original_length:
            # Update the main dataframe with the calculated scores using loc indexing
            original_indices = df[mask].index
            df.loc[original_indices, 'Performance_Score'] = subset['Performance_Score'].values
            print(f"✓ Successfully calculated scores for {position_group}")
        else:
            print(f"✗ Length mismatch for {position_group}: {len(subset)} vs {original_length}")
            
    except Exception as e:
        print(f"✗ Error processing {position_group}: {e}")
        # Set default scores for this position group
        df.loc[mask, 'Performance_Score'] = 50  # Default middle score

# Cap scores at 100
df['Performance_Score'] = df['Performance_Score'].clip(0, 100)
print(f"\nCompleted performance score calculation. Score range: {df['Performance_Score'].min():.1f} - {df['Performance_Score'].max():.1f}")

# Create the combined dataset with all new columns
combined_df = df.copy()

# Create summary table by Season and Position
try:
    season_position_summary = combined_df.groupby(['Season', 'Position_Group']).agg({
        'Performance_Score': ['mean', 'max', 'min', 'std', 'count'],
        'Player': 'nunique',
        'Min': 'sum',
        'Gls': 'sum',
        'Ast': 'sum'
    }).round(2)

    season_position_summary.columns = ['Avg_Performance', 'Max_Performance', 'Min_Performance', 
                                      'Std_Performance', 'Total_Matches', 'Unique_Players', 
                                      'Total_Minutes', 'Total_Goals', 'Total_Assists']
    season_position_summary = season_position_summary.reset_index()
except Exception as e:
    print(f"Error creating season position summary: {e}")
    # Create a simple summary instead
    season_position_summary = combined_df.groupby(['Season', 'Position_Group'])['Performance_Score'].mean().reset_index()
    season_position_summary.columns = ['Season', 'Position_Group', 'Avg_Performance']

# Create summary table by Player, Season, and Position
try:
    player_season_summary = combined_df.groupby(['Player', 'Position_Group', 'Season']).agg({
        'Performance_Score': ['mean', 'max', 'min', 'count'],
        'Min': 'sum',
        'Gls': 'sum',
        'Ast': 'sum',
        'Date': ['min', 'max']
    }).round(2)

    player_season_summary.columns = ['Avg_Performance_Score', 'Max_Performance_Score', 
                                    'Min_Performance_Score', 'Games_Played', 'Total_Minutes', 
                                    'Total_Goals', 'Total_Assists', 'First_Game', 'Last_Game']
    player_season_summary = player_season_summary.reset_index()
except Exception as e:
    print(f"Error creating player season summary: {e}")
    # Create a simple summary instead
    player_season_summary = combined_df.groupby(['Player', 'Position_Group', 'Season']).agg({
        'Performance_Score': 'mean',
        'Min': 'sum'
    }).reset_index()
    player_season_summary.columns = ['Player', 'Position_Group', 'Season', 'Avg_Performance_Score', 'Total_Minutes']

# Display comprehensive summary
print("\n=== REAL MADRID PERFORMANCE SCORING SYSTEM ===")
if len(player_season_summary) > 0:
    print("Top 10 Players by Average Performance Score:")
    if 'Avg_Performance_Score' in player_season_summary.columns:
        top_players = player_season_summary.nlargest(10, 'Avg_Performance_Score')
        display_cols = [col for col in ['Player', 'Position_Group', 'Season', 'Avg_Performance_Score', 'Games_Played'] 
                       if col in top_players.columns]
        print(top_players[display_cols].to_string(index=False))

print("\n\n=== SUMMARY BY SEASON AND POSITION ===")
if len(season_position_summary) > 0:
    print(season_position_summary.to_string(index=False))

print("\n\n=== TOP PERFORMERS BY SEASON ===")
if len(player_season_summary) > 0 and 'Avg_Performance_Score' in player_season_summary.columns:
    for season in sorted(combined_df['Season'].unique()):
        if pd.notna(season):
            season_data = player_season_summary[player_season_summary['Season'] == season]
            if len(season_data) > 0:
                top_performers = season_data.nlargest(5, 'Avg_Performance_Score')
                print(f"\n{season} Season - Top 5 Performers:")
                display_cols = [col for col in ['Player', 'Position_Group', 'Avg_Performance_Score', 'Games_Played'] 
                               if col in top_performers.columns]
                print(top_performers[display_cols].to_string(index=False))

print("\n\n=== POSITION ANALYSIS ACROSS SEASONS ===")
try:
    position_across_seasons = combined_df.groupby(['Position_Group', 'Season'])['Performance_Score'].mean().unstack(fill_value=0).round(2)
    print(position_across_seasons.to_string())
except Exception as e:
    print(f"Could not create position analysis: {e}")
    # Simple alternative
    print(combined_df.groupby(['Position_Group', 'Season'])['Performance_Score'].mean().round(2).to_string())

# Save all files with error handling
try:
    # 1. Enhanced combined dataset with all columns
    enhanced_path = '/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid_combined/real_madrid_enhanced_complete.csv'
    combined_df.to_csv(enhanced_path, index=False)
    print(f"✓ Enhanced dataset saved: {enhanced_path}")
except Exception as e:
    print(f"✗ Error saving enhanced dataset: {e}")

try:
    # 2. Season-Position summary
    season_pos_path = '/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid_combined/summary_by_season_position.csv'
    season_position_summary.to_csv(season_pos_path, index=False)
    print(f"✓ Season-Position summary saved: {season_pos_path}")
except Exception as e:
    print(f"✗ Error saving season-position summary: {e}")

try:
    # 3. Player-Season summary
    player_season_path = '/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid_combined/player_summary_by_season.csv'
    player_season_summary.to_csv(player_season_path, index=False)
    print(f"✓ Player summary saved: {player_season_path}")
except Exception as e:
    print(f"✗ Error saving player summary: {e}")

try:
    # 4. Position performance across seasons
    position_seasons_path = '/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid_combined/position_performance_across_seasons.csv'
    if 'position_across_seasons' in locals():
        position_across_seasons.to_csv(position_seasons_path)
        print(f"✓ Position analysis saved: {position_seasons_path}")
    else:
        # Create simple version
        simple_position_analysis = combined_df.groupby(['Position_Group', 'Season'])['Performance_Score'].mean().reset_index()
        simple_position_analysis.to_csv(position_seasons_path, index=False)
        print(f"✓ Simple position analysis saved: {position_seasons_path}")
except Exception as e:
    print(f"✗ Error saving position analysis: {e}")

print(f"\n=== DATASET INFO ===")
print(f"Total records in enhanced dataset: {len(combined_df)}")
print(f"Seasons covered: {sorted(combined_df['Season'].unique())}")
print(f"Position groups: {combined_df['Position_Group'].value_counts().to_dict()}")
print(f"Unique players: {combined_df['Player'].nunique()}")

# Show sample of enhanced dataset
print(f"\n=== SAMPLE OF ENHANCED DATASET ===")
sample_cols = ['Date', 'Season', 'Player', 'Position_Group', 'Performance_Score', 'Min', 'Gls', 'Ast']
available_cols = [col for col in sample_cols if col in combined_df.columns]
print(combined_df[available_cols].head(15).to_string(index=False))

print("\n🏆 REAL MADRID PERFORMANCE ANALYSIS COMPLETE! 🏆")

=== HANDLING DUPLICATES ===
Initial dataset shape: (1550, 73)
Removed 0 exact duplicate rows
Final dataset shape: (1550, 73)
=== DATASET COLUMN ANALYSIS ===
Total columns: 75
Column names: ['Date', 'Competition', 'Opponent', 'Player', 'number', 'Nation', 'Pos', 'Age', 'Min', ' Gls', ' Ast', ' PK', ' PKatt', ' Sh', ' SoT', ' CrdY', ' CrdR', ' Touches', ' Tkl', ' Int', ' Blocks', 'Expected xG', 'Expected npxG', 'Expected xAG', 'SCA', 'GCA', 'Passes Cmp', 'Passes Att', 'Passes Cmp%', 'Passes PrgP', 'Carries Carries', 'Carries PrgC', 'Take-Ons Att', 'Take-Ons Succ', 'Tackles Tkl', 'Tackles TklW', 'Tackles Def 3rd', 'Tackles Mid 3rd', 'Tackles Att 3rd', 'Challenges Tkl', 'Challenges Att', 'Challenges Tkl%', 'Challenges Lost', 'Blocks Blocks', 'Blocks Sh', 'Blocks Pass', 'Int', 'Tkl+Int', 'Clr', 'Err', 'Total Cmp', 'Total Att', 'Total Cmp%', 'Total TotDist', 'Total PrgDist', 'Short Cmp', 'Short Att', 'Short Cmp%', 'Medium Cmp', 'Medium Att', 'Medium Cmp%', 'Long Cmp', 'Long Att', 'Long Cmp%'

  7.26760563  3.5        10.          0.          8.88383838  4.
  5.22222222  1.97674419  0.         16.39344262 10.11111111  3.61111111
  3.55555556  0.44444444  4.16666667  8.83928571  3.92156863  9.12790698
 10.          0.         10.86956522  8.20137694  0.          1.14864865
  0.          7.21126761  4.47368421  4.22222222  3.70689655  0.
  1.53846154  0.          4.61111111  3.92857143  0.          0.28089888
  0.         10.56451613  5.17857143  6.66666667  4.88888889 15.98684211
  0.         12.42165242  0.          9.96969697 12.88311688  0.38888889
  0.         10.93023256 15.29411765  9.58333333  0.          0.07246377
  6.47222222  6.64556962  0.          6.22222222  8.27777778 11.00401606
  0.          2.84019975  0.          6.4         0.          6.31961259
  0.          6.22222222  5.11111111  2.66666667  0.          5.04054054
  8.75        6.7816092   0.          1.26666667 11.21428571  7.
  0.          9.30555556  0.          3.76811594  6.66666667  0.96385542
  

## Second code

In [30]:
import pandas as pd
import numpy as np
import os

print("=== REAL MADRID DATA PREPROCESSING & PERFORMANCE ANALYSIS ===")

# Load the data
original_path = '/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid_combined/combined_real_madrid.csv'

try:
    df = pd.read_csv(original_path)
    print("✓ Loaded dataset:", df.shape)
except FileNotFoundError:
    print("❌ File not found:", original_path)
    exit()

print("Original columns:", len(df.columns))

# ==========================================
# PHASE 1: DATA CLEANING & PREPROCESSING
# ==========================================

print("\n🧹 PHASE 1: DATA CLEANING & PREPROCESSING")

# 1. Remove problematic columns
columns_to_remove = ['Match URL', 'Challenges Tkl%', 'number']
existing_cols_to_remove = [col for col in columns_to_remove if col in df.columns]
if existing_cols_to_remove:
    df = df.drop(columns=existing_cols_to_remove)
    print("✓ Removed columns:", existing_cols_to_remove)

# 2. Clean Age column (remove everything after the dash)
if 'Age' in df.columns:
    print("✓ Cleaning Age column...")
    df['Age'] = df['Age'].astype(str).str.split('-').str[0]
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce').fillna(0).astype(int)
    print("  Age range:", df['Age'].min(), "-", df['Age'].max())

# 3. Clean Nation column (keep only last 3 characters)
if 'Nation' in df.columns:
    print("✓ Cleaning Nation column...")
    df['Nation'] = df['Nation'].astype(str).str[-3:]
    print("  Sample nations:", df['Nation'].unique()[:10].tolist())

# 4. Clean column names
df.columns = df.columns.str.strip()
print("✓ Column names cleaned")

# 5. Setup Date and Season
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y', errors='coerce')
df['Season'] = df['Date'].apply(lambda x: 
    str(x.year) + "-" + str(x.year + 1)[-2:] if pd.notna(x) and x.month >= 8 
    else str(x.year - 1) + "-" + str(x.year)[-2:] if pd.notna(x) else "Unknown"
)

# 6. Position mapping
position_mapping = {
    'GK': 'Goalkeeper', 'CB': 'Defense', 'LB': 'Defense', 'RB': 'Defense',
    'DM': 'Midfield', 'CM': 'Midfield', 'LM': 'Midfield', 'RM': 'Midfield', 'AM': 'Midfield',
    'FW': 'Forward'
}
df['Position_Group'] = df['Pos'].map(position_mapping)

print("✓ Basic setup complete")
print("Cleaned dataset shape:", df.shape)
print("Position distribution:", df['Position_Group'].value_counts().to_dict())

# 7. Clean key statistical columns
def safe_numeric_conversion(series, default_value=0):
    """Convert series to numeric safely"""
    try:
        # Convert to string first, handle empty/null values
        series_str = series.astype(str).replace(['nan', 'NaN', '', ' ', 'None'], '0')
        # Convert to numeric
        result = pd.to_numeric(series_str, errors='coerce').fillna(default_value)
        return result
    except:
        # If anything fails, return default values
        return pd.Series([default_value] * len(series))

# Key columns we need for analysis
key_stats = {
    'Min': 0, 'Gls': 0, 'Ast': 0, 'Sh': 0, 'SoT': 0, 'Tkl': 0, 'Int': 0, 
    'Blocks': 0, 'Clr': 0, 'SCA': 0, 'KP': 0, 'Expected xG': 0,
    'Total Cmp%': 0, 'Long Cmp%': 0, 'Total Att': 0, 'Passes PrgP': 0
}

print("\nCleaning statistical columns...")
for col, default in key_stats.items():
    if col in df.columns:
        print("  Processing", col)
        df[col] = safe_numeric_conversion(df[col], default)
        
        # Safe min/max calculation with error handling
        try:
            col_values = df[col]
            if len(col_values) > 0:
                # Ensure all values are numeric
                numeric_values = pd.to_numeric(col_values, errors='coerce')
                if not numeric_values.isnull().all():
                    min_val = numeric_values.min()
                    max_val = numeric_values.max()
                    print("    ✓ Range:", round(min_val, 1), "-", round(max_val, 1))
                else:
                    print("    ✓ All values set to default:", default)
            else:
                print("    ✓ Empty column, set to default:", default)
        except Exception as e:
            print("    ⚠️ Could not calculate range, but column processed")
    else:
        df[col] = default
        print("  + Created", col, "=", default)

# 8. Handle Take-ons
if 'Take-Ons Att' in df.columns and 'Take-Ons Succ' in df.columns:
    df['Take-Ons Att'] = safe_numeric_conversion(df['Take-Ons Att'], 0)
    df['Take-Ons Succ'] = safe_numeric_conversion(df['Take-Ons Succ'], 0)
    df['Take-Ons Succ%'] = np.where(df['Take-Ons Att'] > 0, 
                                   (df['Take-Ons Succ'] / df['Take-Ons Att'] * 100), 0)
    print("✓ Take-on statistics calculated")
else:
    df['Take-Ons Succ%'] = 0
    print("✓ Take-on columns not found, set to 0")

print("✓ Data preprocessing completed")

# ==========================================
# PHASE 2: PERFORMANCE SCORING
# ==========================================

print("\n⚽ PHASE 2: CALCULATING PERFORMANCE SCORES")

df['Performance_Score'] = 0.0

# Helper function for per-90 calculations
def per_90(stat, minutes):
    return np.where(minutes > 0, stat / minutes * 90, 0)

# Safe function to get numeric min/max
def safe_min_max(series):
    try:
        numeric_series = pd.to_numeric(series, errors='coerce')
        if not numeric_series.isnull().all():
            return numeric_series.min(), numeric_series.max()
        else:
            return 0.0, 0.0
    except:
        return 0.0, 0.0

# GOALKEEPERS
gk_mask = df['Position_Group'] == 'Goalkeeper'
if gk_mask.sum() > 0:
    print("\nGoalkeepers:", gk_mask.sum(), "records")
    gk_score = df.loc[gk_mask, 'Total Cmp%'] * 0.6 + df.loc[gk_mask, 'Long Cmp%'] * 0.4
    df.loc[gk_mask, 'Performance_Score'] = gk_score
    min_score, max_score = safe_min_max(gk_score)
    print("  ✓ Score range:", round(min_score, 1), "-", round(max_score, 1))

# FORWARDS
fw_mask = df['Position_Group'] == 'Forward'
if fw_mask.sum() > 0:
    print("\nForwards:", fw_mask.sum(), "records")
    
    fw_data = df[fw_mask].copy()
    
    # Calculate per-90 stats
    fw_data['Gls_90'] = per_90(fw_data['Gls'], fw_data['Min'])
    fw_data['Ast_90'] = per_90(fw_data['Ast'], fw_data['Min'])
    fw_data['Sh_90'] = per_90(fw_data['Sh'], fw_data['Min'])
    fw_data['SoT_90'] = per_90(fw_data['SoT'], fw_data['Min'])
    fw_data['xG_90'] = per_90(fw_data['Expected xG'], fw_data['Min'])
    
    # Get max values for normalization (avoid division by zero)
    _, max_g = safe_min_max(fw_data['Gls_90'])
    _, max_s = safe_min_max(fw_data['Sh_90'])
    _, max_sot = safe_min_max(fw_data['SoT_90'])
    _, max_xg = safe_min_max(fw_data['xG_90'])
    
    max_g = max(max_g, 0.1)
    max_s = max(max_s, 0.1)
    max_sot = max(max_sot, 0.1)
    max_xg = max(max_xg, 0.1)
    
    # Calculate weighted score
    fw_score = (
        (fw_data['Gls_90'] / max_g * 100) * 0.35 +      # Goals 35%
        (fw_data['Ast_90'] * 25) * 0.20 +               # Assists 20%
        (fw_data['Sh_90'] / max_s * 100) * 0.05 +       # Shots 5%
        (fw_data['SoT_90'] / max_sot * 100) * 0.15 +    # SoT 15%
        (fw_data['xG_90'] / max_xg * 100) * 0.15 +      # xG 15%
        fw_data['Take-Ons Succ%'] * 0.10                # Take-ons 10%
    )
    
    df.loc[fw_mask, 'Performance_Score'] = fw_score
    min_score, max_score = safe_min_max(fw_score)
    print("  ✓ Score range:", round(min_score, 1), "-", round(max_score, 1))

# MIDFIELDERS
mid_mask = df['Position_Group'] == 'Midfield'
if mid_mask.sum() > 0:
    print("\nMidfielders:", mid_mask.sum(), "records")
    
    mid_data = df[mid_mask].copy()
    
    # Calculate per-90 stats
    mid_data['Pass_90'] = per_90(mid_data['Total Att'], mid_data['Min'])
    mid_data['Prog_90'] = per_90(mid_data['Passes PrgP'], mid_data['Min'])
    mid_data['SCA_90'] = per_90(mid_data['SCA'], mid_data['Min'])
    mid_data['KP_90'] = per_90(mid_data['KP'], mid_data['Min'])
    mid_data['Ast_90'] = per_90(mid_data['Ast'], mid_data['Min'])
    
    # Get max values for normalization
    _, max_pass = safe_min_max(mid_data['Pass_90'])
    _, max_prog = safe_min_max(mid_data['Prog_90'])
    _, max_sca = safe_min_max(mid_data['SCA_90'])
    _, max_kp = safe_min_max(mid_data['KP_90'])
    
    max_pass = max(max_pass, 0.1)
    max_prog = max(max_prog, 0.1)
    max_sca = max(max_sca, 0.1)
    max_kp = max(max_kp, 0.1)
    
    # Calculate weighted score
    mid_score = (
        mid_data['Total Cmp%'] * 0.20 +                         # Pass accuracy 20%
        (mid_data['Pass_90'] / max_pass * 100) * 0.15 +         # Pass volume 15%
        (mid_data['Prog_90'] / max_prog * 100) * 0.20 +         # Progressive 20%
        (mid_data['SCA_90'] / max_sca * 100) * 0.20 +           # Creativity 20%
        (mid_data['KP_90'] / max_kp * 100) * 0.15 +             # Key passes 15%
        (mid_data['Ast_90'] * 25) * 0.10                        # Assists 10%
    )
    
    df.loc[mid_mask, 'Performance_Score'] = mid_score
    min_score, max_score = safe_min_max(mid_score)
    print("  ✓ Score range:", round(min_score, 1), "-", round(max_score, 1))

# DEFENDERS
def_mask = df['Position_Group'] == 'Defense'
if def_mask.sum() > 0:
    print("\nDefenders:", def_mask.sum(), "records")
    
    def_data = df[def_mask].copy()
    
    # Calculate per-90 stats
    def_data['Tkl_90'] = per_90(def_data['Tkl'], def_data['Min'])
    def_data['Int_90'] = per_90(def_data['Int'], def_data['Min'])
    def_data['Blk_90'] = per_90(def_data['Blocks'], def_data['Min'])
    def_data['Clr_90'] = per_90(def_data['Clr'], def_data['Min'])
    
    # Get max values for normalization
    _, max_tkl = safe_min_max(def_data['Tkl_90'])
    _, max_int = safe_min_max(def_data['Int_90'])
    _, max_blk = safe_min_max(def_data['Blk_90'])
    _, max_clr = safe_min_max(def_data['Clr_90'])
    
    max_tkl = max(max_tkl, 0.1)
    max_int = max(max_int, 0.1)
    max_blk = max(max_blk, 0.1)
    max_clr = max(max_clr, 0.1)
    
    # Calculate weighted score
    def_score = (
        (def_data['Tkl_90'] / max_tkl * 100) * 0.25 +           # Tackles 25%
        (def_data['Int_90'] / max_int * 100) * 0.25 +           # Interceptions 25%
        (def_data['Blk_90'] / max_blk * 100) * 0.20 +           # Blocks 20%
        (def_data['Clr_90'] / max_clr * 100) * 0.15 +           # Clearances 15%
        def_data['Total Cmp%'] * 0.15                           # Pass accuracy 15%
    )
    
    df.loc[def_mask, 'Performance_Score'] = def_score
    min_score, max_score = safe_min_max(def_score)
    print("  ✓ Score range:", round(min_score, 1), "-", round(max_score, 1))

# Cap all scores at 100
df['Performance_Score'] = df['Performance_Score'].clip(0, 100)

print("\n✅ SCORING COMPLETED")
overall_min = float(df['Performance_Score'].min())
overall_max = float(df['Performance_Score'].max())
print("Overall range:", round(overall_min, 1), "-", round(overall_max, 1))

# ==========================================
# PHASE 3: ANALYSIS & RESULTS
# ==========================================

print("\n📊 PHASE 3: ANALYSIS & RESULTS")

# Check Antonio Rüdiger specifically
print("\n=== VALIDATION CHECK ===")
rudiger = df[df['Player'].str.contains('Rüdiger', na=False, case=False)]
if len(rudiger) > 0:
    rudiger_avg = rudiger['Performance_Score'].mean()
    print("✓ Rüdiger average score:", round(rudiger_avg, 1), "(should not be 50.0)")
    rudiger_best = rudiger.nlargest(3, 'Performance_Score')[
        ['Date', 'Performance_Score', 'Min', 'Tkl', 'Int', 'Blocks', 'Total Cmp%']
    ]
    print("✓ Rüdiger's top performances:")
    print(rudiger_best.to_string(index=False))

# Top performances
print("\n=== TOP 15 INDIVIDUAL PERFORMANCES ===")
top_individual = df.nlargest(15, 'Performance_Score')[
    ['Date', 'Player', 'Position_Group', 'Performance_Score', 'Min', 'Gls', 'Ast', 'Opponent']
]
print(top_individual.to_string(index=False))

# Player averages
print("\n=== BEST SEASON AVERAGES (500+ minutes) ===")
player_avg = df.groupby(['Player', 'Position_Group', 'Season']).agg({
    'Performance_Score': 'mean',
    'Min': 'sum',
    'Gls': 'sum',
    'Ast': 'sum',
    'Age': 'first',
    'Nation': 'first'
}).round(2).reset_index()

significant_players = player_avg[player_avg['Min'] >= 500]
top_averages = significant_players.nlargest(15, 'Performance_Score')
print(top_averages.to_string(index=False))

# ==========================================
# PHASE 4: SAVE RESULTS
# ==========================================

print("\n💾 PHASE 4: SAVING RESULTS")

output_dir = '/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid_combined'
os.makedirs(output_dir, exist_ok=True)

# Save complete cleaned dataset
final_path = output_dir + '/real_madrid_cleaned_with_scores.csv'
df.to_csv(final_path, index=False)

# Save player averages
averages_path = output_dir + '/player_season_averages_clean.csv'
player_avg.to_csv(averages_path, index=False)

print("✅ SAVED:")
print("📊 Complete cleaned data:", final_path)
print("🏆 Player averages:", averages_path)

print("\n🎯 FINAL SUMMARY:")
print("• Processed", len(df), "match records")
print("• Unique players:", df['Player'].nunique())
print("• Age range:", df['Age'].min(), "-", df['Age'].max(), "years")
print("• Nations:", df['Nation'].nunique(), "different countries")
print("• Players with 500+ minutes:", len(significant_players))
best_performer = df.loc[df['Performance_Score'].idxmax(), 'Player']
best_score = float(df['Performance_Score'].max())
print("• Best performer:", best_performer, "(" + str(round(best_score, 1)) + ")")

print("\n🏆 REAL MADRID ANALYSIS COMPLETE! 🏆")

=== REAL MADRID DATA PREPROCESSING & PERFORMANCE ANALYSIS ===
✓ Loaded dataset: (1550, 73)
Original columns: 73

🧹 PHASE 1: DATA CLEANING & PREPROCESSING
✓ Removed columns: ['Match URL', 'Challenges Tkl%', 'number']
✓ Cleaning Age column...
  Age range: 17 - 39
✓ Cleaning Nation column...
  Sample nations: ['BRA', 'CRO', 'ESP', 'ENG', 'FRA', 'GER', 'URU', 'AUT', 'UKR', 'MAR']
✓ Column names cleaned
✓ Basic setup complete
Cleaned dataset shape: (1550, 72)
Position distribution: {'Defense': 483, 'Midfield': 410, 'Forward': 211, 'Goalkeeper': 104}

Cleaning statistical columns...
  Processing Min
    ✓ Range: 1 - 120
  Processing Gls
    ✓ Range: 0 - 3
  Processing Ast
    ⚠️ Could not calculate range, but column processed
  Processing Sh
    ✓ Range: 0 - 11
  Processing SoT
    ✓ Range: 0 - 5
  Processing Tkl
    ✓ Range: 0 - 10
  Processing Int
    ⚠️ Could not calculate range, but column processed
  Processing Blocks
    ✓ Range: 0 - 7
  Processing Clr
    ✓ Range: 0 - 14
  Processing 

ValueError: cannot reindex on an axis with duplicate labels

In [31]:
import pandas as pd
import numpy as np
import os

print("=== REAL MADRID DATA PREPROCESSING & PERFORMANCE ANALYSIS ===")

# Load the data
original_path = '/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid_combined/combined_real_madrid.csv'

try:
    df = pd.read_csv(original_path)
    print("✓ Loaded dataset:", df.shape)
except FileNotFoundError:
    print("❌ File not found:", original_path)
    exit()

print("Original columns:", len(df.columns))

# ==========================================
# PHASE 1: DATA CLEANING & PREPROCESSING
# ==========================================

print("\n🧹 PHASE 1: DATA CLEANING & PREPROCESSING")

# 1. Remove problematic columns
columns_to_remove = ['Match URL', 'Challenges Tkl%', 'number']
existing_cols_to_remove = [col for col in columns_to_remove if col in df.columns]
if existing_cols_to_remove:
    df = df.drop(columns=existing_cols_to_remove)
    print("✓ Removed columns:", existing_cols_to_remove)

# 2. Clean Age column (remove everything after the dash)
if 'Age' in df.columns:
    print("✓ Cleaning Age column...")
    df['Age'] = df['Age'].astype(str).str.split('-').str[0]
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce').fillna(0).astype(int)
    print("  Age range:", df['Age'].min(), "-", df['Age'].max())

# 3. Clean Nation column (keep only last 3 characters)
if 'Nation' in df.columns:
    print("✓ Cleaning Nation column...")
    df['Nation'] = df['Nation'].astype(str).str[-3:]
    print("  Sample nations:", df['Nation'].unique()[:10].tolist())

# 4. Clean column names
df.columns = df.columns.str.strip()
print("✓ Column names cleaned")

# 5. Setup Date and Season
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y', errors='coerce')
df['Season'] = df['Date'].apply(lambda x: 
    str(x.year) + "-" + str(x.year + 1)[-2:] if pd.notna(x) and x.month >= 8 
    else str(x.year - 1) + "-" + str(x.year)[-2:] if pd.notna(x) else "Unknown"
)

# 6. Position mapping
position_mapping = {
    'GK': 'Goalkeeper', 'CB': 'Defense', 'LB': 'Defense', 'RB': 'Defense',
    'DM': 'Midfield', 'CM': 'Midfield', 'LM': 'Midfield', 'RM': 'Midfield', 'AM': 'Midfield',
    'FW': 'Forward'
}
df['Position_Group'] = df['Pos'].map(position_mapping)

print("✓ Basic setup complete")
print("Cleaned dataset shape:", df.shape)
print("Position distribution:", df['Position_Group'].value_counts().to_dict())

# 7. Clean key statistical columns
def safe_numeric_conversion(series, default_value=0):
    """Convert series to numeric safely"""
    try:
        # Convert to string first, handle empty/null values
        series_str = series.astype(str).replace(['nan', 'NaN', '', ' ', 'None'], '0')
        # Convert to numeric
        result = pd.to_numeric(series_str, errors='coerce').fillna(default_value)
        return result
    except:
        # If anything fails, return default values
        return pd.Series([default_value] * len(series))

# Key columns we need for analysis
key_stats = {
    'Min': 0, 'Gls': 0, 'Ast': 0, 'Sh': 0, 'SoT': 0, 'Tkl': 0, 'Int': 0, 
    'Blocks': 0, 'Clr': 0, 'SCA': 0, 'KP': 0, 'Expected xG': 0,
    'Total Cmp%': 0, 'Long Cmp%': 0, 'Total Att': 0, 'Passes PrgP': 0
}

print("\nCleaning statistical columns...")
for col, default in key_stats.items():
    if col in df.columns:
        print("  Processing", col)
        df[col] = safe_numeric_conversion(df[col], default)
        
        # Safe min/max calculation with error handling
        try:
            col_values = df[col]
            if len(col_values) > 0:
                # Ensure all values are numeric
                numeric_values = pd.to_numeric(col_values, errors='coerce')
                if not numeric_values.isnull().all():
                    min_val = numeric_values.min()
                    max_val = numeric_values.max()
                    print("    ✓ Range:", round(min_val, 1), "-", round(max_val, 1))
                else:
                    print("    ✓ All values set to default:", default)
            else:
                print("    ✓ Empty column, set to default:", default)
        except Exception as e:
            print("    ⚠️ Could not calculate range, but column processed")
    else:
        df[col] = default
        print("  + Created", col, "=", default)

# 8. Handle Take-ons
if 'Take-Ons Att' in df.columns and 'Take-Ons Succ' in df.columns:
    df['Take-Ons Att'] = safe_numeric_conversion(df['Take-Ons Att'], 0)
    df['Take-Ons Succ'] = safe_numeric_conversion(df['Take-Ons Succ'], 0)
    df['Take-Ons Succ%'] = np.where(df['Take-Ons Att'] > 0, 
                                   (df['Take-Ons Succ'] / df['Take-Ons Att'] * 100), 0)
    print("✓ Take-on statistics calculated")
else:
    df['Take-Ons Succ%'] = 0
    print("✓ Take-on columns not found, set to 0")

print("✓ Data preprocessing completed")

# ==========================================
# PHASE 2: PERFORMANCE SCORING
# ==========================================

print("\n⚽ PHASE 2: CALCULATING PERFORMANCE SCORES")

df['Performance_Score'] = 0.0

# Helper function for per-90 calculations
def per_90(stat, minutes):
    return np.where(minutes > 0, stat / minutes * 90, 0)

# Safe function to get numeric min/max
def safe_min_max(series):
    try:
        numeric_series = pd.to_numeric(series, errors='coerce')
        if not numeric_series.isnull().all():
            return numeric_series.min(), numeric_series.max()
        else:
            return 0.0, 0.0
    except:
        return 0.0, 0.0

# GOALKEEPERS
gk_mask = df['Position_Group'] == 'Goalkeeper'
if gk_mask.sum() > 0:
    print("\nGoalkeepers:", gk_mask.sum(), "records")
    gk_score = df.loc[gk_mask, 'Total Cmp%'] * 0.6 + df.loc[gk_mask, 'Long Cmp%'] * 0.4
    df.loc[gk_mask, 'Performance_Score'] = gk_score
    min_score, max_score = safe_min_max(gk_score)
    print("  ✓ Score range:", round(min_score, 1), "-", round(max_score, 1))

# FORWARDS
fw_mask = df['Position_Group'] == 'Forward'
if fw_mask.sum() > 0:
    print("\nForwards:", fw_mask.sum(), "records")
    
    # Create clean copy with reset index to avoid duplicate index issues
    fw_data = df[fw_mask].copy().reset_index(drop=True)
    
    # Calculate per-90 stats using numpy arrays to avoid pandas alignment issues
    min_values = fw_data['Min'].values
    gls_values = fw_data['Gls'].values
    ast_values = fw_data['Ast'].values
    sh_values = fw_data['Sh'].values
    sot_values = fw_data['SoT'].values
    xg_values = fw_data['Expected xG'].values
    takeon_values = fw_data['Take-Ons Succ%'].values
    
    # Calculate per-90 stats using numpy operations
    fw_data['Gls_90'] = np.where(min_values > 0, gls_values / min_values * 90, 0)
    fw_data['Ast_90'] = np.where(min_values > 0, ast_values / min_values * 90, 0)
    fw_data['Sh_90'] = np.where(min_values > 0, sh_values / min_values * 90, 0)
    fw_data['SoT_90'] = np.where(min_values > 0, sot_values / min_values * 90, 0)
    fw_data['xG_90'] = np.where(min_values > 0, xg_values / min_values * 90, 0)
    
    # Get max values for normalization
    _, max_g = safe_min_max(fw_data['Gls_90'])
    _, max_s = safe_min_max(fw_data['Sh_90'])
    _, max_sot = safe_min_max(fw_data['SoT_90'])
    _, max_xg = safe_min_max(fw_data['xG_90'])
    
    max_g = max(max_g, 0.1)
    max_s = max(max_s, 0.1)
    max_sot = max(max_sot, 0.1)
    max_xg = max(max_xg, 0.1)
    
    # Calculate weighted score
    fw_score = (
        (fw_data['Gls_90'] / max_g * 100) * 0.35 +      # Goals 35%
        (fw_data['Ast_90'] * 25) * 0.20 +               # Assists 20%
        (fw_data['Sh_90'] / max_s * 100) * 0.05 +       # Shots 5%
        (fw_data['SoT_90'] / max_sot * 100) * 0.15 +    # SoT 15%
        (fw_data['xG_90'] / max_xg * 100) * 0.15 +      # xG 15%
        takeon_values * 0.10                            # Take-ons 10%
    )
    
    # Update main dataframe using the original indices
    original_indices = df[fw_mask].index
    df.loc[original_indices, 'Performance_Score'] = fw_score.values
    
    min_score, max_score = safe_min_max(fw_score)
    print("  ✓ Score range:", round(min_score, 1), "-", round(max_score, 1))

# MIDFIELDERS
mid_mask = df['Position_Group'] == 'Midfield'
if mid_mask.sum() > 0:
    print("\nMidfielders:", mid_mask.sum(), "records")
    
    # Create clean copy with reset index
    mid_data = df[mid_mask].copy().reset_index(drop=True)
    
    # Calculate per-90 stats using numpy arrays
    min_values = mid_data['Min'].values
    att_values = mid_data['Total Att'].values
    prog_values = mid_data['Passes PrgP'].values
    sca_values = mid_data['SCA'].values
    kp_values = mid_data['KP'].values
    ast_values = mid_data['Ast'].values
    cmp_pct_values = mid_data['Total Cmp%'].values
    
    # Calculate per-90 stats
    mid_data['Pass_90'] = np.where(min_values > 0, att_values / min_values * 90, 0)
    mid_data['Prog_90'] = np.where(min_values > 0, prog_values / min_values * 90, 0)
    mid_data['SCA_90'] = np.where(min_values > 0, sca_values / min_values * 90, 0)
    mid_data['KP_90'] = np.where(min_values > 0, kp_values / min_values * 90, 0)
    mid_data['Ast_90'] = np.where(min_values > 0, ast_values / min_values * 90, 0)
    
    # Get max values for normalization
    _, max_pass = safe_min_max(mid_data['Pass_90'])
    _, max_prog = safe_min_max(mid_data['Prog_90'])
    _, max_sca = safe_min_max(mid_data['SCA_90'])
    _, max_kp = safe_min_max(mid_data['KP_90'])
    
    max_pass = max(max_pass, 0.1)
    max_prog = max(max_prog, 0.1)
    max_sca = max(max_sca, 0.1)
    max_kp = max(max_kp, 0.1)
    
    # Calculate weighted score
    mid_score = (
        cmp_pct_values * 0.20 +                                 # Pass accuracy 20%
        (mid_data['Pass_90'] / max_pass * 100) * 0.15 +         # Pass volume 15%
        (mid_data['Prog_90'] / max_prog * 100) * 0.20 +         # Progressive 20%
        (mid_data['SCA_90'] / max_sca * 100) * 0.20 +           # Creativity 20%
        (mid_data['KP_90'] / max_kp * 100) * 0.15 +             # Key passes 15%
        (mid_data['Ast_90'] * 25) * 0.10                        # Assists 10%
    )
    
    # Update main dataframe
    original_indices = df[mid_mask].index
    df.loc[original_indices, 'Performance_Score'] = mid_score.values
    
    min_score, max_score = safe_min_max(mid_score)
    print("  ✓ Score range:", round(min_score, 1), "-", round(max_score, 1))

# DEFENDERS
def_mask = df['Position_Group'] == 'Defense'
if def_mask.sum() > 0:
    print("\nDefenders:", def_mask.sum(), "records")
    
    # Create clean copy with reset index
    def_data = df[def_mask].copy().reset_index(drop=True)
    
    # Calculate per-90 stats using numpy arrays
    min_values = def_data['Min'].values
    tkl_values = def_data['Tkl'].values
    int_values = def_data['Int'].values
    blk_values = def_data['Blocks'].values
    clr_values = def_data['Clr'].values
    cmp_pct_values = def_data['Total Cmp%'].values
    
    # Calculate per-90 stats
    def_data['Tkl_90'] = np.where(min_values > 0, tkl_values / min_values * 90, 0)
    def_data['Int_90'] = np.where(min_values > 0, int_values / min_values * 90, 0)
    def_data['Blk_90'] = np.where(min_values > 0, blk_values / min_values * 90, 0)
    def_data['Clr_90'] = np.where(min_values > 0, clr_values / min_values * 90, 0)
    
    # Get max values for normalization
    _, max_tkl = safe_min_max(def_data['Tkl_90'])
    _, max_int = safe_min_max(def_data['Int_90'])
    _, max_blk = safe_min_max(def_data['Blk_90'])
    _, max_clr = safe_min_max(def_data['Clr_90'])
    
    max_tkl = max(max_tkl, 0.1)
    max_int = max(max_int, 0.1)
    max_blk = max(max_blk, 0.1)
    max_clr = max(max_clr, 0.1)
    
    # Calculate weighted score
    def_score = (
        (def_data['Tkl_90'] / max_tkl * 100) * 0.25 +           # Tackles 25%
        (def_data['Int_90'] / max_int * 100) * 0.25 +           # Interceptions 25%
        (def_data['Blk_90'] / max_blk * 100) * 0.20 +           # Blocks 20%
        (def_data['Clr_90'] / max_clr * 100) * 0.15 +           # Clearances 15%
        cmp_pct_values * 0.15                                   # Pass accuracy 15%
    )
    
    # Update main dataframe
    original_indices = df[def_mask].index
    df.loc[original_indices, 'Performance_Score'] = def_score.values
    
    min_score, max_score = safe_min_max(def_score)
    print("  ✓ Score range:", round(min_score, 1), "-", round(max_score, 1))

# Cap all scores at 100
df['Performance_Score'] = df['Performance_Score'].clip(0, 100)

print("\n✅ SCORING COMPLETED")
overall_min = float(df['Performance_Score'].min())
overall_max = float(df['Performance_Score'].max())
print("Overall range:", round(overall_min, 1), "-", round(overall_max, 1))

# ==========================================
# PHASE 3: ANALYSIS & RESULTS
# ==========================================

print("\n📊 PHASE 3: ANALYSIS & RESULTS")

# Check Antonio Rüdiger specifically
print("\n=== VALIDATION CHECK ===")
rudiger = df[df['Player'].str.contains('Rüdiger', na=False, case=False)]
if len(rudiger) > 0:
    rudiger_avg = rudiger['Performance_Score'].mean()
    print("✓ Rüdiger average score:", round(rudiger_avg, 1), "(should not be 50.0)")
    rudiger_best = rudiger.nlargest(3, 'Performance_Score')[
        ['Date', 'Performance_Score', 'Min', 'Tkl', 'Int', 'Blocks', 'Total Cmp%']
    ]
    print("✓ Rüdiger's top performances:")
    print(rudiger_best.to_string(index=False))

# Top performances
print("\n=== TOP 15 INDIVIDUAL PERFORMANCES ===")
top_individual = df.nlargest(15, 'Performance_Score')[
    ['Date', 'Player', 'Position_Group', 'Performance_Score', 'Min', 'Gls', 'Ast', 'Opponent']
]
print(top_individual.to_string(index=False))

# Player averages
print("\n=== BEST SEASON AVERAGES (500+ minutes) ===")
player_avg = df.groupby(['Player', 'Position_Group', 'Season']).agg({
    'Performance_Score': 'mean',
    'Min': 'sum',
    'Gls': 'sum',
    'Ast': 'sum',
    'Age': 'first',
    'Nation': 'first'
}).round(2).reset_index()

significant_players = player_avg[player_avg['Min'] >= 500]
top_averages = significant_players.nlargest(15, 'Performance_Score')
print(top_averages.to_string(index=False))

# ==========================================
# PHASE 4: SAVE RESULTS
# ==========================================

print("\n💾 PHASE 4: SAVING RESULTS")

output_dir = '/Users/mariamoramora/Documents/GitHub/ADS599_Capstone/Soccer_Performance_Score/data/real_madrid_combined'
os.makedirs(output_dir, exist_ok=True)

# Save complete cleaned dataset
final_path = output_dir + '/real_madrid_cleaned_with_scores.csv'
df.to_csv(final_path, index=False)

# Save player averages
averages_path = output_dir + '/player_season_averages_clean.csv'
player_avg.to_csv(averages_path, index=False)

print("✅ SAVED:")
print("📊 Complete cleaned data:", final_path)
print("🏆 Player averages:", averages_path)

print("\n🎯 FINAL SUMMARY:")
print("• Processed", len(df), "match records")
print("• Unique players:", df['Player'].nunique())
print("• Age range:", df['Age'].min(), "-", df['Age'].max(), "years")
print("• Nations:", df['Nation'].nunique(), "different countries")
print("• Players with 500+ minutes:", len(significant_players))
best_performer = df.loc[df['Performance_Score'].idxmax(), 'Player']
best_score = float(df['Performance_Score'].max())
print("• Best performer:", best_performer, "(" + str(round(best_score, 1)) + ")")

print("\n🏆 REAL MADRID ANALYSIS COMPLETE! 🏆")

=== REAL MADRID DATA PREPROCESSING & PERFORMANCE ANALYSIS ===
✓ Loaded dataset: (1550, 73)
Original columns: 73

🧹 PHASE 1: DATA CLEANING & PREPROCESSING
✓ Removed columns: ['Match URL', 'Challenges Tkl%', 'number']
✓ Cleaning Age column...
  Age range: 17 - 39
✓ Cleaning Nation column...
  Sample nations: ['BRA', 'CRO', 'ESP', 'ENG', 'FRA', 'GER', 'URU', 'AUT', 'UKR', 'MAR']
✓ Column names cleaned
✓ Basic setup complete
Cleaned dataset shape: (1550, 72)
Position distribution: {'Defense': 483, 'Midfield': 410, 'Forward': 211, 'Goalkeeper': 104}

Cleaning statistical columns...
  Processing Min
    ✓ Range: 1 - 120
  Processing Gls
    ✓ Range: 0 - 3
  Processing Ast
    ⚠️ Could not calculate range, but column processed
  Processing Sh
    ✓ Range: 0 - 11
  Processing SoT
    ✓ Range: 0 - 5
  Processing Tkl
    ✓ Range: 0 - 10
  Processing Int
    ⚠️ Could not calculate range, but column processed
  Processing Blocks
    ✓ Range: 0 - 7
  Processing Clr
    ✓ Range: 0 - 14
  Processing 

ValueError: operands could not be broadcast together with shapes (211,2) (211,) 