In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import io
import glob
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from matplotlib.backends.backend_pdf import PdfPages
import math

path = r"C:\Users\scorp\Desktop\TRAFO-HACKATHON\smart-performance-dashboard\Model\data\PlayerFatigue\Catapult Data"


# =========================================================
# PART 1: ROBUST DATA LOADER
# =========================================================

def load_sport_sessions(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    print(f"Found {len(all_files)} CSV files in '{folder_path}'.")
    
    df_list = []
    
    for filename in all_files:
        try:
            # 1. Parse Metadata (First 9 lines)
            # We open the file just to read the date from the first line
            with open(filename, 'r') as f:
                first_line = f.readline() # "Date:,25/06/2025"
                
            # Extract date string (assuming format "Date:,25/06/2025")
            # Splits by comma, takes the second part, strips whitespace
            session_date = first_line.split(',')[1].strip()
            
            # 2. Read Data (Skip metadata)
            # skiprows=9 skips the first 9 lines. 
            # 'header=0' means the 10th line (index 9) is the header.
            # We allow low_memory=False to prevent type warnings on 1500+ cols
            current_df = pd.read_csv(filename, skiprows=9, header=0, low_memory=False)
            
            # 3. Add Context
            current_df['Session_Date'] = session_date
            current_df['Source_File'] = os.path.basename(filename)
            
            df_list.append(current_df)
            
        except Exception as e:
            print(f"Error reading {filename}: {e}")
            continue

    if not df_list:
        print("No data loaded.")
        return None

    # Combine all days into one massive dataset
    full_df = pd.concat(df_list, ignore_index=True)
    print(f"Successfully loaded {len(full_df)} total rows with {full_df.shape[1]} columns.")
    return full_df

# =========================================================
# PART 2: FEATURE SELECTION (Find Irrelevant Features)
# =========================================================

def identify_useless_features(df):
    print("\n--- STARTING FEATURE ANALYSIS ---")
    useless_cols = []
    
    # A. Drop Metadata/Identifier columns from analysis (keep them safe)
    # We don't want to delete 'Player Name' even if it's constant in a single-player file
    protected_cols = ['Player Name', 'Period Name', 'Session_Date', 'Source_File', 'Position Name']
    numeric_df = df.drop(columns=[c for c in protected_cols if c in df.columns])
    
    # Only look at numeric data for variance analysis
    numeric_df = numeric_df.select_dtypes(include=[np.number])
    
    # 1. CONSTANT FEATURES (Variance == 0)
    # ---------------------------------------------------
    # Columns where every single value is 0 or the same number
    variances = numeric_df.var()
    constant_cols = variances[variances == 0].index.tolist()
    print(f"[1] Constant/Dead Columns (Variance=0): {len(constant_cols)}")
    useless_cols.extend(constant_cols)
    
    # 2. ALL NULL FEATURES
    # ---------------------------------------------------
    # Columns that are completely empty
    null_cols = df.columns[df.isnull().all()].tolist()
    print(f"[2] Empty Columns (All NaN): {len(null_cols)}")
    useless_cols.extend(null_cols)
    
    # 3. CORRELATION ANALYSIS (Redundancy)
    # ---------------------------------------------------
    print("[3] Checking for Redundant Features (Correlation > 0.98)...")
    
    # Drop the constant columns first to save memory/time
    candidates = numeric_df.drop(columns=constant_cols, errors='ignore')
    
    # Fill NaN with 0 for correlation calculation
    candidates = candidates.fillna(0)
    
    # Calculate correlation matrix
    corr_matrix = candidates.corr().abs()
    
    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    # Find index of feature columns with correlation greater than 0.98
    correlated_cols = [column for column in upper.columns if any(upper[column] > 0.98)]
    
    print(f"Found {len(correlated_cols)} highly redundant features.")
    
    # Show examples
    if len(correlated_cols) > 0:
        print("   Examples of redundancy:")
        for i in range(min(5, len(correlated_cols))):
            col = correlated_cols[i]
            # Find what it correlates with
            match = upper.index[upper[col] > 0.98][0]
            print(f"   - '{col}' is basically the same as '{match}'")
            
    useless_cols.extend(correlated_cols)
    
    # REMOVE DUPLICATES from list
    useless_cols = list(set(useless_cols))
    
    print(f"\n--- ANALYSIS RESULT ---")
    print(f"Total columns to DROP: {len(useless_cols)}")
    print(f"Remaining useful features: {df.shape[1] - len(useless_cols)}")
    
    return useless_cols

# =========================================================
# EXECUTION BLOCK
# =========================================================
# Update this path to where your CSVs are stored
# For example: './data/' or 'C:/Users/You/Documents/Basketball/'
FOLDER_PATH = './data/PlayerFatigue/catapult_data' 

# 1. Load
df = load_sport_sessions(FOLDER_PATH)

if df is not None:
    # 2. Analyze
    bad_features = identify_useless_features(df)
    
    # 3. Clean
    df_clean = df.drop(columns=bad_features)
    
    # 4. Save Results
    # Save the list of bad features so you know what you deleted
    pd.DataFrame(bad_features, columns=['Irrelevant_Features']).to_csv('irrelevant_features_list.csv', index=False)
    
    # Save the clean dataset
    df_clean.to_csv('cleaned_basketball_dataset.csv', index=False)
    print("\nSuccess! Clean dataset saved as 'cleaned_basketball_dataset.csv'")
    print("List of removed columns saved as 'irrelevant_features_list.csv'")


Found 129 CSV files in './data/PlayerFatigue/catapult_data'.
Error reading ./data/PlayerFatigue/catapult_data\2025_07_01_1056-01.07.25_MT_anon.csv: list index out of range
Successfully loaded 7031 total rows with 1545 columns.

--- STARTING FEATURE ANALYSIS ---
[1] Constant/Dead Columns (Variance=0): 798
[2] Empty Columns (All NaN): 1
[3] Checking for Redundant Features (Correlation > 0.98)...
Found 160 highly redundant features.
   Examples of redundancy:
   - 'Unix End Time' is basically the same as 'Unix Start Time'
   - 'Total PL' is basically the same as 'Total Distance'
   - 'Velocity Band 2 Avg Effort Duration' is basically the same as 'Velocity Band 2 Avg Effort Distance'
   - 'Velocity Band 3 Avg Effort Duration' is basically the same as 'Velocity Band 3 Avg Effort Distance'
   - 'Velocity Band 4 Avg Effort Duration' is basically the same as 'Velocity Band 4 Avg Effort Distance'

--- ANALYSIS RESULT ---
Total columns to DROP: 959
Remaining useful features: 586

Success! Clean 

In [None]:
import pandas as pd

df = pd.read_csv('./cleaned_fussball_dataset.csv')
print(df.shape)
df.head()



(7031, 586)


  df = pd.read_csv('./cleaned_fussball_dataset.csv')


Unnamed: 0,Player Name,Period Name,Period Number,Position Name,Unix Start Time,Total Duration,Total Distance,Player Load Per Minute,Player Load Per Metre,Distance Per Minute,...,FMP Running High Duration,FMP Running High Duration Average (Session),FMP Running Medium Duration,FMP Running Medium Duration Average (Session),FMP Total Running Duration,FMP Total Running Duration Average (Session),FMP Very Low Duration,FMP Very Low Duration Average (Session),Session_Date,Source_File
0,Edmund Frei,Session,0,AV,1750583000.0,00:33:38,4010.53491,11.55661,0.09695,119.19974,...,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:33:38,00:33:38,22/06/2025,2025_06_22_1108-22.06.25_IFT_Test_anon.csv
1,Konrad Feldt,Session,0,OM,1750583000.0,00:33:38,0.0,0.07839,0.0,0.0,...,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:33:38,00:33:38,22/06/2025,2025_06_22_1108-22.06.25_IFT_Test_anon.csv
2,Albern Bergfalk,Session,0,AM,1750583000.0,00:33:38,3704.97705,9.74572,0.0885,110.11806,...,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:33:38,00:33:38,22/06/2025,2025_06_22_1108-22.06.25_IFT_Test_anon.csv
3,Wendell Brauer,Session,0,AV,1750583000.0,00:33:38,3473.26709,10.70135,0.10366,103.23125,...,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:33:38,00:33:38,22/06/2025,2025_06_22_1108-22.06.25_IFT_Test_anon.csv
4,Gernot Becke,Session,0,IV,1750583000.0,00:33:38,3638.77124,10.40699,0.09623,108.15031,...,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:33:38,00:33:38,22/06/2025,2025_06_22_1108-22.06.25_IFT_Test_anon.csv
