In [1]:
import pandas as pd
# from dash import Dash, html, dcc, Input, Output
from functools import reduce

In [2]:
# Players dataset
# Function to load data
def load_csv_data(file_path):
    df = pd.read_csv(file_path)
    df.dropna(how='all', inplace=True)
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna(df[col].mode().dropna().iloc[0])
        else:
            df[col] = df[col].fillna(df[col].median())
    return df

# Function to merge data
def merge_data(data_frames):
    merged_df = reduce(lambda left, right: pd.merge(left, right, on='player', how='outer', suffixes=('', '_drop')), data_frames)
    merged_df = merged_df.loc[:, ~merged_df.columns.str.contains('_drop')]
    return merged_df

# Function to remove duplicate columns after merge
def remove_duplicate_columns(merged_df):
    for col in list(merged_df.columns):
        if '_x' in col:
            base_col = col.replace('_x', '')
            x_col = base_col + '_x'
            y_col = base_col + '_y'
            if x_col in merged_df.columns and y_col in merged_df.columns:
                merged_df[base_col] = merged_df[x_col].fillna(merged_df[y_col])
                merged_df.drop([x_col, y_col], axis=1, inplace=True)
            elif x_col in merged_df.columns:
                merged_df.rename(columns={x_col: base_col}, inplace=True)

# Define file paths grouped by their final destinations
stats_files = ["player_misc.csv", "player_stats.csv"]
radar_files = ["player_defense.csv", "player_shooting.csv", "player_passing.csv"]

# Load and preprocess each dataset
stats_data = [load_csv_data(f"Players/{name}") for name in stats_files]
radar_data = [load_csv_data(f"Players/{name}") for name in radar_files]

# Merge datasets
merged_stats_data = merge_data(stats_data)
merged_radar_data = merge_data(radar_data)

# Remove any remaining duplicate columns
remove_duplicate_columns(merged_stats_data)
remove_duplicate_columns(merged_radar_data)

# Calculate new columns
merged_radar_data['defensive'] = merged_radar_data['tackles_won'] + merged_radar_data['blocks'] + merged_radar_data['interceptions']
merged_radar_data['shots_percentage'] = merged_radar_data['shots_on_target'] / merged_radar_data['shots']
merged_radar_data['passes_percentage'] = merged_radar_data['passes_completed'] / merged_radar_data['passes']

# Handle any potential division by zero issues by replacing infinite results with NaN
merged_radar_data.replace([float('inf'), -float('inf')], pd.NA, inplace=True)

# Export the merged data to different CSV files
merged_stats_data.to_csv("CleanedData/player_stats_cleaned.csv", index=False)
merged_radar_data.to_csv("CleanedData/player_radar.csv", index=False)

# Output the head of the merged DataFrame to check
print("Stats Data Merged:")
print(merged_stats_data.head())
print("\n Radar Data Merged:")
print(merged_radar_data.head())


Stats Data Merged:
                 player position       team     age  birth_year  minutes_90s  \
0            Aaron Mooy       MF  Australia  32-094        1990          4.0   
1          Aaron Ramsey       MF      Wales  31-357        1990          3.0   
2     Abdelhamid Sabiri       MF    Morocco  26-020        1996          2.0   
3     Abdelkarim Hassan       DF      Qatar  29-112        1993          3.0   
4  Abderrazak Hamdallah       FW    Morocco  32-001        1990          0.8   

   cards_yellow  cards_red  cards_yellow_red  fouls  ...  \
0             1          0                 0      4  ...   
1             1          0                 0      3  ...   
2             1          0                 0      2  ...   
3             0          0                 0      1  ...   
4             0          0                 0      3  ...   

   goals_assists_pens_per90   xg  npxg  xg_assist  npxg_xg_assist  xg_per90  \
0                       0.0  0.0   0.0        0.1           

In [3]:
# Team dataset
# Load CSV data
def load_csv_data(file_path):
    data = pd.read_csv(file_path)
    # Fill missing values with the median for numerical and mode for categorical columns
    for col in data.columns:
        if data[col].dtype == 'object':
            data[col] = data[col].fillna(data[col].mode()[0])
        else:
            data[col] = data[col].fillna(data[col].median())
    return data

# Load JSON data
def load_json_data(file_path):
    return pd.read_json(file_path, orient='index')

# Data files
group_stats_path = "Team/group_stats.csv"
team_data_path = "Team/team_data.csv"
team_tips_path = "Team/team_tips.json"

# Loading data
df_group_stats = load_csv_data(group_stats_path)
df_team_data = load_csv_data(team_data_path)
df_team_tips = load_json_data(team_tips_path)

# Ensure both dataframes have a 'team' column. Check if column names need adjusting.
print("Team Data Columns:", df_team_data.columns)
print("Group Stats Columns:", df_group_stats.columns)

# Merge DataFrames on 'team' column
df_merged_team = pd.merge(df_team_data, df_group_stats, on='team', how='inner')

# Display the first few entries of the data
print(df_merged_team.head())

# Function to export data
def export_data(df, filename):
    print(filename)
    df.to_csv(f"CleanedData/{filename}.csv", index=False)

# Export merged data
export_data(df_merged_team, "merged_team_data")

# Show merged data
print(df_merged_team.head())


Team Data Columns: Index(['team', 'players_used', 'avg_age', 'possession', 'games',
       'games_starts', 'minutes', 'minutes_90s', 'goals', 'assists',
       ...
       'fouls', 'fouled', 'offsides', 'pens_won', 'pens_conceded', 'own_goals',
       'ball_recoveries', 'aerials_won', 'aerials_lost', 'aerials_won_pct'],
      dtype='object', length=189)
Group Stats Columns: Index(['Unnamed: 0', 'group', 'rank', 'team', 'matches_played', 'wins',
       'draws', 'losses', 'goals_scored', 'goals_against', 'goal_difference',
       'points', 'expected_goal_scored', 'exp_goal_conceded',
       'exp_goal_difference', 'exp_goal_difference_per_90'],
      dtype='object')
        team  players_used  avg_age  possession  games  games_starts  minutes  \
0  Argentina            24     28.4        57.4      7            77      690   
1  Australia            20     28.7        37.8      4            44      360   
2    Belgium            20     30.6        57.0      3            33      270   
3    

In [4]:

# Match dataset
# Load the data
def load_data(file_path):
    data = pd.read_csv(file_path)

    # Handle missing values: fill numerical with median and categorical with mode
    numeric_cols = data.select_dtypes(include=['number']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())
    data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

    return data


# Define the path to the dataset
match_data_path = "Match Data/data.csv"

# Load and preprocess the data
df_match_data = load_data(match_data_path)

# Display the first few rows of the processed data
print(df_match_data.head())

# Further transformations or aggregations here

# Saving the processed data back to a new CSV for easier access in future analyses
df_match_data.to_csv('CleanedData/processed_match_data.csv', index=False)

   match dayofweek           match_time      home_team     away_team  home_xg  \
0      1       Sun  2022-11-20 19:00:00          Qatar       Ecuador      0.3   
1      2       Mon  2022-11-21 16:00:00        England       IR Iran      2.1   
2      3       Mon  2022-11-21 19:00:00        Senegal   Netherlands      0.9   
3      4       Mon  2022-11-21 22:00:00  United States         Wales      0.8   
4      5       Tue  2022-11-22 13:00:00      Argentina  Saudi Arabia      2.2   

   away_xg score  attendance                          venue  ...  \
0      1.2   0–2       67372                Al Bayt Stadium  ...   
1      1.4   6–2       45334  Khalifa International Stadium  ...   
2      0.7   0–2       41721             Al Thumama Stadium  ...   
3      1.5   1–1       43418          Ahmed bin Ali Stadium  ...   
4      0.1   1–2       88012          Lusail Iconic Stadium  ...   

  home_clearances away_clearances home_offsides away_offsides home_gks  \
0              18             


## Issue because the Images dataset has really crappy names so this doesnt work unless we create a new file with just one image and the correct name per player 


# Images dataset
# Load your player and team data
df_players_names = pd.read_csv('Images/List Of All Players Names.csv')
df_teams = pd.read_csv('Players/player_misc.csv', usecols=['player', 'team'])

# Group to team mapping (hard-coded for simplicity)
group_mapping = {
    'Group A': ['Ecuador Players', 'Netherland Players', 'Qatar Players', 'Senegal Players'],
    'Group B': ['England Players', 'Iran Players', 'United States Players', 'Wales Players'],
    'Group C': ['Argentina Players', 'Mexico Players', 'Poland Players', 'Saudi Arabia Players'],
    'Group D': ['Australia Players', 'Denmark Players', 'France Players', 'Tunisia Players'],
    'Group E': ['Costa Rica Players', 'Germany Players', 'Japan Players', 'Spain Players'],
    'Group F': ['Belgium Players', 'Canada Players', 'Croatia Players', 'Morocco Players'],
    'Group G': ['Brazil Players', 'Cameroon Players', 'Serbia Players', 'Switzerland Players'],
    'Group H': ['Ghana Players', 'Portugal Players', 'South Korea Players', 'Uruguay Players']
}

# Reverse the mapping for easy lookup: Team -> Group
team_to_group = {team: group for group, teams in group_mapping.items() for team in teams}

app = Dash(__name__)

app.layout = html.Div([
    dcc.Dropdown(
        id='player-dropdown',
        options=[{'label': player, 'value': player} for player in df_players_names['Name_Player']],
        value='Select a player'
    ),
    html.Img(id='player-image')
])


@app.callback(
    Output('player-image', 'src'),
    [Input('player-dropdown', 'value')]
)
def update_image(selected_player):
    if selected_player and selected_player != 'Select a player':
        # Find the team for the selected player
        team = df_teams.loc[df_teams['player'] == selected_player, 'team'].iloc[0]
        # Find the group for the team
        group = team_to_group.get(team, 'Unknown')
        # Construct the path to the image
        image_path = f"/Images/Images/Images/{group}/{team} Players/Images_{selected_player}/{selected_player}1.jpg"
        return app.get_asset_url(image_path)
    return None


if __name__ == '__main__':
    app.run_server(debug=True)
    
