In [3]:
import pandas as pd

# Load data
df = pd.read_csv("raw_player.csv")

# Drop columns where all values are missing
df_clean = df.dropna(axis=1, how='all')

# Save intermediate version
df_clean.to_csv("player_clean_step1.csv", index=False)

# Show shape and columns after drop
print("Shape after drop:", df_clean.shape)
print("Remaining columns (first 20):", df_clean.columns[:20].tolist())

Shape after drop: (570, 47)
Remaining columns (first 20): ['full_name', 'age', 'birthday', 'birthday_GMT', 'league', 'season', 'position', 'Current Club', 'minutes_played_overall', 'minutes_played_home', 'minutes_played_away', 'nationality', 'appearances_overall', 'appearances_home', 'appearances_away', 'goals_overall', 'goals_home', 'goals_away', 'assists_overall', 'assists_home']


In [5]:
# Rename all columns to lowercase snake_case
df_clean.columns = (
    df_clean.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
)

In [6]:
print(df_clean.columns[:10].tolist())

['full_name', 'age', 'birthday', 'birthday_gmt', 'league', 'season', 'position', 'current_club', 'minutes_played_overall', 'minutes_played_home']


In [7]:
# Convert 'birthday' to datetime format
df_clean['birthday'] = pd.to_datetime(df_clean['birthday'], errors='coerce')

# Extract year (optional but useful for age checks)
df_clean['birth_year'] = df_clean['birthday'].dt.year

# Preview
print(df_clean[['birthday', 'birth_year']].head())

                       birthday  birth_year
0 1970-01-01 00:00:00.629712000        1970
1 1970-01-01 00:00:00.545554800        1970
2 1970-01-01 00:00:00.653382000        1970
3 1970-01-01 00:00:00.662198400        1970
4 1970-01-01 00:00:00.968310000        1970


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['birthday'] = pd.to_datetime(df_clean['birthday'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['birth_year'] = df_clean['birthday'].dt.year


In [8]:
# Show top 15 columns with missing values
missing = df_clean.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)
print("Missing values:\n", missing.head(15))

Missing values:
 Series([], dtype: int64)


In [9]:
# Save final cleaned version
df_clean.to_csv("player_clean.csv", index=False)

print("✅ Cleaned dataset saved as 'player_clean.csv'")

✅ Cleaned dataset saved as 'player_clean.csv'
