# **Importing Libraries**

In [None]:
import pandas as pd

# **Importing Data**

In [None]:
# Read the athlete events table. The relative path is resolved against the
# notebook's working directory, so the CSV has to be present there
# (the recorded run of this cell failed with FileNotFoundError).
csv_path = "athlete_events.csv"
df = pd.read_csv(csv_path)

# Preview the first rows
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'athlete_events.csv'

In [None]:
# Count the NaN entries in every column of the freshly loaded frame
print(df.isna().sum())    # missing values per column

# **Data Cleaning**

In [None]:
# A row is only usable when every one of these fields is present
critical_cols = ["Name", "Sex", "Age", "Year", "Sport", "Event"]
df = df.dropna(subset=critical_cols)

# Decide on one replacement value per optional column that actually exists:
# the column median for the numeric body measurements ...
fill_values = {
    name: df[name].median()
    for name in ("Height", "Weight")
    if name in df.columns
}
# ... and the literal string "None" for athletes without a medal
if "Medal" in df.columns:
    fill_values["Medal"] = "None"

# Apply the replacements column by column
for name, replacement in fill_values.items():
    df[name] = df[name].fillna(replacement)

# Report whatever is still missing
print("Missing values after cleaning:")
print(df.isna().sum())


In [None]:
# Convert Year to datetime (year-only, i.e. January 1st of that year).
# The dtype guard makes this cell safe to re-run: without it, a second run
# would cast the already-converted timestamps to raw integers, none of which
# match "%Y", and errors="coerce" would silently turn the whole column into NaT.
if not pd.api.types.is_datetime64_any_dtype(df["Year"]):
    df["Year"] = pd.to_datetime(df["Year"].astype(int), format="%Y", errors="coerce")

# Convert Age to integer if it is currently float; the nullable "Int64"
# dtype is used so that missing ages (if any) can still be represented
if pd.api.types.is_float_dtype(df["Age"]):
    df["Age"] = df["Age"].astype("Int64")  # nullable integer

# Show the resulting column types
print(df.dtypes)

In [None]:
# Quick look: preview the first 20 rows of the frame as it stands after the
# Year / Age type conversion
df.head(20)

In [None]:
# Drop rows that are identical to an earlier row in every column,
# and report how many were discarded
before_dups = df.shape[0]
df = df.drop_duplicates()
after_dups = df.shape[0]

removed_rows = before_dups - after_dups
print(f"Removed {removed_rows} duplicate rows")


# **Data Wrangling**

In [None]:
# Create Age_Group column with custom bins.
# Intervals are right-closed, so the bands are [0, 18], (18, 25], (25, 35]
# and (35, inf):
# - the last edge is infinity so that the open-ended '36+' label really has
#   no upper limit (a fixed edge of 100 would leave older ages as NaN);
# - include_lowest=True keeps an age of exactly 0 inside the '0-18' band.
age_bins = [0, 18, 25, 35, float("inf")]
age_labels = ['0-18', '19-25', '26-35', '36+']
df['Age_Group'] = pd.cut(
    df['Age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)


In [None]:
# Preview the first rows, now including the Age_Group column
df.head()

In [None]:
# Number of rows per age band, ordered by band rather than by frequency
df['Age_Group'].value_counts().sort_index()


In [None]:
def _ordinal(n) -> str:
    """Return n with its English ordinal suffix, e.g. 19 -> '19th', 21 -> '21st'."""
    n = int(n)
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3
    if 11 <= n % 100 <= 13:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"


# Create Century column from Year.
# year // 100 + 1 maps 1900-1999 to 20 and 2000-2099 to 21. The number is then
# given its proper ordinal suffix ('20th', '21st'); a fixed 'th' suffix would
# label every year from 2000 onwards as '21th'.
century_number = df['Year'].dt.year // 100 + 1
# na_action="ignore" leaves rows with a missing Year as NaN instead of text
df['Century'] = century_number.map(_ordinal, na_action="ignore")
