Install Dependencies

In [None]:
# Install the notebook's dependencies from requirements.txt. The %pip magic
# (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

Load Data

In [None]:
# Column labels applied to the CSV, grouped by team. The order must match the
# column order of the file because the names are assigned positionally.
blue_team_columns = [
    "blueTeamControlWardsPlaced",
    "blueTeamWardsPlaced",
    "blueTeamTotalKills",
    "blueTeamDragonKills",
    "blueTeamHeraldKills",
    "blueTeamTowersDestroyed",
    "blueTeamInhibitorsDestroyed",
    "blueTeamTurretPlatesDestroyed",
    "blueTeamFirstBlood",
    "blueTeamMinionsKilled",
    "blueTeamJungleMinions",
    "blueTeamTotalGold",
    "blueTeamXp",
    "blueTeamTotalDamageToChamps",
]

red_team_columns = [
    "redTeamControlWardsPlaced",
    "redTeamWardsPlaced",
    "redTeamTotalKills",
    "redTeamDragonKills",
    "redTeamHeraldKills",
    "redTeamTowersDestroyed",
    "redTeamInhibitorsDestroyed",
    "redTeamTurretPlatesDestroyed",
    "redTeamMinionsKilled",
    "redTeamJungleMinions",
    "redTeamTotalGold",
    "redTeamXp",
    "redTeamTotalDamageToChamps",
]

column_names = ["matchId", *blue_team_columns, *red_team_columns, "blueWin", "end"]

# header=0 together with names= replaces the file's own header row with
# column_names; index_col=0 makes the first column (matchId) the index.
df = pd.read_csv(
    "data/match_data_v5.csv",
    names=column_names,
    header=0,
    index_col=0,
)

df.head()  # Preview the first 5 rows of the dataset

Explore Basic Feature Attributes

In [None]:
# Print a summary of the frame: its columns, their dtypes and non-null counts.
df.info()

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()

Handle Missing Values and Duplicates

In [None]:
# Remove last column ("end"), which is not used in the rest of the notebook.
# errors="ignore" keeps this cell safe to re-run: once "end" has been dropped,
# a second execution would otherwise raise KeyError.
df = df.drop(columns=["end"], errors="ignore")
df.head()

In [None]:
# Handle duplicates
# A notebook cell only renders its LAST expression, so the intermediate checks
# are printed explicitly; as bare expressions their results were discarded.
print("Duplicate rows before dropping:", df.duplicated().sum())

# Check if the matchId is unique (matchId is the frame's index)
print("matchId index is unique:", df.index.is_unique)

# Drop duplicates
# NOTE: duplicated()/drop_duplicates() compare column values only and ignore
# the index, so rows with different matchIds but identical stats are treated
# as duplicates here.
df = df.drop_duplicates()

# Remaining duplicate rows after dropping (expected to be 0)
df.duplicated().sum()

Explore Data Distribution

In [None]:
# Check if the dataset is balanced
# value_counts() orders its result by frequency (most common first), while the
# tick labels and bar colours below are assigned by POSITION. Sorting by label
# guarantees the first bar is always the smaller label (0) and the second the
# larger (1), regardless of which class is the majority.
win_counts = df['blueWin'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(6, 2))
win_counts.plot(kind='bar', color=['red', 'blue'], ax=ax)
ax.set_title('Proportion of Blue Win')
ax.set_xlabel('Blue Win')
ax.set_ylabel('Count')
ax.set_xticks([0, 1])
ax.set_xticklabels(['0', '1'])
plt.show()

In [None]:
# Build the profiling report for the current frame and write it to an HTML
# file next to the notebook.
report_settings = dict(title="EDA Report", explorative=True)
profile = ProfileReport(df, **report_settings)
profile.to_file("eda_report.html")

Handle Outliers

In [None]:
# Visual aid for deciding whether outliers should be removed: draw a box plot
# of the frame's columns on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 5))
df.boxplot(ax=ax)
# Tilt the column names so the x-axis labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

Create New Features

In [None]:
# Add one "diff<Stat>" column per statistic recorded for both teams, computed
# as blue minus red. The list order determines the order in which the new
# columns are appended to the frame.
paired_stats = [
    'MinionsKilled',
    'JungleMinions',
    'TotalGold',
    'TotalKills',
    'Xp',
    'TotalDamageToChamps',
    'DragonKills',
    'HeraldKills',
    'TowersDestroyed',
    'InhibitorsDestroyed',
    'TurretPlatesDestroyed',
]

for stat in paired_stats:
    df[f'diff{stat}'] = df[f'blueTeam{stat}'] - df[f'redTeam{stat}']

df.head()

Export Processed Dataset

In [None]:
# Write the processed frame to CSV so another notebook can load it for
# training. index=True writes the frame's index as the first CSV column.
export_path = "data/processed_dataset.csv"
df.to_csv(export_path, index=True)
print("Dataset exported successfully to 'data/processed_dataset.csv'")