In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import umap.umap_ as umap

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
import urllib.request
from PIL import Image
%matplotlib inline

warnings.filterwarnings("ignore")

sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

In [2]:
soloq_games_euw = pd.read_csv("../data/soloq/Europe_stats.csv")
soloq_games_kr = pd.read_csv("../data/soloq/Asia_stats.csv")

soloq_games = pd.concat([soloq_games_euw, soloq_games_kr])

soloq_games = soloq_games.dropna()
# delete games with < 15 mins
soloq_games = soloq_games[soloq_games['gameEndedInEarlySurrender'] == False]
# creates a patch column
soloq_games['patch'] = soloq_games.apply(lambda x: str(x['gameVersion'].split('.')[0] + '.' + x['gameVersion'].split('.')[1]), axis=1 )

relevant_cols = [
    "patch", "teamPosition", "championId", "championName", "gameDuration", "win",
    "neutralMinionsKilled", "totalMinionsKilled", "cs_diff_at_15",
    "champExperience", "xp_diff", "xp_diff_per_min", "xp_per_min_3_15",
    "damageDealtToBuildings", "damageDealtToObjectives", "damageDealtToTurrets", "damageSelfMitigated", "magicDamageDealt", "magicDamageDealtToChampions", "magicDamageTaken",
    "physicalDamageDealt", "physicalDamageDealtToChampions", "physicalDamageTaken", "totalDamageDealt", "totalDamageDealtToChampions", "totalDamageShieldedOnTeammates",
    "totalDamageTaken", "totalHeal", "totalHealsOnTeammates", "totalUnitsHealed", "trueDamageDealt", "trueDamageDealtToChampions", "trueDamageTaken",
    "totalTimeCCDealt", "timeCCingOthers", "totalTimeSpentDead", "dmg_per_minute_diff", "dmg_per_minute_diff_15", "kills", "deaths", "assists", "kill_share", "kill_participation",
    "doubleKills", "tripleKills", "quadraKills", "pentaKills", "firstBloodAssist", "firstBloodKill", "killingSprees", "largestKillingSpree", "largestMultiKill",
    "goldEarned", "goldSpent", "gold_share", "gold_earned_per_min", "gold_diff_15", "gold_10k_time",
    "inhibitorKills", "inhibitorTakedowns", "inhibitorsLost", 
    "itemsPurchased", "consumablesPurchased",
    "largestCriticalStrike", "longestTimeSpentLiving",
    "firstTowerAssist", "firstTowerKill", "objectivesStolen", "objectivesStolenAssists", "turretKills", "turretTakedowns", "turretsLost",
    "sightWardsBoughtInGame", "visionScore", "visionWardsBoughtInGame", "detectorWardsPlaced", "wardsKilled", "wardsPlaced",
    "spell1Casts", "spell2Casts", "spell3Casts", "spell4Casts", "summoner1Casts", "summoner2Casts",
    "lane_proximity", "jungle_proximity", "percent_mid_lane", "percent_side_lanes", "forward_percentage", "counter_jungle_time_percentage",
]


# select only relevant cols
soloq = soloq_games[ relevant_cols ]


In [3]:
def clean_data(df, role = "None", patch = "All", stratified_sampling = False):
    games_df = df
    if patch != "All":
        games_df = games_df[ games_df['patch'] == patch]
    else:
        games_df = games_df
        
    if role != "None":
        games_df = games_df[ games_df['teamPosition'] == role ]
    else:
        games_df = games_df
    # list of champions with more than 100 games played
    top_champs = [i for i, x in games_df.championName.value_counts().to_dict().items() if x > 100]
    games_df = games_df[games_df['championName'].isin(top_champs)]
    if stratified_sampling:
        games_df = games_df.groupby(by='championName').apply(lambda x: x.sample(n=100)).reset_index(level=1, drop=True).drop(['championName'], axis=1).reset_index()
    try:
        games_df = games_df.drop(['teamPosition'], axis=1)
        games_df = games_df.drop(['patch'], axis=1)
        games_df = games_df.drop(['Unnamed: 0'], axis=1)
    except Exception as e:
        print(e)
    return games_df

In [4]:
general_soloq = clean_data(soloq, role="None", patch="12.5", stratified_sampling = False)

top_soloq = clean_data(soloq, role="TOP", patch="12.5", stratified_sampling = False)
jungle_soloq = clean_data(soloq, role="JUNGLE", patch="12.5", stratified_sampling = False)
mid_soloq = clean_data(soloq, role="MIDDLE", patch="12.5", stratified_sampling = False)
bottom_soloq = clean_data(soloq, role="BOTTOM", patch="12.5", stratified_sampling = False)
utility_soloq = clean_data(soloq, role="UTILITY", patch="12.5", stratified_sampling = False)

"['Unnamed: 0'] not found in axis"
"['Unnamed: 0'] not found in axis"
"['Unnamed: 0'] not found in axis"
"['Unnamed: 0'] not found in axis"
"['Unnamed: 0'] not found in axis"
"['Unnamed: 0'] not found in axis"


In [8]:
general_soloq.to_csv("../data/soloq/clean/general_soloq.csv")
top_soloq.to_csv("../data/soloq/clean/top_soloq.csv")
jungle_soloq.to_csv("../data/soloq/clean/jungle_soloq.csv")
mid_soloq.to_csv("../data/soloq/clean/mid_soloq.csv")
bottom_soloq.to_csv("../data/soloq/clean/bottom_soloq.csv")
utility_soloq.to_csv("../data/soloq/clean/utility_soloq.csv")