In [1]:
import pandas as pd
import numpy as np

In [2]:
cols = ['gameid', 'league', 'year', 'game',
        'patch', 'side', 'position', 'teamname',
        'champion', 'ban1', 'ban2', 'ban3', 'ban4', 'ban5', 
        'gamelength', 'result', 'kills', 'deaths', 
        'assists', 'teamkills','teamdeaths', 'damagetochampions',
        'wardsplaced', 'wardskilled', 'controlwardsbought',
        'totalgold', 'total cs'
        ]

In [3]:
df_2021 = pd.read_csv(
    './data/2021_LoL_esports_match_data_from_OraclesElixir.csv', usecols=cols)
df_2020 = pd.read_csv(
    './data/2020_LoL_esports_match_data_from_OraclesElixir.csv', usecols=cols)
df_2019 = pd.read_csv(
    './data/2019_LoL_esports_match_data_from_OraclesElixir.csv', usecols=cols)
df_2018 = pd.read_csv(
    './data/2018_LoL_esports_match_data_from_OraclesElixir.csv', usecols=cols)
df_2022 = pd.read_csv(
    './data/2022_LoL_esports_match_data_from_OraclesElixir.csv', usecols=cols)
df_2022.head()

Unnamed: 0,gameid,league,year,game,patch,side,position,teamname,champion,ban1,...,deaths,assists,teamkills,teamdeaths,damagetochampions,wardsplaced,wardskilled,controlwardsbought,totalgold,total cs
0,ESPORTSTMNT01_2690210,LCK CL,2022,1,12.01,Blue,top,Fredit BRION Challengers,Renekton,Karma,...,3,2,9,19,15768.0,8.0,6.0,5.0,10934,231.0
1,ESPORTSTMNT01_2690210,LCK CL,2022,1,12.01,Blue,jng,Fredit BRION Challengers,Xin Zhao,Karma,...,5,6,9,19,11765.0,6.0,18.0,6.0,9138,148.0
2,ESPORTSTMNT01_2690210,LCK CL,2022,1,12.01,Blue,mid,Fredit BRION Challengers,LeBlanc,Karma,...,2,3,9,19,14258.0,19.0,7.0,7.0,9715,193.0
3,ESPORTSTMNT01_2690210,LCK CL,2022,1,12.01,Blue,bot,Fredit BRION Challengers,Samira,Karma,...,4,2,9,19,11106.0,12.0,6.0,4.0,10605,226.0
4,ESPORTSTMNT01_2690210,LCK CL,2022,1,12.01,Blue,sup,Fredit BRION Challengers,Leona,Karma,...,5,6,9,19,3663.0,29.0,14.0,11.0,6678,42.0


In [4]:
df = pd.concat([df_2022, df_2021, df_2020, df_2019, df_2018])

I only want the professional series games which include:

- LCK
- LPL
- LEC
- LCS
- LJL
- PCS
- VCS
- LLA
- LAS

In [5]:
df['league'].unique()

array(['LCK CL', 'LPL', 'NLC', 'SL', 'Proving Grounds Circuit', 'UL',
       'PRM', 'LCK', 'LFL', 'LEC', 'LCS', 'LFL2', 'GLL', 'HM', 'ESLOL',
       'EBL', 'LPLOL', 'PGN', 'LCSA', 'LVP DDH', 'TAL', 'TCL', 'CBLOL',
       'LCO', 'LHE', 'GL', 'EL', 'CBLOLA', 'LMF', 'VL', 'LLA', 'HC',
       'LDL', 'LJL', 'PCS', 'VCS', 'UPL', 'LCL', 'NEXO', 'EM', 'LAS',
       'MSI', 'LJLA', 'CT', 'WCS', 'CDF', 'IC', 'DC', 'KeSPA', 'CU', 'BL',
       'RCL', 'DL', 'UKLC', 'OTBLX', 'BIG', 'BM', 'UGP', 'GSG', 'HS',
       'AOL', 'NERD', 'EGL', 'NASG', 'SLO', 'OPL', 'BRCC', 'CK', 'OCS',
       'Riot', 'MSC', 'NEST', 'LMS', 'LGL', 'EU LCS', 'LLN', 'CLS',
       'NA LCS', 'TCS', 'LJLCS', 'CIS CL', 'GPL'], dtype=object)

In [6]:
print(f'Start shape: {df.shape}')
professional_leagues = set('LCK LPL LEC LCS LJL PCS VCS LLA CBLOL'.split())
df = df[df['league'].isin(professional_leagues)].reset_index(drop=True)
print(f'End shape: {df.shape}')

Start shape: (596762, 27)
End shape: (159998, 27)


In [7]:
df.isna().sum()

gameid                   48
league                    0
year                      0
game                     24
patch                     0
side                      0
position                  0
teamname                  0
champion              26668
ban1                    228
ban2                     78
ban3                    264
ban4                    228
ban5                    540
gamelength                0
result                    0
kills                     0
deaths                    0
assists                   0
teamkills                 0
teamdeaths                0
damagetochampions       216
wardsplaced             216
wardskilled             216
controlwardsbought      216
totalgold                60
total cs              25250
dtype: int64

We need to have champion in the final dataset so we'll remove any rows that dont have a champion

In [8]:
clean = df.dropna(subset=['champion', 'gameid', 'damagetochampions',
                          'wardsplaced', 'wardskilled', 'controlwardsbought',
                          'totalgold',
                          'total cs' ])
clean.head()

Unnamed: 0,gameid,league,year,game,patch,side,position,teamname,champion,ban1,...,deaths,assists,teamkills,teamdeaths,damagetochampions,wardsplaced,wardskilled,controlwardsbought,totalgold,total cs
0,8401-8401_game_1,LPL,2022,1.0,12.01,Blue,top,Oh My God,Gwen,Renekton,...,0,4,13,6,11188.0,7.0,5.0,3.0,9123.0,172.0
1,8401-8401_game_1,LPL,2022,1.0,12.01,Blue,jng,Oh My God,Jarvan IV,Renekton,...,1,13,13,6,4426.0,7.0,10.0,5.0,9041.0,145.0
2,8401-8401_game_1,LPL,2022,1.0,12.01,Blue,mid,Oh My God,Syndra,Renekton,...,0,5,13,6,12577.0,8.0,2.0,4.0,9928.0,212.0
3,8401-8401_game_1,LPL,2022,1.0,12.01,Blue,bot,Oh My God,Jinx,Renekton,...,1,6,13,6,9618.0,23.0,12.0,7.0,10778.0,199.0
4,8401-8401_game_1,LPL,2022,1.0,12.01,Blue,sup,Oh My God,Nautilus,Renekton,...,4,7,13,6,2276.0,34.0,4.0,13.0,6598.0,22.0


In [9]:
clean.isna().sum()

gameid                  0
league                  0
year                    0
game                   20
patch                   0
side                    0
position                0
teamname                0
champion                0
ban1                  190
ban2                   65
ban3                  220
ban4                  190
ban5                  450
gamelength              0
result                  0
kills                   0
deaths                  0
assists                 0
teamkills               0
teamdeaths              0
damagetochampions       0
wardsplaced             0
wardskilled             0
controlwardsbought      0
totalgold               0
total cs                0
dtype: int64

If banX is N/A, the team simply didn't ban for that slot and we will set it to None

In [10]:
clean.loc[:, ['ban1', 'ban2', 'ban3', 'ban4', 'ban5']
      ] = clean[['ban1', 'ban2', 'ban3', 'ban4', 'ban5']].fillna('None')
clean.loc[:, ['game']] = clean.loc[:, ['game']].fillna(0)
clean.isna().sum().sum()

0

## Now that we have no NA values, I'm going to build our tabular data
We want to be able to do the following:
- Kills per champion
- Deaths per champion
- Playrate per champion
- Gold difference for each team per year
- Win percentages for each team per year

In [31]:
champ_wins_by_year = clean.groupby(['champion'])[
    ['result']].mean().reset_index()
champ_wins_by_year

Unnamed: 0,champion,result
0,Aatrox,0.495035
1,Ahri,0.527273
2,Akali,0.520000
3,Akshan,0.500000
4,Alistar,0.504217
...,...,...
156,Zeri,0.553826
157,Ziggs,0.487179
158,Zilean,0.497297
159,Zoe,0.505302


## Build our geo-json dataset
We want to be able to do the following
- Teams that win the most in each region
- Champions that win the most in each region
- Champions that lose the most in each region

In [28]:
regional_data = clean.groupby(['gameid', 'league'])[['deaths', 'assists', 'kills', 'damagetochampions', 'wardsplaced', 'wardskilled', 'controlwardsbought', 'totalgold', 'total cs']].sum().reset_index()

In [29]:
regional_data

Unnamed: 0,gameid,league,deaths,assists,kills,damagetochampions,wardsplaced,wardskilled,controlwardsbought,totalgold,total cs
0,2899-3157,LPL,16,45,16,88856.0,242.0,96.0,56.0,97008.0,1913.0
1,2899-3158,LPL,16,45,16,112750.0,198.0,66.0,53.0,105411.0,2047.0
2,2899-3159,LPL,32,72,32,149765.0,277.0,91.0,87.0,135678.0,2569.0
3,2900-3160,LPL,29,63,28,143467.0,379.0,151.0,74.0,134594.0,2570.0
4,2900-3161,LPL,36,97,36,197290.0,399.0,159.0,86.0,157946.0,2777.0
...,...,...,...,...,...,...,...,...,...,...,...
13302,TRTW/1850192,VCS,61,126,61,263834.0,269.0,118.0,88.0,177317.0,2809.0
13303,TRTW/1850197,VCS,22,52,22,74881.0,128.0,42.0,41.0,91069.0,1594.0
13304,TRTW/1850198,VCS,55,107,55,163286.0,256.0,90.0,107.0,138529.0,2132.0
13305,TRTW/1850214,VCS,36,58,36,107742.0,164.0,68.0,76.0,96900.0,1536.0


## Build our node data
We want to be able to do the following:
- champion playrate with each other
- champion win percentage with each other

In [12]:
import itertools

In [13]:
# Create a new DataFrame by grouping the original DataFrame by 'gameid' and 'side'
grouped_df = clean.groupby(['gameid', 'side'])

# Create an empty dictionary to store champion pairs and their counts
champion_pairs = {}
champion_pairs_wins = {}

# Iterate over the grouped DataFrame
for group_name, group_data in grouped_df:

    # Extract the champions in the current group
    champions = group_data['champion'].unique()

    # Generate all possible combinations of champions
    combinations = list(itertools.combinations(champions, 2))

    # Increment the count for each champion pair
    for pair in combinations:
        if pair in champion_pairs:
            champion_pairs[pair] += 1
            champion_pairs_wins[pair] += group_data['result'].iloc[0]
        else:
            champion_pairs[pair] = 1
            champion_pairs_wins[pair] = group_data['result'].iloc[0]

# Create a new DataFrame from the champion_pairs dictionary
champion_pairs_df = pd.DataFrame(champion_pairs.items(), columns=[
                                 'champion_pair', 'count']).sort_values('count', ascending=False)

# Find the champion pair playrate
champion_plays = clean.groupby(['champion'])['side'].count()

def calc_playrate(pair, comb_plays):
    return (comb_plays / (champion_plays[list(pair)].sum() - comb_plays))


# Divide the champion pairs df by the sum of the times each individual champion was played
champion_pairs_df['playrate'] = champion_pairs_df.apply(
    lambda x: calc_playrate(x[0], x[1]), axis=1)


In [14]:
champion_pairs_df = champion_pairs_df.reset_index(drop=True)
champion_pairs_df.head()

Unnamed: 0,champion_pair,count,playrate
0,"(Xayah, Rakan)",908,0.290746
1,"(Aphelios, Thresh)",753,0.16857
2,"(Aphelios, Nautilus)",560,0.102903
3,"(Ezreal, Braum)",485,0.094838
4,"(Kai'Sa, Alistar)",484,0.109875


In [15]:
champion_pair_wins = pd.DataFrame(champion_pairs_wins.items(), columns=[
    'champion_pair', 'wins']).sort_values('wins', ascending=False)
node_data = pd.merge(champion_pairs_df, champion_pair_wins,
                     how='left', on='champion_pair')
node_data[['champ1', 'champ2']
          ] = node_data['champion_pair'].apply(pd.Series)

In [16]:
node_data[node_data.columns[1:]].to_csv('node_data.csv', index=False)