In [1]:
# Dependencies
import pandas as pd
import numpy as np
from pprint import pprint

In [4]:
# Read CSV 
csv = "lichess_2015.csv"
games = pd.read_csv(csv)
games.head()

Unnamed: 0.1,Unnamed: 0,White Elo,Black Elo,Result,Opening,ECO,White Score
0,0,1489,1797,0-1,French Defense: Carlson Gambit,C01,-1
1,1,1379,1558,0-1,Lion Defense: Lion's Jaw,B07,-1
2,2,1666,1258,1-0,"King's Gambit Accepted, Fischer Defense",C34,1
3,3,1528,1662,0-1,Scandinavian Defense: Mieses-Kotroc Variation,B01,-1
4,4,1528,1504,1-0,Scandinavian Defense: Mieses-Kotroc Variation,B01,1


In [5]:
winner = []
translate = ["Black", "Draw", "White"]
for score in games["White Score"]:
    winner.append(translate[score + 1])

In [7]:
games["Winner"] = winner
games.head()

Unnamed: 0.1,Unnamed: 0,White Elo,Black Elo,Result,Opening,ECO,White Score,Winner
0,0,1489,1797,0-1,French Defense: Carlson Gambit,C01,-1,Black
1,1,1379,1558,0-1,Lion Defense: Lion's Jaw,B07,-1,Black
2,2,1666,1258,1-0,"King's Gambit Accepted, Fischer Defense",C34,1,White
3,3,1528,1662,0-1,Scandinavian Defense: Mieses-Kotroc Variation,B01,-1,Black
4,4,1528,1504,1-0,Scandinavian Defense: Mieses-Kotroc Variation,B01,1,White


In [15]:
games["Skill Disparity"] = abs(games["White Elo"] - games["Black Elo"])
# Favored
favored = []
favor = games["White Elo"] > games["Black Elo"]
for value in favor:
    if value == True:
        favored.append("White")
    else:
        favored.append("Black")
games["Favored"] = favored

# Upset
upset = []
ups = (games["Favored"] != games["Winner"]) & (games["Winner"] != "Draw")
for value in ups:
    if value == True:
        upset.append("True")
    else:
        upset.append("False")
games["Upset"] = upset

# Skill Disparity Bin
bin = (0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 3000)
label = ["0-100", "101-200", "201-300", "301-400", "401-500", "501-600", "601-700", "701-800", "801-900", "901-1000", "1000+"]

games["Disparity Group"] = pd.cut(games["Skill Disparity"], bin, labels=label)
games.head()

Unnamed: 0.1,Unnamed: 0,White Elo,Black Elo,Result,Opening,ECO,White Score,Winner,Skill Disparity,Favored,Upset,Disparity Group
0,0,1489,1797,0-1,French Defense: Carlson Gambit,C01,-1,Black,308,Black,False,301-400
1,1,1379,1558,0-1,Lion Defense: Lion's Jaw,B07,-1,Black,179,Black,False,101-200
2,2,1666,1258,1-0,"King's Gambit Accepted, Fischer Defense",C34,1,White,408,White,False,401-500
3,3,1528,1662,0-1,Scandinavian Defense: Mieses-Kotroc Variation,B01,-1,Black,134,Black,False,101-200
4,4,1528,1504,1-0,Scandinavian Defense: Mieses-Kotroc Variation,B01,1,White,24,White,False,0-100


In [16]:
games_upset = games.loc[games["Upset"] == "True"]
upset = games_upset["Upset"].count()
games_expected = games.loc[games["Upset"] == "False"]
expected = games_expected["Upset"].count()

total_games = upset + expected
upset_percent = round(100 * upset/total_games, 2)
expected_percent = round(100 * expected/total_games, 2)


print(f"Total games: {total_games}")
print(f"Number of upset victories: {upset}, ({upset_percent}%)")
print(f"Number of expected victories: {expected}, ({expected_percent}%)")

Total games: 2000000
Number of upset victories: 670434, (33.52%)
Number of expected victories: 1329566, (66.48%)


In [18]:
# Disparity groups of each match
disparity_matches = games.groupby(games["Disparity Group"]).count()["Result"]
disparity_matches

Disparity Group
0-100       945662
101-200     531463
201-300     264116
301-400     130391
401-500      62953
501-600      31371
601-700      14845
701-800       6684
801-900       3096
901-1000      1449
1000+          824
Name: Result, dtype: int64

In [19]:
# Upsets by disparity group
group_1 = games.loc[games["Disparity Group"] == "0-100"]
group_2 = games.loc[games["Disparity Group"] == "101-200"]
group_3 = games.loc[games["Disparity Group"] == "201-300"]
group_4 = games.loc[games["Disparity Group"] == "301-400"]
group_5 = games.loc[games["Disparity Group"] == "401-500"]
group_6 = games.loc[games["Disparity Group"] == "501-600"]
group_7 = games.loc[games["Disparity Group"] == "601-700"]
group_8 = games.loc[games["Disparity Group"] == "701-800"]
group_9 = games.loc[games["Disparity Group"] == "801-900"]
group_10 = games.loc[games["Disparity Group"] == "901-1000"]
group_11 = games.loc[games["Disparity Group"] == "1000+"]

In [21]:
# Upsets in 0-100 group
upset_1 = group_1.loc[group_1["Upset"] == "True"].count()["Upset"]
upset_1_p = round(100 * upset_1/disparity_matches[0], 2)
print(f"Upsets: {upset_1} ({upset_1_p}%)")

# Upsets in 101-200 group
upset_2 = group_2.loc[group_2["Upset"] == "True"].count()["Upset"]
upset_2_p = round(100 * upset_2/disparity_matches[1], 2)
print(f"Upsets (101-200): {upset_2} ({upset_2_p}%)")

# Upsets in 201-300 group
upset_3 = group_3.loc[group_3["Upset"] == "True"].count()["Upset"]
upset_3_p = round(100 * upset_3/disparity_matches[2], 2)
print(f"Upsets (201-300): {upset_3} ({upset_3_p}%)")

# Upsets in 301-400 group
upset_4 = group_4.loc[group_4["Upset"] == "True"].count()["Upset"]
upset_4_p = round(100 * upset_4/disparity_matches[3], 2)
print(f"Upsets (301-400): {upset_4} ({upset_4_p}%)")

# Upsets in 401-500 group
upset_5 = group_5.loc[group_5["Upset"] == "True"].count()["Upset"]
upset_5_p = round(100 * upset_5/disparity_matches[4], 2)
print(f"Upsets (401-500): {upset_5} ({upset_5_p}%)")

# Upsets in 501-600 group
upset_6 = group_6.loc[group_6["Upset"] == "True"].count()["Upset"]
upset_6_p = round(100 * upset_6/disparity_matches[5], 2)
print(f"Upsets (501-600): {upset_6} ({upset_6_p}%)")

# Upsets in 601-700 group
upset_7 = group_7.loc[group_7["Upset"] == "True"].count()["Upset"]
upset_7_p = round(100 * upset_7/disparity_matches[6], 2)
print(f"Upsets (601-700): {upset_7} ({upset_7_p}%)")

# Upsets in 701-800 group
upset_8 = group_8.loc[group_8["Upset"] == "True"].count()["Upset"]
upset_8_p = round(100 * upset_8/disparity_matches[7], 2)
print(f"Upsets (701-800): {upset_8} ({upset_8_p}%)")

# Upsets in 801-900 group
upset_9 = group_9.loc[group_9["Upset"] == "True"].count()["Upset"]
upset_9_p = round(100 * upset_9/disparity_matches[8], 2)
print(f"Upsets (801-900): {upset_9} ({upset_9_p}%)")

# Upsets in 901-1000 group 
upset_10 = group_10.loc[group_10["Upset"] == "True"].count()["Upset"]
upset_10_p = round(100 * upset_10/disparity_matches[9], 2)
print(f"Upsets (901-1000): {upset_10} ({upset_10_p}%)")

# Upsets in 1000+ group
upset_11 = group_11.loc[group_11["Upset"] == "True"].count()["Upset"]
upset_11_p = round(100 * upset_11/disparity_matches[10], 2)
print(f"Upsets (1000+): {upset_11} ({upset_11_p}%)")

Upsets: 401826 (42.49%)
Upsets (101-200): 169817 (31.95%)
Upsets (201-300): 60997 (23.09%)
Upsets (301-400): 21718 (16.66%)
Upsets (401-500): 7729 (12.28%)
Upsets (501-600): 3000 (9.56%)
Upsets (601-700): 1088 (7.33%)
Upsets (701-800): 442 (6.61%)
Upsets (801-900): 192 (6.2%)
Upsets (901-1000): 76 (5.24%)
Upsets (1000+): 35 (4.25%)
