In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns=100
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
import sklearn.metrics
from sklearn.preprocessing import StandardScaler as SSc
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.neighbors import KNeighborsRegressor as KNR
import matplotlib.pyplot as plt
%matplotlib inline

#set width of window to preference
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
display(HTML("<style>.output.output_scroll{ height:100% !important; }</style>")) #breaks scroll output vertical so you see the whole output, disable this if you prefer.

In [2]:
data = pd.read_csv("Data-Prepped.csv",index_col=0)
data = data.astype(np.float32)
data.head()

Unnamed: 0,Bronze,Silver,Gold,Platinum,Diamond,Master,GrandMaster,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,27.0,10.0,3000.0,143.718002,0.003515,0.00022,7.0,0.00011,0.000392,0.004849,32.667702,40.867298,4.7508,28.0,0.001397,6.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,23.0,10.0,5000.0,129.232193,0.003304,0.000259,4.0,0.000294,0.000432,0.004307,32.919399,42.345402,4.8434,22.0,0.001194,5.0,0.0,0.000208
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0,30.0,10.0,200.0,69.961197,0.001101,0.000336,4.0,0.000294,0.000461,0.002926,44.647499,75.354797,4.043,22.0,0.000745,6.0,0.0,0.000189
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,19.0,20.0,400.0,107.601601,0.001034,0.000213,1.0,5.3e-05,0.000543,0.003783,29.220301,53.735199,4.9155,19.0,0.000426,7.0,0.0,0.000384
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,32.0,10.0,500.0,122.8908,0.001136,0.000327,2.0,0.0,0.001329,0.002368,22.688499,62.081299,9.374,15.0,0.001174,4.0,0.0,1.9e-05


---
## 1 - Trying to predict the player's APM (Actions Per Minute)

#### Split data into X,Y where Y is the length of the game, then normalize the inputs.

In [3]:
X = data.iloc[:,data.columns != 'APM']
Y = data.iloc[:,data.columns == 'APM']
#transform input data (normalize scaling)
ssc = SSc()
Xft = ssc.fit_transform(X)
X = pd.DataFrame(Xft)
print("Xtr(Xtrain),Xtst(Xtest),Ytr(Ytrain),Ytst(Ytest) shapes: ")
Xtr,Xtst,Ytr,Ytst = train_test_split(X,Y.values.ravel(),test_size=0.2,random_state=2021)
print(Xtr.shape,Xtst.shape,Ytr.shape,Ytst.shape)

Xtr(Xtrain),Xtst(Xtest),Ytr(Ytrain),Ytst(Ytest) shapes: 
(2670, 25) (668, 25) (2670,) (668,)


In [4]:
print("Classifier scores:\n"+"-"*18+"\n")
for i in range(3,8,2):
    tree = DTR(max_depth=i+1)
    tree.fit(Xtr,Ytr)
    scr = cross_val_score(tree,Xtst,Ytst, cv=5)
    print("Tree of depth "+str(i)+"\nscore avg:"+str(sum(scr)/5)+"\nscore = "+str(scr)+"\n\n"+"-"*64)

for i in range(1,28,3):
    dpth = 7
    forest = RFR(n_estimators=i,max_depth=dpth)
    forest.fit(Xtr,Ytr)
    scr = cross_val_score(forest,Xtst,Ytst, cv=5)
    print("\nRandom Forest trees = "+str(i)+" depth = "+str(dpth)+" \nscore avg: "+str(sum(scr)/5)+" \nscores: "+str(scr))

print("\n"+"-"*64)
    
for i in range(1,10):
    knn = KNR(n_neighbors=i)
    knn.fit(Xtr,Ytr)
    scr = cross_val_score(knn,Xtst,Ytst, cv=5)
    print("\nK-Nearest Neighbors "+str(i)+"-neighbors\nscore avg:"+str(sum(scr)/5)+"\nscore = "+
          str(scr))

Classifier scores:
------------------

Tree of depth 3
score avg:0.8032197759729188
score = [0.8271212  0.83885572 0.69060868 0.78965485 0.86985842]

----------------------------------------------------------------
Tree of depth 5
score avg:0.845846191947276
score = [0.88849921 0.8856103  0.74909063 0.83008348 0.87594734]

----------------------------------------------------------------
Tree of depth 7
score avg:0.8696179557268688
score = [0.8737981  0.89376504 0.86539501 0.83220082 0.88293081]

----------------------------------------------------------------

Random Forest trees = 1 depth = 7 
score avg: 0.8478416594793909 
scores: [0.83496781 0.8392719  0.85509484 0.85377152 0.85610223]

Random Forest trees = 4 depth = 7 
score avg: 0.9143970869915241 
scores: [0.92281844 0.90360367 0.92070465 0.90471198 0.9201467 ]

Random Forest trees = 7 depth = 7 
score avg: 0.9193531761594309 
scores: [0.93155436 0.92729023 0.88264547 0.92572652 0.9295493 ]

Random Forest trees = 10 depth = 7 
s

##### Random Forests can get up to ~94% accuracy with 16-23 trees of depth 7-8
##### K-Nearest Neighbors optimizes at ~76% accuracy with 6-7 neighbors
##### I think this is somewhat easy to predict because a player will almost always being doing approximately the same number of things within a certain timeframe in a game even if those things vary based on how the game progresses.

---
## 2 - Trying to predict a player's league

#### Split data into X,Y where Y is the kills, deaths and assists, then normalize the inputs.

In [5]:
X = data.iloc[:,8:]
Y = data.iloc[:,7]
#transform input data (normalize scaling)
ssc = SSc()
Xft = ssc.fit_transform(X)
X = pd.DataFrame(Xft)
print("Xtr(Xtrain),Xtst(Xtest),Ytr(Ytrain),Ytst(Ytest) shapes: ")
Xtr,Xtst,Ytr,Ytst = train_test_split(X,Y,test_size=0.2,random_state=2021)
print(Xtr.shape,Xtst.shape,Ytr.shape,Ytst.shape)

Xtr(Xtrain),Xtst(Xtest),Ytr(Ytrain),Ytst(Ytest) shapes: 
(2670, 18) (668, 18) (2670,) (668,)


In [11]:
print("Classifier scores:\n"+"-"*18+"\n")
for i in range(1,5):
    tree = DTR(max_depth=i+1)
    tree.fit(Xtr,Ytr)
    scr = cross_val_score(tree,Xtst,Ytst, cv=5)
    print("Tree of depth "+str(i)+"\nscore avg:"+str(sum(scr)/5)+"\nscore = "+str(scr)+"\n\n"+"-"*64)

for i in range(1,28,3):
    dpth = 12
    forest = RFR(n_estimators=i,max_depth=dpth)
    forest.fit(Xtr,Ytr)
    scr = cross_val_score(forest,Xtst,Ytst, cv=5)
    print("\nRandom Forest trees = "+str(i)+" depth = "+str(dpth)+" \nscore avg: "+str(sum(scr)/5)+" \nscores: "+str(scr))

print("\n"+"-"*64)
    
for i in range(1,10):
    knn = KNR(n_neighbors=i)
    knn.fit(Xtr,Ytr)
    scr = cross_val_score(knn,Xtst,Ytst, cv=5)
    print("\nK-Nearest Neighbors "+str(i)+"-neighbors\nscore avg:"+str(sum(scr)/5)+"\nscore = "+
          str(scr))

Classifier scores:
------------------

Tree of depth 1
score avg:0.3403440334409956
score = [0.38913359 0.36448855 0.34194207 0.24783451 0.35832145]

----------------------------------------------------------------
Tree of depth 2
score avg:0.3576686385080065
score = [0.43001907 0.33181201 0.33993957 0.30824462 0.37832793]

----------------------------------------------------------------
Tree of depth 3
score avg:0.35534726368757885
score = [0.42045359 0.33843993 0.38518068 0.27673255 0.35592957]

----------------------------------------------------------------
Tree of depth 4
score avg:0.3437868693835167
score = [0.45785779 0.36198845 0.35051617 0.23465786 0.31391407]

----------------------------------------------------------------

Random Forest trees = 1 depth = 12 
score avg: 0.16651361836639175 
scores: [0.19109599 0.38560725 0.03772168 0.06057304 0.15757012]

Random Forest trees = 4 depth = 12 
score avg: 0.4299119387716333 
scores: [0.46009776 0.40339719 0.4225009  0.39959224 0

##### It seems very difficult to predict someone's league based on the data I have. I think that this is partially due to the skill levels not being vastly different between leagues and different players may be good at some skills and poor at others, i.e. platinum level in worker production but only silver level in army management

---
## 3 - Trying to predict a player's Action Latency (How long between focusing on a location and giving a command in that location)

In [12]:
X = data.iloc[:,data.columns != 'ActionLatency']
Y = data.iloc[:,data.columns == 'ActionLatency']
#transform input data (normalize scaling)
ssc = SSc()
Xft = ssc.fit_transform(X)
X = pd.DataFrame(Xft)
print("Xtr(Xtrain),Xtst(Xtest),Ytr(Ytrain),Ytst(Ytest) shapes: ")
Xtr,Xtst,Ytr,Ytst = train_test_split(X,Y.values.ravel(),test_size=0.2,random_state=2021)
print(Xtr.shape,Xtst.shape,Ytr.shape,Ytst.shape)

Xtr(Xtrain),Xtst(Xtest),Ytr(Ytrain),Ytst(Ytest) shapes: 
(2670, 25) (668, 25) (2670,) (668,)


In [21]:
print("Classifier scores:\n"+"-"*18+"\n")
for i in range(1,6):
    tree = DTR(max_depth=i+1)
    tree.fit(Xtr,Ytr)
    scr = cross_val_score(tree,Xtst,Ytst, cv=5)
    print("Tree of depth "+str(i)+"\nscore avg:"+str(sum(scr)/5)+"\nscore = "+str(scr)+"\n\n"+"-"*64)

for i in range(1,28,3):
    dpth = 6
    forest = RFR(n_estimators=i,max_depth=dpth)
    forest.fit(Xtr,Ytr)
    scr = cross_val_score(forest,Xtst,Ytst, cv=5)
    print("\nRandom Forest trees = "+str(i)+" depth = "+str(dpth)+" \nscore avg: "+str(sum(scr)/5)+" \nscores: "+str(scr))

print("\n"+"-"*64)
    
for i in range(1,10):
    knn = KNR(n_neighbors=i)
    knn.fit(Xtr,Ytr)
    scr = cross_val_score(knn,Xtst,Ytst, cv=5)
    print("\nK-Nearest Neighbors "+str(i)+"-neighbors\nscore avg:"+str(sum(scr)/5)+"\nscore = "+
          str(scr))

Classifier scores:
------------------

Tree of depth 1
score avg:0.6368445728023168
score = [0.67453979 0.69461853 0.6032955  0.53266655 0.6791025 ]

----------------------------------------------------------------
Tree of depth 2
score avg:0.7292429401021139
score = [0.71571997 0.77642274 0.76269725 0.65691945 0.7344553 ]

----------------------------------------------------------------
Tree of depth 3
score avg:0.7338384142991337
score = [0.71933252 0.78908394 0.77211738 0.6944908  0.69416744]

----------------------------------------------------------------
Tree of depth 4
score avg:0.7621940043128811
score = [0.76226648 0.78999556 0.79599369 0.73398329 0.72873101]

----------------------------------------------------------------
Tree of depth 5
score avg:0.71589816741241
score = [0.72061511 0.78254823 0.75697259 0.65591432 0.66344059]

----------------------------------------------------------------

Random Forest trees = 1 depth = 6 
score avg: 0.6954008106423517 
scores: [0.73602

##### Random Forests can get up to ~82% accuracy with 10-13 trees of depth 4-6
##### K-Nearest Neighbors optimizes at ~65% accuracy with 7 neighbors
##### This seems fairly easy to predict which is likely because players will be more or less consistent in their reaction speed (especially since these are ranked games most people won't play if they're overly fatigued so they will be more consistent).

---
## 4 - Trying to predict how many PACs a player goes through in a game

In [24]:
X = data.iloc[:,data.columns != 'NumberOfPACs']
Y = data.iloc[:,data.columns == 'NumberOfPACs']
#transform input data (normalize scaling)
ssc = SSc()
Xft = ssc.fit_transform(X)
X = pd.DataFrame(Xft)
print("Xtr(Xtrain),Xtst(Xtest),Ytr(Ytrain),Ytst(Ytest) shapes: ")
Xtr,Xtst,Ytr,Ytst = train_test_split(X,Y.values.ravel(),test_size=0.2,random_state=2021)
print(Xtr.shape,Xtst.shape,Ytr.shape,Ytst.shape)

Xtr(Xtrain),Xtst(Xtest),Ytr(Ytrain),Ytst(Ytest) shapes: 
(2670, 25) (668, 25) (2670,) (668,)


In [39]:
print("Classifier scores:\n"+"-"*18+"\n")
for i in range(1,6):
    tree = DTR(max_depth=i+1)
    tree.fit(Xtr,Ytr)
    scr = cross_val_score(tree,Xtst,Ytst, cv=5)
    print("Tree of depth "+str(i)+"\nscore avg:"+str(sum(scr)/5)+"\nscore = "+str(scr)+"\n\n"+"-"*64)

for i in range(5,29,2):
    dpth = 12
    forest = RFR(n_estimators=i,max_depth=dpth)
    forest.fit(Xtr,Ytr)
    scr = cross_val_score(forest,Xtst,Ytst, cv=5)
    print("\nRandom Forest trees = "+str(i)+" depth = "+str(dpth)+" \nscore avg: "+str(sum(scr)/5)+" \nscores: "+str(scr))

print("\n"+"-"*64)
    
for i in range(1,13):
    knn = KNR(n_neighbors=i)
    knn.fit(Xtr,Ytr)
    scr = cross_val_score(knn,Xtst,Ytst, cv=5)
    print("\nK-Nearest Neighbors "+str(i)+"-neighbors\nscore avg:"+str(sum(scr)/5)+"\nscore = "+
          str(scr))

Classifier scores:
------------------

Tree of depth 1
score avg:0.6711838824362658
score = [0.65221136 0.73829029 0.66072585 0.64460341 0.6600885 ]

----------------------------------------------------------------
Tree of depth 2
score avg:0.7524418025822117
score = [0.79386859 0.79296622 0.69178476 0.71300382 0.77058562]

----------------------------------------------------------------
Tree of depth 3
score avg:0.7853211685955301
score = [0.80870162 0.79723699 0.76438274 0.75593614 0.80034836]

----------------------------------------------------------------
Tree of depth 4
score avg:0.7954759079649211
score = [0.81214763 0.7889671  0.80237457 0.76950567 0.80438458]

----------------------------------------------------------------
Tree of depth 5
score avg:0.795451799869187
score = [0.82894022 0.76949658 0.78655605 0.78639268 0.80587347]

----------------------------------------------------------------

Random Forest trees = 1 depth = 12 
score avg: 0.7233187484025182 
scores: [0.806

##### Random Forests can get up to ~88-89% accuracy with 15-20 trees of depth 7-9
##### K-Nearest Neighbors optimizes at ~63% accuracy with 6 neighbors

---
## 5 - Trying to predict how many workers a player produces per a given amount of time

In [42]:
X = data.iloc[:,data.columns != 'WorkersMade']
Y = data.iloc[:,data.columns == 'WorkersMade']
#transform input data (normalize scaling)
ssc = SSc()
Xft = ssc.fit_transform(X)
X = pd.DataFrame(Xft)
print("Xtr(Xtrain),Xtst(Xtest),Ytr(Ytrain),Ytst(Ytest) shapes: ")
Xtr,Xtst,Ytr,Ytst = train_test_split(X,Y.values.ravel(),test_size=0.2,random_state=2021)
print(Xtr.shape,Xtst.shape,Ytr.shape,Ytst.shape)

Xtr(Xtrain),Xtst(Xtest),Ytr(Ytrain),Ytst(Ytest) shapes: 
(2670, 25) (668, 25) (2670,) (668,)


In [47]:
print("Classifier scores:\n"+"-"*18+"\n")
for i in range(1,4):
    tree = DTR(max_depth=i+1)
    tree.fit(Xtr,Ytr)
    scr = cross_val_score(tree,Xtst,Ytst, cv=5)
    print("Tree of depth "+str(i)+"\nscore avg:"+str(sum(scr)/5)+"\nscore = "+str(scr)+"\n\n"+"-"*64)

for i in range(1,29,3):
    dpth = 7
    forest = RFR(n_estimators=i,max_depth=dpth)
    forest.fit(Xtr,Ytr)
    scr = cross_val_score(forest,Xtst,Ytst, cv=5)
    print("\nRandom Forest trees = "+str(i)+" depth = "+str(dpth)+" \nscore avg: "+str(sum(scr)/5)+" \nscores: "+str(scr))

print("\n"+"-"*64)
    
for i in range(1,100,5):
    knn = KNR(n_neighbors=i)
    knn.fit(Xtr,Ytr)
    scr = cross_val_score(knn,Xtst,Ytst, cv=5)
    print("\nK-Nearest Neighbors "+str(i)+"-neighbors\nscore avg:"+str(sum(scr)/5)+"\nscore = "+
          str(scr))

Classifier scores:
------------------

Tree of depth 1
score avg:0.09921620818689791
score = [ 0.15621548  0.04931906  0.17505456  0.14254622 -0.02705429]

----------------------------------------------------------------
Tree of depth 2
score avg:0.03793248019195825
score = [ 0.15725063  0.00895247  0.16464879  0.14688575 -0.28807523]

----------------------------------------------------------------
Tree of depth 3
score avg:-0.0816491485962854
score = [ 0.07337114 -0.06509747  0.19142933  0.11847134 -0.72642009]

----------------------------------------------------------------

Random Forest trees = 1 depth = 7 
score avg: -0.7414144706368789 
scores: [-0.43475375 -1.42011903 -0.74392732 -0.17368379 -0.93458846]

Random Forest trees = 4 depth = 7 
score avg: 0.08577150304789016 
scores: [0.1715485  0.06118097 0.06990596 0.05953024 0.06669184]

Random Forest trees = 7 depth = 7 
score avg: 0.07146769135154671 
scores: [ 0.12213111  0.03732887  0.1954776   0.14339019 -0.14098931]

Rando

##### It seems almost impossible to accurately predict this metric. I think this may be at least partially due to the nature of the game being that if players are playing aggressively then much fewer workers will be made while if players are playing defensive they will be making a lot of workers. I think it would be possible to predict this in a given game with more specific game statistics, but since these are average player stats it is almost impossible.