In [60]:
import csv
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statistics as sts
from sklearn.preprocessing import (StandardScaler, RobustScaler, MinMaxScaler)
from sklearn import linear_model, metrics, preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline

## Modeling with Standard RL Stats
#### This sections includes the use of Cross Validation and the use of regression metrics to evaluate which model is the best.

In [61]:
small_data = pd.read_csv("joined_data.csv")
small_data
data = small_data[small_data.columns[4:]] # Dropping first columns.
data.head()

Unnamed: 0,core_goals,core_assists,core_saves,core_shots,core_score,demo_inflicted,demo_taken,advanced_rating
0,3,0,4,9,1119,2,2,0.86359
1,2,5,2,13,1209,2,0,1.3505
2,4,2,2,12,1077,2,5,1.23297
3,1,1,8,5,1086,1,2,0.806692
4,3,0,4,4,948,3,3,1.077983


In [62]:
mp = pd.read_csv("matches_by_players.csv")
mp
#player_list = {mp.iloc[row]["player_id"]:mp.iloc[row]["player_tag"] for row in range(len(mp))}
player_list = dict(zip(mp.player_id, mp.player_tag))

In [63]:
pd.DataFrame(list(zip(mp.player_id, mp.player_tag, mp.match_id)))

Unnamed: 0,0,1,2
0,5f3d8fdd95f40596eae2412e,Amphis,6159ad3d143c37878b2384a9
1,5f3d8fdd95f40596eae23e01,Torsos,6159ad3d143c37878b2384a9
2,5f3d8fdd95f40596eae23e53,Express,6159ad3d143c37878b2384a9
3,5f7ca648ea8a0f0714fb9a20,Laxin,6159ad3d143c37878b2384a9
4,5f3d8fdd95f40596eae24503,Baked Potato,6159ad3d143c37878b2384a9
...,...,...,...
25618,5f3d8fdd95f40596eae23f8f,Maxeew,62a3988cda9d7ca1c7bb22ba
25619,5f3d8fdd95f40596eae23f3f,Abscrazy,62a3988cda9d7ca1c7bb22ba
25620,5f9c7cde5246bf27936b4572,mikan,62a3988cda9d7ca1c7bb22ba
25621,5f3d8fdd95f40596eae2414a,Burn,62a3988cda9d7ca1c7bb22ba


In [64]:
# Splitting Data
Y = data["advanced_rating"]
X = data[['core_goals','core_assists', 'core_saves', 'core_shots','demo_inflicted', 'demo_taken']]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=404)


In [65]:
model = linear_model.LinearRegression()
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print(['core_goals','core_assists', 'core_saves', 'core_shots','demo_inflicted', 'demo_taken'])
print(f'model intercept is {model.intercept_} and model coefficients are {model.coef_}')
print(f'model score = {model.score(X_test, Y_test)}')
print(f'mse = {metrics.mean_squared_error(Y_test, y_pred)}')


['core_goals', 'core_assists', 'core_saves', 'core_shots', 'demo_inflicted', 'demo_taken']
model intercept is 0.5789492603233665 and model coefficients are [ 0.1347999   0.08098808  0.00997382 -0.00948071 -0.01101295 -0.01653932]
model score = 0.7630103711600398
mse = 0.027826834519614107


In [66]:
vs = cross_val_score(model, X_test,Y_test)
print(f'Cross validation score = {sts.mean(vs)}')

Cross validation score = 0.7619424236325764


**Feature Engineering**

In [67]:
mp1 = mp.dropna() # dropped
mp1

Unnamed: 0,match_id,team_id,team_region,player_id,player_tag,core_shots,core_goals,core_saves,core_assists,core_score,...,positioning_percent_most_back,positioning_percent_most_forward,positioning_percent_closest_to_ball,positioning_percent_farthest_from_ball,demo_inflicted,demo_taken,advanced_goal_participation,advanced_rating,score,winner
0,6159ad3d143c37878b2384a9,6020bc8ef1e4807cc700391a,Oceania,5f3d8fdd95f40596eae2412e,Amphis,9.0,3.0,4.0,0.0,1119.0,...,36.110946,32.808284,32.244507,35.198802,2.0,2.0,33.333333,0.863590,3.0,True
1,6159ad3d143c37878b2384a9,6020bc8ef1e4807cc700391a,Oceania,5f3d8fdd95f40596eae23e01,Torsos,13.0,2.0,2.0,5.0,1209.0,...,30.298075,35.344441,37.177801,32.454502,2.0,0.0,77.777778,1.350500,3.0,True
2,6159ad3d143c37878b2384a9,6020bc8ef1e4807cc700391a,Oceania,5f3d8fdd95f40596eae23e53,Express,12.0,4.0,2.0,2.0,1077.0,...,35.348030,32.218735,30.874362,34.141855,2.0,5.0,66.666667,1.232970,3.0,True
3,6159ad3d143c37878b2384a9,614c8930f8090ec745286474,Oceania,5f7ca648ea8a0f0714fb9a20,Laxin,5.0,1.0,8.0,1.0,1086.0,...,33.660595,35.306617,33.799467,36.263424,1.0,2.0,40.000000,0.806692,0.0,False
4,6159ad3d143c37878b2384a9,614c8930f8090ec745286474,Oceania,5f3d8fdd95f40596eae24503,Baked Potato,4.0,3.0,4.0,0.0,948.0,...,29.400720,35.309532,35.846261,27.821551,3.0,3.0,60.000000,1.077983,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25618,62a3988cda9d7ca1c7bb22ba,621cd6a8c437fde7e02d129b,Asia-Pacific North,5f3d8fdd95f40596eae23f8f,Maxeew,18.0,6.0,10.0,8.0,2886.0,...,36.359210,33.060243,32.934982,37.126787,6.0,5.0,77.777778,1.306242,4.0,True
25619,62a3988cda9d7ca1c7bb22ba,621cd6a8c437fde7e02d129b,Asia-Pacific North,5f3d8fdd95f40596eae23f3f,Abscrazy,12.0,3.0,6.0,4.0,1719.0,...,31.465353,34.059881,33.400311,30.880090,7.0,9.0,38.888889,0.755061,4.0,True
25620,62a3988cda9d7ca1c7bb22ba,60b63cfdfec4a0857e7ba00c,Asia-Pacific North,5f9c7cde5246bf27936b4572,mikan,11.0,6.0,11.0,3.0,2462.0,...,41.291549,30.235650,33.263766,34.548356,8.0,5.0,69.230769,1.148616,3.0,False
25621,62a3988cda9d7ca1c7bb22ba,60b63cfdfec4a0857e7ba00c,Asia-Pacific North,5f3d8fdd95f40596eae2414a,Burn,27.0,6.0,5.0,2.0,2251.0,...,30.365883,36.652330,33.744413,35.952833,10.0,10.0,61.538462,0.949304,3.0,False


In [68]:
full_train = mp1.iloc[:,5:]

Using StandardScaler, Robust Scaler and MinMaxScaler and each data set will be used for a linear regression.

In [69]:
Y = full_train['advanced_rating']

X = full_train.iloc[:,:-3]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=404)

In [70]:
l_model = linear_model.LinearRegression()
l_model.fit(X_train, Y_train)
l_model.score(X_test, Y_test)

0.9462058703287378

In [71]:
feature_names = l_model.feature_names_in_
coefs = pd.DataFrame({
    'coefficients': list(feature_names),
    'value': list(l_model.coef_)
})
coefs

Unnamed: 0,coefficients,value
0,core_shots,0.013578
1,core_goals,0.070748
2,core_saves,0.022906
3,core_assists,0.076211
4,core_score,0.000075
...,...,...
77,positioning_percent_closest_to_ball,0.000751
78,positioning_percent_farthest_from_ball,-0.001180
79,demo_inflicted,0.000185
80,demo_taken,0.000455


In [72]:
lasso_model = linear_model.Lasso()
lasso_model.fit(X_train,Y_train)
lasso_score = lasso_model.score(X_test, Y_test)
print(f'Lasso model score = {lasso_score}')
coefs = pd.DataFrame({
    'coefficients': list(lasso_model.feature_names_in_),
    'value': list(lasso_model.coef_)
})
rem_coefs = coefs[coefs['value'] != 0 ] # coeffecients that 
rem_coefs

Lasso model score = 0.7617427573960649


Unnamed: 0,coefficients,value
4,core_score,0.0007457459
5,core_shooting_percentage,0.002066155
6,boost_bpm,-4.623732e-05
11,boost_amount_collected_big,2.108466e-05
14,boost_amount_stolen_small,1.255777e-05
19,boost_amount_overfill,-1.323042e-06
34,movement_avg_speed,-3.598027e-05
35,movement_total_distance,-1.386625e-07
53,positioning_avg_distance_to_ball_possession,6.540245e-05
54,positioning_avg_distance_to_ball_no_possession,-1.432022e-05


### Scalers
MinMax Scaler

In [73]:

pipe_standard = Pipeline([('scaler', StandardScaler()), ('linear', linear_model.LinearRegression())])
pipe_standard.fit(X_train, Y_train)
m_score1 = pipe_standard.score(X_test, Y_test)

las_model2 = pipe_las_standard = Pipeline([('scaler', StandardScaler()), ('linear', linear_model.Lasso())])
las_model2.fit(X_train, Y_train)
las_score2 = las_model2.score(X_test, Y_test)

print(f'Linear Model Score = {m_score1}| Lasso Model Score = {las_score2}')

Linear Model Score = 0.9461294441190606| Lasso Model Score = -4.522752905522509e-05


MinMax Scaler

In [74]:
pipe_minmax = Pipeline([('scaler', MinMaxScaler()),('linear', linear_model.LinearRegression())])
pipe_minmax.fit(X_train, Y_train)
m_score2 = pipe_minmax.score(X_test, Y_test)

m_score2

0.9462058702899372

Robust Scaler

In [75]:
pipe_robust = Pipeline([('scaler', RobustScaler()),('linear', linear_model.LinearRegression())])
pipe_minmax.fit(X_train, Y_train)
m_score3 = pipe_minmax.score(X_test, Y_test)
m_score3

0.9462058702899372

### Feature Engineering

In [77]:
from sklearn.feature_selection import SelectFromModel
new_linear_model = Pipeline([('scaler', MinMaxScaler()),('linear', linear_model.LinearRegression())])
new_linear_model.fit(X_train,Y_train)
new_linear_model.score(X_test,Y_test)

0.9462058702899372

In [98]:
importance = np.abs(new_linear_model.named_steps['linear'].coef_)
feature_names = np.array(new_linear_model.feature_names_in_)
feature_names
threshold = np.sort(importance)[-3] + 0.01
scal = MinMaxScaler()
new_x = scal.fit_transform(X)
new_y = scal.fit_transform(np.array(Y).reshape(-1,1))

sfm = SelectFromModel(new_linear_model, threshold=threshold).fit(new_x, new_y)


In [101]:
sfm.nam

<bound method SelectorMixin.get_feature_names_out of SelectFromModel(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                          ('linear', LinearRegression())]),
                threshold=15208.932577304025)>