In [1]:
import numpy as np
import pandas as pd
from seaborn import pairplot
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from pickle import dump
from pathlib import Path

In [2]:
df = pd.read_csv("../data/final_cutted.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,GOLDEARNED,TOTALMINIONSKILLED,KILLS,ASSISTS,DEATHS,VISIONSCORE,TOTALDAMAGEDEALTTOCHAMPIONS,P_MATCH_ID,WIN,CHAMPION,PUUID,SUMMONERNAME,GAMEVERSION
0,0,0.0,5218.0,16.0,1.0,6.0,6.0,28.0,4500.0,BR1_2304032235_utility,False,Bard,UNNl1KcPO98UoXiuRpQBefEKJbtCF_80b0_2s0Cwa5FiYi...,batata 12121212,11.13.382.1241
1,1,1.0,7515.0,29.0,1.0,19.0,1.0,40.0,7716.0,BR1_2304032235_utility,True,Blitzcrank,w2DLeo91qdfD72dpGgapMOKh_4IZ9IMF29neabiS0QTe8W...,love yourseIf,11.13.382.1241
2,2,2.0,9197.0,47.0,5.0,5.0,5.0,17.0,9696.0,BR1_2304032235_jungle,False,Nocturne,wDtmVguiopT93yrxtv2L88LxAVWC8E2fj_F3FDW81nCuSU...,NTM HACKER,11.13.382.1241
3,3,3.0,10564.0,37.0,6.0,8.0,4.0,12.0,15291.0,BR1_2304032235_jungle,True,Kayn,zVKtTZrdKVIpXwIMlsuSQjwOgqxx0DMhnWDFL7MrAKxXZq...,unsuri,11.13.382.1241
4,4,4.0,10598.0,158.0,6.0,8.0,7.0,17.0,20568.0,BR1_2303451507_top,False,Pantheon,sTevUOXxKjNW7dpbtyu9wjn8KZxzN63_f2MfGc1EALDjtq...,Nome e Numeros,11.13.382.1241


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   10000 non-null  int64  
 1   index                        10000 non-null  float64
 2   GOLDEARNED                   10000 non-null  float64
 3   TOTALMINIONSKILLED           10000 non-null  float64
 4   KILLS                        10000 non-null  float64
 5   ASSISTS                      10000 non-null  float64
 6   DEATHS                       10000 non-null  float64
 7   VISIONSCORE                  10000 non-null  float64
 8   TOTALDAMAGEDEALTTOCHAMPIONS  10000 non-null  float64
 9   P_MATCH_ID                   10000 non-null  object 
 10  WIN                          10000 non-null  bool   
 11  CHAMPION                     10000 non-null  object 
 12  PUUID                        10000 non-null  object 
 13  SUMMONERNAME     

In [5]:
numeric_features = ['TOTALMINIONSKILLED', 'KILLS', 
                    'ASSISTS', 'DEATHS', 'VISIONSCORE', 'TOTALDAMAGEDEALTTOCHAMPIONS']
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")), 
        ("scaler", StandardScaler()),
    ]
)

In [6]:
categorical_features = ["WIN"]
categorical_transformer = Pipeline(
    steps=[
        ("onehot-encoder", OneHotEncoder(drop="first", 
                                         handle_unknown="ignore")),
    ]
)

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)

In [8]:
regressor = Ridge()
tr_regressor = TransformedTargetRegressor(regressor=regressor,
                                          transformer=MinMaxScaler())
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        ("regressor", tr_regressor)
    ]
)

In [9]:
train, test = train_test_split(df, test_size=0.3)

In [10]:
train.head()

Unnamed: 0.1,Unnamed: 0,index,GOLDEARNED,TOTALMINIONSKILLED,KILLS,ASSISTS,DEATHS,VISIONSCORE,TOTALDAMAGEDEALTTOCHAMPIONS,P_MATCH_ID,WIN,CHAMPION,PUUID,SUMMONERNAME,GAMEVERSION
8242,8242,8242.0,7461.0,32.0,1.0,8.0,6.0,10.0,6704.0,KR_5303143306_jungle,False,Nunu,oDXRHbz27jQ5au1fcH22ykT6ax4TFwzkSPHuPEsrlhvOkd...,정글프로게이머,11.13.382.1241
5224,5224,5224.0,9588.0,172.0,3.0,1.0,9.0,13.0,15872.0,EUW1_5361454164_top,False,Garen,O369ICwrR1tphNru4_oJJJyUllLW6cU8MfezuVfvShvOWG...,OTZ H ι r o,11.14.385.9967
3772,3772,3772.0,14060.0,25.0,13.0,7.0,6.0,10.0,22069.0,EUW1_5394292187_jungle,True,XinZhao,DjWSmxmXzTUwvhQ8thSB8rEuqm4TTqFwYwv3nCtt8pP7P_...,nie moge spać,11.15.389.2308
4390,4390,4390.0,4836.0,30.0,1.0,3.0,5.0,22.0,8295.0,EUW1_5389744822_utility,False,Brand,9nv8dhtY7oL6xd3rWkgh41Dv3sy1o7TbWpul1SCOwh0c7x...,Hungry4di,11.15.389.2308
3861,3861,3861.0,4911.0,25.0,1.0,6.0,0.0,18.0,3906.0,KR_5281016392_utility,True,Blitzcrank,xQbsgLgQUzXkMN65_UQw5-Gbhoi8R33GvMmol6ADLYZvIF...,뮤탈리스크,11.13.382.1241


In [11]:
X_train = train[['TOTALMINIONSKILLED', 'KILLS', 
                    'ASSISTS', 'DEATHS', 'VISIONSCORE', 'TOTALDAMAGEDEALTTOCHAMPIONS', 'WIN']]
Y_train = train[['GOLDEARNED']]

In [12]:
X_test = test[['TOTALMINIONSKILLED', 'KILLS', 
                    'ASSISTS', 'DEATHS', 'VISIONSCORE', 'TOTALDAMAGEDEALTTOCHAMPIONS', 'WIN']]
Y_test = test[['GOLDEARNED']]

Подберем коэффициент регуляризации для линейной регрессии.

In [13]:
param_grid = {
    "regressor__regressor__alpha": np.linspace(0.0, 3.0, num=31),
}

In [14]:
search_cv = GridSearchCV(pipe, param_grid)

In [15]:
search_cv.fit(X_train, Y_train)

In [16]:
print("Best params:")
print(search_cv.best_params_)

Best params:
{'regressor__regressor__alpha': 3.0}


In [17]:
search_cv.score(X_train, Y_train)

0.8719264139355215

In [18]:
search_cv.score(X_test, Y_test)

0.8688367386770199

Сохраним полученный конвейер обработки данных для дальнейшего использования.

In [19]:
Path("../models").mkdir(parents=True, exist_ok=True)

with open("../models/pipeline.pkl", "wb") as file:
    dump(search_cv, file)