<a href="https://colab.research.google.com/github/AdamMcCloskey381/RTA_project/blob/main/MLPRegressor_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
### import potrzebnych bibliotek
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

In [30]:
### załadowanie danych
df = pd.DataFrame()
for i in range(118, 1555):
  try:
    df1 = pd.read_csv("biegi_{}.csv".format(i))
    df = df.append(df1)
  except:
    pass

In [31]:
### zastąpienie części braków danych
df["Klub"].isnull().sum()
df["Klub"] = df["Klub"].replace(np.nan, "Brak klubu")

In [32]:
### usunięcie pozostałych braków danych
df = df.dropna()

In [33]:
df.isnull().sum()

Miejsce        0
Nr             0
Nazwisko       0
Imię           0
Miasto         0
Klub           0
Rocznik        0
Kategoria      0
Czas netto     0
Czas brutto    0
Data           0
Rok            0
Miesiąc        0
Płeć           0
dtype: int64

In [79]:
from datetime import date
df["New_Klub"] = 0
df.loc[df["Klub"] != "Brak klubu", "New_Klub"] = 1
df["Wiek"] = date.today().year - df["Rocznik"]

In [35]:
df_final = df.loc[:, ["Czas netto", "Miesiąc", "New_Klub", "Wiek", "Płeć"]]
df_final["Miesiąc"], df_final["Płeć"] = df_final["Miesiąc"].astype("category"), df_final["Płeć"].astype("category") 
df_final["New_Klub"], df_final["Wiek"] = df_final["New_Klub"].astype("float64"), df_final["Wiek"].astype("float64")

In [36]:
num_var = ["Wiek"]
cat_var = ["Miesiąc", "Płeć"]

num_trans = Pipeline(steps = [("scaler", StandardScaler())])
cat_trans = Pipeline(steps = [("Encoder", OneHotEncoder())])

preproc = ColumnTransformer(transformers = [("num_preproc", num_trans, num_var), ("cat_preproc", cat_trans, cat_var)])

In [67]:
pipeline = Pipeline(steps = [("preproc", preproc), ("Model", MLPRegressor())])

from sklearn import set_config
set_config(display = "diagram")
pipeline

In [59]:
x = df_final.loc[:, df_final.columns != "Czas netto"]
y = df_final.loc[:, df_final.columns == "Czas netto"]

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)

In [60]:
y_train = np.ravel(y_train)
pipeline.fit(x_train, y_train)

In [61]:
pred = pipeline.predict(x_test)
pred

array([27.3383619 , 22.53040908, 22.88605738, ..., 21.91386608,
       26.94254788, 27.94717151])

In [62]:
### MSE
from math import sqrt
error = []
y_test = np.ravel(y_test)
for i in range(len(pred)):
  error.append((pred[i] - y_test[i])**2)
sum(error)/len(y_test)

20.40087168037028

In [63]:
### MAE
error = []
y_test = np.ravel(y_test)
for i in range(len(pred)):
  error.append(sqrt((pred[i] - y_test[i])**2))
sum(error)/len(y_test)

3.4335558761305114

In [64]:
### R^2
r2_score(y_test, pred)

0.2539177832695271

In [71]:
from sklearn.model_selection import GridSearchCV

grid_params = [
               {"Model__hidden_layer_sizes": [(4, ), (4, 2), (8, ), (8, 2)],
                "Model__activation": ["relu", "logistic"],
                "Model__learning_rate": ["constant", "adaptive"],
                "Model__learning_rate_init": [0.01, 0.001],
                "Model__max_iter": [200, 400]}
]

In [72]:
grid_search = GridSearchCV(pipeline, grid_params, cv = 10, verbose = True, n_jobs = -1)
grid_search.fit(x_train, y_train)
grid_search.best_params_

Fitting 10 folds for each of 64 candidates, totalling 640 fits


{'Model__activation': 'logistic',
 'Model__hidden_layer_sizes': (4, 2),
 'Model__learning_rate': 'adaptive',
 'Model__learning_rate_init': 0.001,
 'Model__max_iter': 400}

In [73]:
grid_search.best_params_["Model__hidden_layer_sizes"]

(4, 2)

In [74]:
pipeline_tuned = Pipeline(steps = [("preproc", preproc), ("Model", MLPRegressor(activation = grid_search.best_params_["Model__activation"],
                                                                                hidden_layer_sizes = grid_search.best_params_["Model__hidden_layer_sizes"],
                                                                                learning_rate = grid_search.best_params_["Model__learning_rate"],
                                                                                learning_rate_init=grid_search.best_params_["Model__learning_rate_init"],
                                                                                max_iter = grid_search.best_params_["Model__max_iter"]))])

In [75]:
pipeline_tuned.fit(x_train, y_train)
pred = pipeline_tuned.predict(x_test)

In [76]:
### MSE
from math import sqrt
error = []
y_test = np.ravel(y_test)
for i in range(len(pred)):
  error.append((pred[i] - y_test[i])**2)
sum(error)/len(y_test)

20.270683963178943

In [77]:
### MAE
error = []
y_test = np.ravel(y_test)
for i in range(len(pred)):
  error.append(sqrt((pred[i] - y_test[i])**2))
sum(error)/len(y_test)

3.434622175205162

In [78]:
r2_score(y_test, pred)

0.258678890645474