<a href="https://colab.research.google.com/github/AdamMcCloskey381/RTA_project/blob/main/MLPRegressor_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [104]:
### import potrzebnych bibliotek
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.compose import ColumnTransformer

In [105]:
### załadowanie danych
df = pd.DataFrame()
for i in range(118, 1555):
  try:
    df1 = pd.read_csv("biegi_{}.csv".format(i))
    df = df.append(df1)
  except:
    pass

In [106]:
### zastąpienie części braków danych
df["Klub"].isnull().sum()
df["Klub"] = df["Klub"].replace(np.nan, "Brak klubu")

In [107]:
### usunięcie pozostałych braków danych
df = df.dropna()

In [108]:
df.isnull().sum()

Miejsce        0
Nr             0
Nazwisko       0
Imię           0
Miasto         0
Klub           0
Rocznik        0
Kategoria      0
Czas netto     0
Czas brutto    0
Data           0
Rok            0
Miesiąc        0
Płeć           0
dtype: int64

In [109]:
### zostawiamy klub (binarna), rocznik- przeskalować na wiek, miesiąc (parametryzacja), płeć(binarna); zmienna celu- czas netto
from datetime import date
df["New_Klub"], df["Wiek"]= 0, 0
for i in range(df.shape[0]):
  df.iloc[i, df.columns.tolist().index("New_Klub")] = 0 if df.iloc[i, df.columns.tolist().index("Klub")] == "Brak klubu" else 1
  df.iloc[i, df.columns.tolist().index("Wiek")] = date.today().year - df.iloc[i, df.columns.tolist().index("Rocznik")]  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [131]:
df_final = df.loc[:, ["Czas netto", "Miesiąc", "New_Klub", "Wiek", "Płeć"]]
df_final["Miesiąc"], df_final["Płeć"] = df_final["Miesiąc"].astype("category"), df_final["Płeć"].astype("category") 
df_final["New_Klub"], df_final["Wiek"] = df_final["New_Klub"].astype("float64"), df_final["Wiek"].astype("float64")

In [134]:
num_var = ["Wiek"]
cat_var = ["Miesiąc", "Płeć"]

num_trans = Pipeline(steps = [("scaler", StandardScaler())])
cat_trans = Pipeline(steps = [("Encoder", OneHotEncoder())])

preproc = ColumnTransformer(transformers = [("num_preproc", num_trans, num_var), ("cat_preproc", cat_trans, cat_var)])

In [136]:
pipeline = Pipeline(steps = [("preproc", preproc), ("Model", MLPRegressor())])

from sklearn import set_config
set_config(display = "diagram")
pipeline

In [138]:
x = df_final.loc[:, df_final.columns != "Czas netto"]
y = df_final.loc[:, df_final.columns == "Czas netto"]

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)

In [140]:
y_train = np.ravel(y_train)
pipeline.fit(x_train, y_train)

In [141]:
pred = pipeline.predict(x_test)
pred

array([27.24631687, 22.3612508 , 22.86151913, ..., 21.6041272 ,
       26.75123772, 27.81960517])

In [155]:
### MSE
from math import sqrt
error = []
y_test = np.ravel(y_test)
for i in range(len(pred)):
  error.append((pred[i] - y_test[i])**2)
sum(error)/len(y_test)

20.42976486756695

In [156]:
### MAE
error = []
y_test = np.ravel(y_test)
for i in range(len(pred)):
  error.append(sqrt((pred[i] - y_test[i])**2))
sum(error)/len(y_test)

3.432429177485628

In [158]:
from sklearn.model_selection import GridSearchCV

grid_params = [
               {"Model__hidden_layer_sizes": [(4, ), (4, 2), (4, 3), (8, ), (8, 2), (8, 3)],
                "Model__activation": ["relu", "logistic", "tanh"],
                "Model__solver": ["sgd", "adam"]}
]

In [159]:
grid_search = GridSearchCV(pipeline, grid_params, cv = 10, verbose = True, n_jobs = -1)
grid_search.fit(x_train, y_train)
grid_search.best_params_

Fitting 10 folds for each of 36 candidates, totalling 360 fits




{'Model__activation': 'logistic',
 'Model__hidden_layer_sizes': (8,),
 'Model__solver': 'adam'}

In [162]:
grid_search.best_params_["Model__hidden_layer_sizes"]

(8,)

In [163]:
pipeline_tuned = Pipeline(steps = [("preproc", preproc), ("Model", MLPRegressor(activation = grid_search.best_params_["Model__activation"],
                                                                                hidden_layer_sizes = grid_search.best_params_["Model__hidden_layer_sizes"],
                                                                                solver = grid_search.best_params_["Model__solver"]))])

In [164]:
pipeline_tuned.fit(x_train, y_train)
pred = pipeline_tuned.predict(x_test)



In [168]:
### MSE
from math import sqrt
error = []
y_test = np.ravel(y_test)
for i in range(len(pred)):
  error.append((pred[i] - y_test[i])**2)
sum(error)/len(y_test)

20.347287237729105

In [169]:
### MAE
error = []
y_test = np.ravel(y_test)
for i in range(len(pred)):
  error.append(sqrt((pred[i] - y_test[i])**2))
sum(error)/len(y_test)

3.4401999362146247