In [1]:
import numpy as np
import pandas as pd

from sklearn import compose, impute, linear_model, preprocessing
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import median_absolute_error, make_scorer

In [2]:
# Load the cleaned dataset, then separate the regression target ('Value')
# from the remaining columns, which serve as model features.
players = pd.read_csv('../data/data_cleaned.csv', encoding='utf-8')
X = players.drop(columns='Value')
y = players['Value']

In [3]:
def make_pipeline(regressor=None, features=None):
    """Create a single pipeline that processes the data and then fits the regressor.

    Parameters
    ----------
    regressor : estimator or None, default=None
        Object placed as the final ``'regressor'`` step of the pipeline.
    features : pandas.DataFrame or None, default=None
        Frame whose column dtypes decide which columns go to which
        transformer. When omitted, the notebook-level ``X`` is used, which
        keeps existing ``make_pipeline(regressor)`` calls working unchanged.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with two steps: ``'preprocessor'`` (a
        ``ColumnTransformer``) followed by ``'regressor'``.
    """
    if features is None:
        # Backward-compatible default: fall back to the notebook's feature frame.
        features = X

    # float64 columns: fill gaps with the column median, then standardise.
    numeric_features = list(features.select_dtypes(include='float64').columns)
    numeric_transformer = Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='median')),
        ('scaler', preprocessing.StandardScaler()),
    ])

    # object columns: fill gaps with the most frequent value, then one-hot
    # encode; categories unseen during fit are ignored at transform time.
    categorical_features = list(features.select_dtypes(include='object').columns)
    categorical_transformer = Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='most_frequent')),
        ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore')),
    ])

    # Columns that are neither float64 nor object are forwarded untouched.
    # NOTE(review): any integer/bool columns therefore reach the regressor
    # unscaled and un-imputed — confirm this is intended.
    preprocessor = compose.ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ],
        remainder='passthrough',
    )

    return Pipeline([('preprocessor', preprocessor), ('regressor', regressor)])

In [4]:
# Lasso regression as the final estimator of the preprocessing pipeline.
# NOTE(review): tol=0.05 looks like a deliberately loose stopping tolerance,
# presumably to help the solver converge faster — confirm.
regressor = linear_model.Lasso(tol=0.05)
pipeline = make_pipeline(regressor=regressor)

In [5]:
# Hold out 33% of the rows for the final evaluation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [6]:
# Fit preprocessing and regressor on the training split only; the trailing
# semicolon suppresses the estimator repr in the cell output.
pipeline.fit(X=X_train, y=y_train);

In [7]:
# Median absolute error on the rows the pipeline was fitted on.
y_pred = pipeline.predict(X_train)
medae_value_train = median_absolute_error(y_true=y_train, y_pred=y_pred)
print(f"{medae_value_train:.4f} medae on train dataset")

334132.9289 medae on train dataset


In [8]:
# 5-fold cross-validated median absolute error on the training split.
# The scorer is built with make_scorer's defaults, so the values shown are
# the raw (positive) errors: fine for reporting, but it must not be reused
# as-is for model selection, where a larger score is treated as better.
median_absolute_error_scorer = make_scorer(median_absolute_error)
cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring=median_absolute_error_scorer,
    cv=5,
)

array([378951.20310401, 341964.20698475, 452514.99080652, 343672.33816559,
       367634.56565546])

In [9]:
# Median absolute error on the held-out test split. The result is stored
# under its own name so the train-set metric computed earlier is not
# overwritten by the test value.
y_test_pred = pipeline.predict(X_test)
medae_value_test = median_absolute_error(y_test, y_test_pred)
print(f"{medae_value_test:.4f} medae on test dataset")

355870.7667 medae on test dataset
