In [1]:
import glob
import os

root_directory = os.path.join('..', 'files', 'input')

# glob.glob yields matches in an arbitrary, filesystem-dependent order, so the
# result is sorted before unpacking; otherwise the two paths could be swapped
# silently from one machine to another.
# NOTE(review): assumes the directory holds exactly two files and that the test
# file name sorts before the train file name (e.g. "test_*" < "train_*") — confirm.
test_data, train_data = sorted(glob.glob(f'{root_directory}/*'))

In [2]:
import pandas as pd

def load_data(path):
    """Read the CSV file located at ``path`` and return it as a DataFrame."""
    return pd.read_csv(path)

# Load both splits with the shared loader (train first, then test).
train, test = (load_data(path=csv_path) for csv_path in (train_data, test_data))

In [3]:
def data_cleaning(dataset, reference_year=2021):
    """Return a cleaned copy of ``dataset``; the input frame is not modified.

    Steps applied:
      * add an ``Age`` column computed as ``reference_year - Year``;
      * drop the ``Year`` and ``Car_Name`` columns;
      * cast ``Fuel_Type``, ``Selling_type`` and ``Transmission`` to the
        pandas ``category`` dtype.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing at least the columns ``Year``, ``Car_Name``,
        ``Fuel_Type``, ``Selling_type`` and ``Transmission``.
    reference_year : int, default 2021
        Year against which ``Age`` is measured. The default keeps the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with ``Age`` appended as the last column.
    """
    categorical_columns = ['Fuel_Type', 'Selling_type', 'Transmission']

    df = dataset.copy()

    # Vectorised subtraction instead of a per-row lambda.
    df['Age'] = reference_year - df['Year']
    df = df.drop(columns=['Year', 'Car_Name'])
    df = df.astype({column: 'category' for column in categorical_columns})

    return df

# Clean both splits; the names are rebound to the cleaned frames.
train, test = (data_cleaning(dataset=split) for split in (train, test))

In [4]:
def data_division(dataset):
    """Split ``dataset`` into a feature frame and the ``Present_Price`` target.

    The input frame is left untouched; the split is done on a copy.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` holds every column except ``Present_Price``
        and ``y`` is the ``Present_Price`` column.
    """
    features = dataset.copy()
    # pop removes the target column from the copy and hands it back.
    target = features.pop('Present_Price')
    return features, target

# Separate features and target for each split.
(x_train, y_train), (x_test, y_test) = (
    data_division(dataset=split) for split in (train, test)
)

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [6]:
# Columns carrying the pandas 'category' dtype are the ones that get one-hot encoded.
categorical_features = list(x_train.select_dtypes(include='category').columns)

column_transformer = ColumnTransformer(
    transformers=[
        (
            'categories',
            OneHotEncoder(handle_unknown='infrequent_if_exist'),
            categorical_features,
        ),
    ],
    # Every column not listed above goes through min-max scaling.
    remainder=MinMaxScaler(),
)

# preprocessing -> univariate feature selection -> linear regression
pipe = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('k_best_selector', SelectKBest(score_func=f_regression, k='all')),
        ('estimator', LinearRegression(n_jobs=-1)),
    ],
)

pipe.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [7]:
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error

y_pred_train = pipe.predict(x_train)
y_pred_test = pipe.predict(x_test)

# Report each metric for train and then test.
# NOTE: the rows labelled 'MAE' are computed with median_absolute_error
# (median, not mean, absolute error).
for label, metric in (
    ('R2', r2_score),
    ('MSE', mean_squared_error),
    ('MAE', median_absolute_error),
):
    print(f'{label} train', metric(y_true=y_train, y_pred=y_pred_train))
    print(f'{label} test', metric(y_true=y_test, y_pred=y_pred_test))

R2 train 0.8916962358587399
R2 test 0.7325716754123308
MSE train 5.874646280598205
MSE test 32.56667275386622
MAE train 1.092912344019556
MAE test 1.5033540603205786


In [8]:
# Pull the fitted preprocessing step back out of the pipeline.
columns_transformer = pipe.named_steps['preprocessor']

# Wrap the transformed training features in a DataFrame labelled with the
# transformer's output feature names.
x_transformed = pd.DataFrame(
    columns_transformer.transform(x_train),
    columns=columns_transformer.get_feature_names_out(),
)

# Column count of the cleaned train frame vs. the transformed feature frame.
print('columnas dataset original:', train.shape[1])
print('columnas dataset transformado:', x_transformed.shape[1])

columnas dataset original: 8
columnas dataset transformado: 11


In [9]:
kbest = pipe.named_steps['k_best_selector']
feature_names = columns_transformer.get_feature_names_out()

# One row per transformed feature: its F-score and its p-value rendered as a percentage.
anova = pd.DataFrame(
    {
        'feature': feature_names,
        'F-Score': kbest.scores_,
        'P-Value': [f'{p:.6%}' for p in kbest.pvalues_],
    }
)
# Highest F-score first, with a fresh 0..n-1 index.
anova = anova.sort_values(by='F-Score', ascending=False)
anova = anova.reset_index(drop=True)

anova

Unnamed: 0,feature,F-Score,P-Value
0,remainder__Selling_Price,772.690478,0.000000%
1,categories__Selling_type_Dealer,96.739984,0.000000%
2,categories__Selling_type_Individual,96.739984,0.000000%
3,categories__Fuel_Type_Diesel,75.717701,0.000000%
4,categories__Fuel_Type_Petrol,70.869838,0.000000%
5,categories__Transmission_Automatic,70.117638,0.000000%
6,categories__Transmission_Manual,70.117638,0.000000%
7,remainder__Driven_kms,26.69811,0.000055%
8,remainder__Owner,0.277102,59.916529%
9,categories__Fuel_Type_CNG,0.048443,82.601011%


In [10]:
from sklearn.model_selection import GridSearchCV

# Grid explored: the one-hot encoder's category cap (None or 2) and the number
# of features kept by the selector (1 through 10).
param_grid = dict(
    preprocessor__categories__max_categories=[None, 2],
    k_best_selector__k=[*range(1, 11)],
)

# 10-fold cross-validated search scored with negated mean absolute error;
# refit=True refits the best candidate once the search finishes.
model = GridSearchCV(
    pipe,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

model.fit(x_train, y_train)

Fitting 10 folds for each of 22 candidates, totalling 220 fits


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [11]:
print('mejores parámetros:', model.best_params_)

y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

print()
# Each metric is printed for train and then test.
# The 'MAE' rows are computed with median_absolute_error.
for label, metric in (
    ('R2', r2_score),
    ('MSE', mean_squared_error),
    ('MAE', median_absolute_error),
):
    print(f'{label} train', metric(y_true=y_train, y_pred=y_pred_train))
    print(f'{label} test', metric(y_true=y_test, y_pred=y_pred_test))
print()
# Score of the fitted search object on each split.
for split_name, features, target in (
    ('train', x_train, y_train),
    ('test', x_test, y_test),
):
    print(f'score {split_name}:', model.score(features, target))

mejores parámetros: {'k_best_selector__k': 10, 'preprocessor__categories__max_categories': 2}

R2 train 0.8915427894895408
R2 test 0.7321866445875325
MSE train 5.882969566028163
MSE test 32.61356072989005
MAE train 1.103915501642664
MAE test 1.53223568081905

score train: -1.6273604191514626
score test: -2.4782208214792565


In [12]:
import pickle
import gzip
import os

# Make sure the target folder exists before writing into it.
models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)

# Persist the fitted search object as a gzip-compressed pickle.
with gzip.open('../files/models/model.pkl.gz', 'wb') as compressed_file:
    pickle.dump(model, compressed_file)

In [13]:
import json

def calculate_metrics(modelo, x, y, tipo):
    """Build the metrics record of ``modelo`` evaluated on ``(x, y)``.

    Parameters
    ----------
    modelo : object exposing ``predict``
        Fitted model used to generate predictions for ``x``.
    x : features passed to ``modelo.predict``.
    y : true target values compared against the predictions.
    tipo : value stored under the ``'dataset'`` key (e.g. ``'train'``).

    Returns
    -------
    dict
        Keys, in order: ``type`` (always ``'metrics'``), ``dataset``, ``r2``,
        ``mse`` and ``mad`` (median absolute error).
    """
    predictions = modelo.predict(x)

    scorers = {
        'r2': r2_score,
        'mse': mean_squared_error,
        'mad': median_absolute_error,
    }

    metrics = {'type': 'metrics', 'dataset': tipo}
    for key, scorer in scorers.items():
        metrics[key] = scorer(y_true=y, y_pred=predictions)
    return metrics

train_metrics = calculate_metrics(modelo=model, x=x_train, y=y_train, tipo='train')
test_metrics = calculate_metrics(modelo=model, x=x_test, y=y_test, tipo='test')

metricas = [train_metrics, test_metrics]

os.makedirs('../files/output', exist_ok=True)

# One JSON object per line: the train record first, then the test record.
with open('../files/output/metrics.json', 'w') as metrics_file:
    metrics_file.writelines(json.dumps(metrica) + '\n' for metrica in metricas)