In [1]:
import os
import json
import gzip
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


# Paso 1.
print('Paso 1...')
# Preprocese los datos.
# - Cree la columna 'Age' a partir de la columna 'Year'.
#   Asuma que el año actual es 2021.
# - Elimine las columnas 'Year' y 'Car_Name'.
df_train = pd.read_csv('../files/input/train_data.csv.zip', index_col=False, compression="zip")
df_test = pd.read_csv('../files/input/test_data.csv.zip', index_col=False, compression="zip")

df_train['Age'] = 2021 - df_train['Year']
df_test['Age'] = 2021 - df_test['Year']

df_train.drop(columns=['Year', 'Car_Name'], inplace=True)
df_test.drop(columns=['Year', 'Car_Name'], inplace=True)

# Eliminamos los registros con informacion no disponible
df_train = df_train.dropna()
df_test = df_test.dropna()

# Paso 2.
print('Paso 2...')
# Divida los datasets en x_train, y_train, x_test, y_test.
x_train = df_train.drop(columns=['Present_Price'])
y_train = df_train['Present_Price']

x_test = df_test.drop(columns=['Present_Price'])
y_test = df_test['Present_Price']


Paso 1...
Paso 2...


In [2]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Selling_Price  211 non-null    float64
 1   Driven_kms     211 non-null    int64  
 2   Fuel_Type      211 non-null    object 
 3   Selling_type   211 non-null    object 
 4   Transmission   211 non-null    object 
 5   Owner          211 non-null    int64  
 6   Age            211 non-null    int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 11.7+ KB


In [3]:
x_train.describe()

Unnamed: 0,Selling_Price,Driven_kms,Owner,Age
count,211.0,211.0,211.0,211.0
mean,4.692512,35578.009479,0.047393,7.35545
std,4.819333,28912.475577,0.271907,2.794843
min,0.1,1200.0,0.0,3.0
25%,1.025,15000.0,0.0,5.0
50%,3.75,32000.0,0.0,7.0
75%,6.05,47500.0,0.0,9.0
max,23.5,213000.0,3.0,18.0


In [4]:
y_train.head()

0    8.500
1    4.600
2    0.826
3    4.430
4    1.500
Name: Present_Price, dtype: float64

In [5]:
y_train.info()

<class 'pandas.core.series.Series'>
RangeIndex: 211 entries, 0 to 210
Series name: Present_Price
Non-Null Count  Dtype  
--------------  -----  
211 non-null    float64
dtypes: float64(1)
memory usage: 1.8 KB


In [6]:
x_test.head()

Unnamed: 0,Selling_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,4.75,43000,Diesel,Dealer,Manual,0,8
1,7.25,6900,Petrol,Dealer,Manual,0,4
2,2.85,5200,Petrol,Dealer,Manual,0,10
3,6.75,18796,Petrol,Dealer,Manual,0,6
4,6.5,33429,Diesel,Dealer,Manual,0,6


In [7]:
# Paso 3.
print('Paso 3...')
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Escala las variables numéricas al intervalo [0, 1].
# - Selecciona las K mejores entradas.
# - Ajusta un modelo de regresion lineal.


# Creamos el transformer
transformer = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(dtype="int"), ['Fuel_Type', 'Selling_type', 'Transmission']),
    ],
    # remainder=MinMaxScaler(),
)

# Creamos el pipeline
pipeline = Pipeline(
    steps =[
        ('transformer', transformer),
        ('feature_selection', SelectKBest(score_func=f_regression)),
        ('scaler', MinMaxScaler()),
        ('linearregression', LinearRegression()),
    ],
    verbose=False,
)

Paso 3...


In [8]:
# Paso 4.
print('Paso 4...')
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use el error medio absoluto
# para medir el desempeño modelo.
params = {
    'feature_selection__k': [1],
}

grid = GridSearchCV(pipeline, params, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1, refit=True)



Paso 4...


In [9]:
grid.fit(x_train, y_train)
x_train.info()
print('Mejores hiperparametros:', grid.best_params_)
print('score_test:', grid.score(x_test, y_test))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Selling_Price  211 non-null    float64
 1   Driven_kms     211 non-null    int64  
 2   Fuel_Type      211 non-null    object 
 3   Selling_type   211 non-null    object 
 4   Transmission   211 non-null    object 
 5   Owner          211 non-null    int64  
 6   Age            211 non-null    int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 11.7+ KB
Mejores hiperparametros: {'feature_selection__k': 1}
score_test: -3.9356753907191857


In [10]:
model = grid

In [11]:
model

In [12]:
model.estimator

In [13]:
df_train.head(n=5)

Unnamed: 0,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age
0,7.4,8.5,15059,Petrol,Dealer,Automatic,0,5
1,4.0,4.6,30000,Petrol,Dealer,Manual,0,8
2,0.5,0.826,6000,Petrol,Individual,Manual,0,10
3,3.15,4.43,15000,Petrol,Dealer,Manual,0,5
4,1.25,1.5,15000,Petrol,Individual,Manual,0,8


In [14]:
df_train.Transmission.value_counts()

Transmission
Manual       182
Automatic     29
Name: count, dtype: int64

In [15]:
x_train.shape

(211, 7)

In [16]:
def _load_grading_data():
    """Load grading data"""
    with open("../files/grading/x_train.pkl", "rb") as file:
        x_train = pickle.load(file)

    with open("../files/grading/y_train.pkl", "rb") as file:
        y_train = pickle.load(file)

    with open("../files/grading/x_test.pkl", "rb") as file:
        x_test = pickle.load(file)

    with open("../files/grading/y_test.pkl", "rb") as file:
        y_test = pickle.load(file)

    return x_train, y_train, x_test, y_test

In [17]:
x_train_0, y_train_0, x_test_0, y_test_0 = _load_grading_data()

In [18]:
x_train_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Selling_Price  211 non-null    float64
 1   Driven_kms     211 non-null    int64  
 2   Fuel_Type      211 non-null    object 
 3   Selling_type   211 non-null    object 
 4   Transmission   211 non-null    object 
 5   Owner          211 non-null    int64  
 6   Age            211 non-null    int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 11.7+ KB


In [19]:
x_test_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Selling_Price  90 non-null     float64
 1   Driven_kms     90 non-null     int64  
 2   Fuel_Type      90 non-null     object 
 3   Selling_type   90 non-null     object 
 4   Transmission   90 non-null     object 
 5   Owner          90 non-null     int64  
 6   Age            90 non-null     int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 5.0+ KB
