# Construcción de modelos

In [27]:
# Bibliotecas de Datos
import numpy as np # linear algebra
import pandas as pd # data processing

# Biblioteca de Visualizacion
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

# Bibliotecas Extras
import re

In [28]:
def preparar_motor(pl):
    """Extract the first numeric token from a raw ``motor`` value.

    Every character that is not a digit or a dot is treated as a separator;
    the first remaining token is returned.

    Parameters
    ----------
    pl : str
        Raw content of the ``motor`` column, e.g. ``"2.0 TSI"``. ``None``/NaN
        are accepted and treated as missing; other non-string values are
        converted with ``str()`` first.

    Returns
    -------
    str or float
        The first numeric token as a string (the caller casts the column with
        ``astype(float)``), or ``np.nan`` when the value is missing, contains
        no digits, or the token is not a valid number (e.g. ``"."``).
    """
    if not isinstance(pl, str):
        # None or NaN (NaN is the only value that differs from itself).
        if pl is None or pl != pl:
            return np.nan
        pl = str(pl)

    # split() with no argument also swallows the runs of blanks that the
    # substitution leaves behind, so no explicit strip is needed.
    tokens = re.sub('[^0-9.]', ' ', pl).split()
    if not tokens:
        return np.nan

    token = tokens[0]
    try:
        # Validation only: tokens such as "." or "1.2.3" would otherwise make
        # the later float cast of the whole column fail.
        float(token)
    except ValueError:
        return np.nan
    return token

El propósito de estas líneas es identificar y analizar los valores atípicos (outliers) en los datos. Los outliers son valores que difieren significativamente del patrón general de los datos y pueden tener un impacto en los análisis y modelos posteriores. Al examinar los outliers, se pueden tomar decisiones informadas sobre cómo manejarlos: eliminarlos, tratarlos de manera especial o aplicar técnicas específicas de mitigación.

In [30]:
# Load the raw listings. The converter forces every 'motor' cell through str(),
# so the column is read as text and empty cells arrive as '' instead of NaN.
df_cars = pd.read_csv("../Data/argentina_cars.csv", converters={'motor': str})
# Preview the first rows.
df_cars.head()

Unnamed: 0,money,brand,model,year,color,fuel_type,door,gear,motor,body_type,kilometres,currency
0,10350000,Toyota,Corolla Cross,2022,Plateado,Nafta,5.0,Automática,,SUV,500,pesos
1,10850000,Jeep,Compass,2022,Blanco,Nafta,5.0,Automática,2.4,SUV,500,pesos
2,35500,Jeep,Compass,2022,Gris oscuro,Nafta,5.0,Automática,2.4,SUV,500,dólares
3,19000,Citroën,C4 Cactus,2022,Gris oscuro,Nafta,5.0,Automática,,SUV,550,dólares
4,5800000,Toyota,Corolla,2019,Gris,Nafta,4.0,Manual,1.8,Sedán,9000,pesos


In [31]:
# Column dtypes and non-null counts, to spot columns with missing values.
df_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   money       510 non-null    int64  
 1   brand       510 non-null    object 
 2   model       510 non-null    object 
 3   year        510 non-null    int64  
 4   color       499 non-null    object 
 5   fuel_type   510 non-null    object 
 6   door        510 non-null    float64
 7   gear        509 non-null    object 
 8   motor       510 non-null    object 
 9   body_type   509 non-null    object 
 10  kilometres  510 non-null    int64  
 11  currency    510 non-null    object 
dtypes: float64(1), int64(3), object(8)
memory usage: 47.9+ KB


In [32]:
# Number of distinct values per column.
df_cars.nunique()

money         343
brand          27
model         157
year           20
color          14
fuel_type       4
door            4
gear            2
motor          30
body_type      10
kilometres    247
currency        2
dtype: int64

In [33]:
# Preprocessing: express every price in the same currency (US dollars).
df_cars_2 = df_cars.copy()

valor_dolar = 172.72  # pesos per dollar, rate as of 16/12/2022

# Vectorised conversion: prices listed in pesos are divided by the exchange
# rate and rounded to whole dollars; every other row keeps its price as-is.
df_cars_2['money'] = np.where(
    df_cars_2['currency'] == 'pesos',
    np.round(df_cars_2['money'] / valor_dolar),
    df_cars_2['money'],
)

# The currency column carries no information once all prices share one unit.
df_cars_2 = df_cars_2.drop(columns=['currency'])

In [34]:
# Frequency of each fuel type.
df_cars_2['fuel_type'].value_counts()

fuel_type
Nafta            416
Diésel            72
Nafta/GNC         19
Híbrido/Nafta      3
Name: count, dtype: int64

In [35]:
# Frequency of each gearbox type.
df_cars_2['gear'].value_counts()

gear
Manual        298
Automática    211
Name: count, dtype: int64

### Analizando Outliers

In [36]:
# Listings priced above 110,000 (prices already converted to dollars).
df_cars_2.loc[df_cars_2['money'].gt(110000)]

Unnamed: 0,money,brand,model,year,color,fuel_type,door,gear,motor,body_type,kilometres
54,185000.0,Mercedes-Benz,Clase E,2019,Gris,Nafta,4.0,Automática,3.0,Sedán,18594
134,115700.0,BMW,Serie M,2017,Gris,Nafta,2.0,Automática,3.0,Coupé,11000
400,128000.0,Ford,F-150,2020,Blanco,Diésel,4.0,Automática,2.0,Pick-Up,52900
403,235000.0,Audi,R8 Coupé,2011,Blanco,Nafta,2.0,Automática,5.2,Coupé,19200
461,430000.0,Audi,R8 Coupé,2020,Azul,Nafta,2.0,Automática,5.2,Coupé,3000


In [37]:
# Listings with more than 230,000 km on the odometer.
df_cars_2.loc[df_cars_2['kilometres'].gt(230000)]

Unnamed: 0,money,brand,model,year,color,fuel_type,door,gear,motor,body_type,kilometres
250,10599.0,Mercedes-Benz,Clase C,2008,Plateado,Nafta,4.0,Automática,2.2,Sedán,250000
333,53555.0,Mercedes-Benz,Sprinter,2018,Blanco,Diésel,2.0,Manual,2.1,Minivan,242000
361,16300.0,Ford,F-100,1997,Gris,Diésel,2.0,Manual,2.5,Pick-Up,335000


In [38]:
# Listings from model year 2000 or earlier.
df_cars_2.loc[df_cars_2['year'].le(2000)]

Unnamed: 0,money,brand,model,year,color,fuel_type,door,gear,motor,body_type,kilometres
208,16000.0,BMW,X5,2000,,Nafta,5.0,Manual,4.4,SUV,132000
361,16300.0,Ford,F-100,1997,Gris,Diésel,2.0,Manual,2.5,Pick-Up,335000
440,10000.0,Volvo,960,1995,Gris,Nafta,5.0,Manual,3.0,Rural,125000
467,11290.0,Peugeot,504,1996,,Diésel,4.0,Manual,2.3,Sedán,99000


## Preparación de datos

In [39]:
# Bibliotecas de Machine Learning
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor

In [40]:
# Work on a copy so the cleaning, encoding and filtering below leave df_cars_2 untouched.
df_cars_3 = df_cars_2.copy()

In [41]:
# Reduce each raw 'motor' string to its first numeric token, then cast the
# whole column to float (values without a number become NaN).
df_cars_3['motor'] = df_cars_3['motor'].map(preparar_motor).astype(float)

In [42]:
# Encode each categorical column as integer codes.
# One LabelEncoder is kept per column so that the category -> code mapping of
# every column stays available (transform / inverse_transform) after this cell;
# refitting a single encoder would keep only the last column's mapping.
encoders = {}
for col in ['color', 'fuel_type', 'gear', 'body_type']:
    encoders[col] = preprocessing.LabelEncoder()
    df_cars_3[col] = encoders[col].fit_transform(df_cars_3[col])

# `le` is kept for backward compatibility: as before, it ends up fitted on 'body_type'.
le = encoders['body_type']

# 'brand' and 'model' are intentionally left unencoded here.
# NOTE(review): df_cars.info() shows missing values in 'color', 'gear' and
# 'body_type'. LabelEncoder appears to encode them as an extra class instead of
# leaving NaN, so a later dropna() will not remove those rows — confirm this is intended.

In [43]:
# Remove outliers in a single pass: keep listings priced below 100,000,
# with fewer than 250,000 km, and from model year 2000 onwards.
# (A `motor <= 6` filter exists in the notebook history but is disabled.)
df_cars_3 = df_cars_3.loc[
    lambda d: (d['money'] < 100000)
    & (d['kilometres'] < 250000)
    & (d['year'] >= 2000)
]
df_cars_3.shape

(501, 11)

In [44]:
# Modelling table: discard the brand/model/body_type columns, then drop every
# row that still has a missing value.
df_cars_4 = (
    df_cars_3
    .drop(columns=['brand', 'model', 'body_type'])
    .dropna()
)

In [45]:
# Sanity check: missing values remaining per column (expected to be all zero).
df_cars_4.isnull().sum()

money         0
year          0
color         0
fuel_type     0
door          0
gear          0
motor         0
kilometres    0
dtype: int64

In [46]:
# Target: price. Features: every remaining column, as a plain NumPy array.
y = df_cars_4['money']
X = df_cars_4.drop(columns=['money']).to_numpy()

In [47]:
# Rescale every feature to the [0, 1] range (min-max normalisation).
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split performed in the next cell, so test rows influence the
# min/max used for scaling. Fitting on the training rows only would avoid
# that leakage; it needs the split to happen first.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

In [48]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

## Entrenando los modelos

In [49]:
import pickle

In [50]:
# Model A: decision tree regressor (seeded), fitted on the training split
# and used to predict the held-out rows.
modelA = DecisionTreeRegressor(random_state=10).fit(X_train, y_train)
y_pred = modelA.predict(X_test)

In [51]:
# Evaluate model A (decision tree) on the held-out split.
# Predictions are recomputed here because `y_pred` is a shared name that every
# model cell overwrites; this keeps the metrics tied to modelA even when cells
# are executed out of order.
y_pred = modelA.predict(X_test)

mse = round(metrics.mean_squared_error(y_test, y_pred), 5)
rmse = round(np.sqrt(mse), 3)
r2_value = round(metrics.r2_score(y_test, y_pred), 5)

# For a regressor, .score() returns R-squared (coefficient of determination),
# not a classification accuracy, so it is labelled accordingly.
print("R-squared on Training set: ", modelA.score(X_train, y_train))
print("R-squared on Testing set: ", modelA.score(X_test, y_test))
print('\n')
print('MSE: {}'.format(mse))
print('Root Mean Squared Error: {}'.format(rmse))
print('R-squared: {}'.format(r2_value))

Accuracy on Traing set:  0.9999917546364435
Accuracy on Testing set:  0.6701657806319894


MSE: 87680416.78049
Root Mean Squared Error: 9363.782
R-squared: 0.67017


In [53]:
# Model B: random forest regressor with trees capped at depth 9 (seeded),
# fitted on the training split and used to predict the held-out rows.
modelB = RandomForestRegressor(max_depth=9, random_state=10).fit(X_train, y_train)
y_pred = modelB.predict(X_test)

In [54]:
# Evaluate model B (random forest) on the held-out split.
# Predictions are recomputed here because `y_pred` is a shared name that every
# model cell overwrites; this keeps the metrics tied to modelB even when cells
# are executed out of order.
y_pred = modelB.predict(X_test)

mse = round(metrics.mean_squared_error(y_test, y_pred), 5)
rmse = round(np.sqrt(mse), 3)
r2_value = round(metrics.r2_score(y_test, y_pred), 5)

# For a regressor, .score() returns R-squared (coefficient of determination),
# not a classification accuracy, so it is labelled accordingly.
print("R-squared on Training set: ", modelB.score(X_train, y_train))
print("R-squared on Testing set: ", modelB.score(X_test, y_test))
print('\n')
print('MSE: {}'.format(mse))
print('Root Mean Squared Error: {}'.format(rmse))
print('R-squared: {}'.format(r2_value))

Accuracy on Traing set:  0.9500068744885101
Accuracy on Testing set:  0.8270391539614242


MSE: 45978489.12228
Root Mean Squared Error: 6780.744
R-squared: 0.82704


In [55]:
# Model C: 5-nearest-neighbours regressor with distance-weighted votes,
# fitted on the training split and used to predict the held-out rows.
modelC = KNeighborsRegressor(n_neighbors=5, weights='distance').fit(X_train, y_train)
y_pred = modelC.predict(X_test)

In [56]:
# Evaluate model C (k-nearest neighbours) on the held-out split.
# Predictions are recomputed here because `y_pred` is a shared name that every
# model cell overwrites; this keeps the metrics tied to modelC even when cells
# are executed out of order.
y_pred = modelC.predict(X_test)

mse = round(metrics.mean_squared_error(y_test, y_pred), 5)
rmse = round(np.sqrt(mse), 3)
r2_value = round(metrics.r2_score(y_test, y_pred), 5)

# For a regressor, .score() returns R-squared (coefficient of determination),
# not a classification accuracy, so it is labelled accordingly.
print("R-squared on Training set: ", modelC.score(X_train, y_train))
print("R-squared on Testing set: ", modelC.score(X_test, y_test))
print('\n')
print('MSE: {}'.format(mse))
print('Root Mean Squared Error: {}'.format(rmse))
print('R-squared: {}'.format(r2_value))

Accuracy on Traing set:  0.9999917546364435
Accuracy on Testing set:  0.7433625600079621


MSE: 68222386.81934
Root Mean Squared Error: 8259.684
R-squared: 0.74336


## Guardando modelos

In [57]:
# Persist the trained random forest to a .pkl file.
# modelB is the RandomForestRegressor, so it is the object that belongs in
# the file named after the random forest.
# NOTE(review): the fitted MinMaxScaler is not saved in any visible cell;
# whoever loads this model will need the same scaling — confirm it is stored elsewhere.
with open("random_forest_model.pkl", "wb") as model_file:
    pickle.dump(modelB, model_file)

In [58]:
# Persist the trained k-nearest-neighbours regressor to a .pkl file.
# modelC is the KNeighborsRegressor, so it is the object that belongs in
# the file named after the KNeighbors model.
with open("KNeighbors_regressor.pkl", "wb") as model_file:
    pickle.dump(modelC, model_file)

In [59]:
# Persist the trained decision tree regressor to a .pkl file.
# modelA is the DecisionTreeRegressor, so it is the object that belongs in
# the file named after the decision tree.
with open("DecisionTree_regressor.pkl", "wb") as model_file:
    pickle.dump(modelA, model_file)