In [1]:
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import pickle

# train/test splitting and feature-scaling utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# to compute the metrics
import statsmodels.formula.api as smf


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn import tree


# project support helpers (cleaning); the parent directory is appended to the
# import path so the `src` package can be resolved from this notebook
import sys
sys.path.append("../")

import src.soporte as sp

# NOTE(review): no category is given, so every warning is silenced for the
# rest of the session
import warnings
warnings.filterwarnings('ignore')

# default size applied to every matplotlib figure
plt.rcParams["figure.figsize"] = (15,10)

In [2]:
# Load the training set; the first CSV column is used as the row index.
df_train2 = pd.read_csv("../data/train2.csv", index_col = 0)

In [3]:
# Preview the first three rows of the loaded data.
df_train2.head(3)

Unnamed: 0,id,carat,cut,color,clarity,x,y,z,price
0,0,0.3,Premium,D,SI2,4.31,4.28,2.68,6.353
1,1,1.01,Ideal,E,VVS2,6.42,6.46,4.04,9.183
2,2,0.72,Ideal,F,VS2,5.71,5.74,3.54,7.983


<h2>Estandarización</h2>

In [4]:
# Scaler used for the numeric features; created with default settings.
robust = RobustScaler()

In [5]:
# Learn the scaling statistics of the four numeric columns.
# NOTE(review): the fit uses the full dataset, before the train/test split
# done later in the notebook, so held-out rows influence the scaling —
# confirm this is acceptable.
robust.fit(df_train2[['carat', 'x', 'y', 'z']])

In [6]:
# Apply the fitted scaler to the same four columns it was fitted on.
X_robust = robust.transform(df_train2[['carat', 'x', 'y', 'z']])

In [7]:
# Work on a copy so the unscaled frame `df_train2` is left untouched.
df_train3 = df_train2.copy()

In [8]:
# Overwrite the four numeric columns with their scaled values, then preview.
df_train3[['carat', 'x', 'y', 'z']] = X_robust
df_train3.head()

Unnamed: 0,id,carat,cut,color,clarity,x,y,z,price
0,0,-0.625,Premium,D,SI2,-0.754098,-0.78022,-0.75,6.353
1,1,0.484375,Ideal,E,VVS2,0.398907,0.417582,0.464286,9.183
2,2,0.03125,Ideal,F,VS2,0.010929,0.021978,0.017857,7.983
3,3,0.59375,Very Good,G,SI2,0.464481,0.43956,0.535714,8.371
4,4,-0.53125,Premium,G,VS1,-0.650273,-0.631868,-0.625,6.588


In [9]:
# Persist the fitted scaler so the identical transformation can be reloaded
# and applied outside this notebook.
with open('../data/modelo_1/estandarizacion.pkl', 'wb') as scaler_file:
    pickle.dump(robust, scaler_file)

<h2>Encoding</h2>

In [10]:
# Separate copy for the encoding stage; `df_train3` keeps the scaled-only data.
df_train4 = df_train3.copy()

In [11]:
# Encode `cut` as integers. Per the preview further down, the code matches the
# position in the list (Ideal -> 0, Very Good -> 1, Premium -> 3) and is
# written to a new `cut_mapeada` column while `cut` itself is kept.
# NOTE(review): the meaning of the trailing `1` argument is defined in
# src.soporte and cannot be confirmed from this notebook.
# NOTE(review): the list is not the usual quality ranking of cuts —
# presumably the order was derived from the data; confirm.
df_train4 = sp.ordinal_encoder(df_train4, "cut", ["Ideal", "Very Good", "Good", "Premium", "Fair"], 1)


# Disabled alternative kept for reference:
# df_train4 = sp.ordinal_map_con(df_train4, "cut", ["Ideal", "Very Good", "Good", "Premium", "Fair"])

# NOTE(review): pickling a function stores only a reference to it, not the
# category mapping it applied.
# with open(f'../data/modelo_1/encoding_cut.pkl', 'wb') as s:
#         pickle.dump(sp.ordinal_map_con, s)

In [12]:
# Encode `color` as integers. Per the preview further down, the code matches
# the position in the list (E -> 0, D -> 1, F -> 2, G -> 3) and is written to
# a new `color_mapeada` column.
# NOTE(review): E is placed before D, so this is not plain alphabetical
# order — presumably derived from the data; confirm.
df_train4 = sp.ordinal_encoder(df_train4, "color", ["E", "D", "F", "G", "H", "I", "J"], 1)

# Disabled alternative kept for reference:
# df_train4 = sp.ordinal_map_con(df_train4, "color", ["E", "D", "F", "G", "H", "I", "J"])

# with open(f'../data/modelo_1/encoding_color.pkl', 'wb') as s:
#         pickle.dump(sp.ordinal_map_con, s)

In [13]:
# Encode `clarity` as integers. Per the preview further down, the code matches
# the position in the list (VVS2 -> 2, VS1 -> 3, VS2 -> 4, SI2 -> 7) and is
# written to a new `clarity_mapeada` column.
# NOTE(review): the list does not follow the usual clarity grading scale —
# presumably the order was derived from the data; confirm.
df_train4 = sp.ordinal_encoder(df_train4, "clarity", ["VVS1", "IF", "VVS2", "VS1", "VS2", "SI1", "I1", "SI2"], 1)

# Disabled alternative kept for reference:
# df_train4 = sp.ordinal_map_con(df_train4, "clarity", ["VVS1", "IF", "VVS2", "VS1", "VS2", "SI1", "I1", "SI2"])

# with open(f'../data/modelo_1/encoding_clarity.pkl', 'wb') as s:
#         pickle.dump(sp.ordinal_map_con, s)

In [14]:
# Check the three new `*_mapeada` columns next to the original categories.
df_train4.head(5)

Unnamed: 0,id,carat,cut,color,clarity,x,y,z,price,cut_mapeada,color_mapeada,clarity_mapeada
0,0,-0.625,Premium,D,SI2,-0.754098,-0.78022,-0.75,6.353,3,1,7
1,1,0.484375,Ideal,E,VVS2,0.398907,0.417582,0.464286,9.183,0,0,2
2,2,0.03125,Ideal,F,VS2,0.010929,0.021978,0.017857,7.983,0,2,4
3,3,0.59375,Very Good,G,SI2,0.464481,0.43956,0.535714,8.371,1,3,7
4,4,-0.53125,Premium,G,VS1,-0.650273,-0.631868,-0.625,6.588,3,3,3


Preparamos los datos para que se pueda entrenar nuestro modelo.

In [15]:
# Keep only the model inputs (scaled numeric columns plus the encoded
# categoricals) and the target. The previous full-frame copy was discarded
# immediately by the reassignment; copying the column selection instead gives
# an independent frame that can be modified without touching `df_train4`.
df = df_train4[
    ['carat', 'x', 'y', 'z', 'cut_mapeada', 'color_mapeada', 'clarity_mapeada', 'price']
].copy()

In [16]:
# Preview the modelling frame: seven predictors followed by `price`.
df.head(5)

Unnamed: 0,carat,x,y,z,cut_mapeada,color_mapeada,clarity_mapeada,price
0,-0.625,-0.754098,-0.78022,-0.75,3,1,7,6.353
1,0.484375,0.398907,0.417582,0.464286,0,0,2,9.183
2,0.03125,0.010929,0.021978,0.017857,0,2,4,7.983
3,0.59375,0.464481,0.43956,0.535714,1,3,7,8.371
4,-0.53125,-0.650273,-0.631868,-0.625,3,3,3,6.588


<h2>Decision Tree</h2>

In [17]:
# Separate the target from the predictors, then hold out 10 % of the rows for
# testing; the fixed seed keeps the partition repeatable.
y = df["price"]
X = df.drop(columns="price")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [18]:
# Summary statistics of the training target, to compare against the test split.
y_train.describe()

count    36394.000000
mean         7.785014
std          1.015915
min          5.787000
25%          6.851000
50%          7.782000
75%          8.579000
max          9.842000
Name: price, dtype: float64

In [19]:
# Summary statistics of the held-out target.
y_test.describe()

count    4044.000000
mean        7.768586
std         1.023411
min         5.823000
25%         6.848000
50%         7.739000
75%         8.586000
max         9.839000
Name: price, dtype: float64

In [20]:
# Baseline decision tree: no depth or split limits are set, only a fixed seed
regressor = DecisionTreeRegressor(random_state = 0) 
  
# fit the baseline on the training split
regressor.fit(X_train, y_train)

In [21]:
# Square root of the number of predictors, shown for reference.
# NOTE(review): this variable is not read again in the visible cells; the
# grids below list their own `max_features` candidates.
max_features = np.sqrt(len(X_train.columns))
max_features

2.6457513110645907

In [22]:
# Depth reached by the fitted baseline tree (no max_depth was set for it).
print(regressor.tree_.max_depth)

33


In [23]:
# Baseline tree predictions on both splits, used to compare train vs test error.
y_pred_test_dt = regressor.predict(X_test)
y_pred_train_dt = regressor.predict(X_train)

In [24]:
# Metrics table for the baseline tree; per the output below it holds one row
# per split with MAE, MSE, RMSE and R2, labelled with the model name.
# The stored output shows a near-zero train error against a clearly larger
# test error, i.e. the unconstrained tree overfits.
dt_results1 = sp.metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decission Tree I")
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.085443,0.015268,0.123562,0.985419,test,Decission Tree I
1,0.000389,3.2e-05,0.00564,0.999969,train,Decission Tree I


In [25]:
# Start the model-comparison table with the baseline tree results.
df_all = dt_results1.copy()

In [26]:
# Grid search for the decision tree
# ==============================================================================

# Hyperparameter values to evaluate (4 * 7 * 4 = 112 combinations).
param = {"max_depth": [5, 6, 7, 8],
        "min_samples_split": [10, 20, 30, 40, 50, 100, 150],
        "max_features": [3, 4, 5, 6]}

# The estimator gets a fixed random_state: every `max_features` candidate is
# smaller than the 7 predictors, so the tree draws a random feature subset at
# each split and the search would otherwise give different results per run.
# Scoring is negative MSE (higher is better) over 10 folds; train scores are
# kept so over- and under-fitting can be inspected in `cv_results_`.
gs = GridSearchCV(
            estimator=DecisionTreeRegressor(random_state=0),
            param_grid= param,
            cv=10,
            verbose=0,
            n_jobs = -1,
            return_train_score = True,
            scoring="neg_mean_squared_error")

In [27]:
%%time
# Run the cross-validated search over every combination in `param`.
gs.fit(X_train, y_train)

CPU times: total: 3.27 s
Wall time: 13.8 s


In [28]:
# Winning tree from the search; GridSearchCV refits it on the whole training
# split by default.
best_tree = gs.best_estimator_
best_tree

In [29]:
# Predictions of the tuned tree on both splits. `best_tree` is the refitted
# winner that `gs.predict` delegates to, so the values are the same.
y_pred_train_dt = best_tree.predict(X_train)
y_pred_test_dt = best_tree.predict(X_test)

In [30]:
# Metrics table for the tuned tree; in the stored output the train and test
# errors are close to each other, unlike the baseline tree.
dt_results2 = sp.metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decission Tree II")
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.106353,0.018774,0.137018,0.982071,test,Decission Tree II
1,0.103651,0.017778,0.133335,0.982774,train,Decission Tree II


In [31]:
# Rebuild the comparison table from its source frames rather than appending
# to `df_all` itself, so re-running this cell cannot duplicate rows.
df_all = pd.concat([dt_results1, dt_results2], axis=0)
df_all

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.085443,0.015268,0.123562,0.985419,test,Decission Tree I
1,0.000389,3.2e-05,0.00564,0.999969,train,Decission Tree I
0,0.106353,0.018774,0.137018,0.982071,test,Decission Tree II
1,0.103651,0.017778,0.133335,0.982774,train,Decission Tree II


<h2>Linear Regression</h2>

In [32]:
# Ordinary least-squares baseline trained on the same split as the trees.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [33]:
# Linear-model predictions on both splits.
y_pred_test = lr.predict(X_test)
y_pred_train = lr.predict(X_train)

In [34]:
# Side-by-side table of real vs predicted prices for the linear model; the
# scalar split label is broadcast to every row of its frame.
train_df = pd.DataFrame({'Real': y_train, 'Predicted': y_pred_train, 'Set': 'Train'})
test_df = pd.DataFrame({'Real': y_test, 'Predicted': y_pred_test, 'Set': 'Test'})
results = pd.concat([train_df, test_df])
results.head()

Unnamed: 0,Real,Predicted,Set
28182,7.143,7.344137,Train
29911,7.436,7.406115,Train
1045,6.39,6.510494,Train
5779,7.913,7.917343,Train
5219,9.605,9.587665,Train


In [35]:
# Metrics table for the linear regression, in the same layout as the tree tables.
lr_results = sp.metricas(y_test, y_train, y_pred_test, y_pred_train, "Regresion lineal")
lr_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.131956,0.03009,0.173464,0.971264,test,Regresion lineal
1,0.131524,0.030373,0.174277,0.970571,train,Regresion lineal


In [36]:
# Rebuild the comparison table from its source frames rather than appending
# to `df_all` itself, so re-running this cell cannot duplicate rows.
df_all = pd.concat([dt_results1, dt_results2, lr_results], axis=0)
df_all

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.085443,0.015268,0.123562,0.985419,test,Decission Tree I
1,0.000389,3.2e-05,0.00564,0.999969,train,Decission Tree I
0,0.106353,0.018774,0.137018,0.982071,test,Decission Tree II
1,0.103651,0.017778,0.133335,0.982774,train,Decission Tree II
0,0.131956,0.03009,0.173464,0.971264,test,Regresion lineal
1,0.131524,0.030373,0.174277,0.970571,train,Regresion lineal


<h2>Random Forest</h2>

In [38]:
# Hyperparameter values to evaluate for the random forest
# (4 * 5 * 4 = 80 combinations).
param = {"max_depth": [6, 7, 8, 9],
        "min_samples_split": [10, 20, 40, 50, 100],
        "max_features": [3, 4, 5, 6]}

# The forest gets a fixed random_state so the bootstrap samples and per-split
# feature draws are repeatable; without it the selected model and its metrics
# change between runs. Scoring is negative MSE over 10 folds.
bosque = GridSearchCV(
            estimator=RandomForestRegressor(random_state=0),
            param_grid= param,
            cv=10,
            verbose=0,
            n_jobs = -1,
            return_train_score = True,
            scoring="neg_mean_squared_error")

In [39]:
%%time
# Run the cross-validated search; the stored output reports about 7 minutes
# of wall time, so this is the slow cell of the notebook.
bosque.fit(X_train, y_train)

CPU times: total: 7.98 s
Wall time: 7min 16s


In [40]:
# Winning forest from the search; GridSearchCV refits it on the whole training
# split by default.
bos = bosque.best_estimator_
bos

In [41]:
# Predictions of the tuned forest on both splits.
y_pred_test_rf = bos.predict(X_test)
y_pred_train_rf = bos.predict(X_train)

In [42]:
# Metrics table for the tuned forest, in the same layout as the other models.
rf_results = sp.metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest")
rf_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.085834,0.012278,0.110805,0.988275,test,Random Forest
1,0.081408,0.011001,0.104887,0.98934,train,Random Forest


In [43]:
# Rebuild the comparison table from its source frames rather than appending
# to `df_all` itself, so re-running this cell cannot duplicate rows.
df_all = pd.concat([dt_results1, dt_results2, lr_results, rf_results], axis=0)
df_all

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.085443,0.015268,0.123562,0.985419,test,Decission Tree I
1,0.000389,3.2e-05,0.00564,0.999969,train,Decission Tree I
0,0.106353,0.018774,0.137018,0.982071,test,Decission Tree II
1,0.103651,0.017778,0.133335,0.982774,train,Decission Tree II
0,0.131956,0.03009,0.173464,0.971264,test,Regresion lineal
1,0.131524,0.030373,0.174277,0.970571,train,Regresion lineal
0,0.085834,0.012278,0.110805,0.988275,test,Random Forest
1,0.081408,0.011001,0.104887,0.98934,train,Random Forest


In [44]:
# Persist the fitted random-forest search under the "best model" path.
# NOTE(review): this stores the whole GridSearchCV object (`bosque`), not only
# the refitted winner `bos` — confirm that consumers of the file expect the
# search wrapper.
with open('../data/modelo_1/mejor_modelo.pkl', 'wb') as model_file:
    pickle.dump(bosque, model_file)