# Importamos librerías y datos limpios

In [2]:
import pandas as pd
import numpy as np
from time import time

from sklearn.model_selection        import train_test_split
from sklearn.model_selection        import GridSearchCV

from sklearn.metrics                import r2_score, mean_absolute_error, mean_squared_error, explained_variance_score
from sklearn.metrics                import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.linear_model           import LinearRegression
from sklearn.linear_model           import LogisticRegression
from sklearn.linear_model           import Ridge
from sklearn.linear_model           import Lasso
from sklearn.linear_model           import BayesianRidge
from sklearn.linear_model           import ElasticNet
from sklearn.linear_model           import GammaRegressor
from sklearn.linear_model           import Lars
from sklearn.linear_model           import SGDRegressor
from sklearn.linear_model           import LassoLars
from sklearn.linear_model           import OrthogonalMatchingPursuit
from sklearn.linear_model           import PassiveAggressiveRegressor
from sklearn.linear_model           import PoissonRegressor
from sklearn.linear_model           import RANSACRegressor

from sklearn.ensemble               import RandomForestRegressor
from sklearn.ensemble               import AdaBoostRegressor
from sklearn.ensemble               import BaggingRegressor
from sklearn.ensemble               import ExtraTreesRegressor
from sklearn.ensemble               import GradientBoostingRegressor
from sklearn.neighbors              import KNeighborsRegressor
from sklearn.tree                   import DecisionTreeRegressor
from sklearn.gaussian_process       import GaussianProcessRegressor
from sklearn.svm                    import SVR
from sklearn.svm                    import LinearSVR
from sklearn.svm                    import NuSVR

from lightgbm                       import LGBMRegressor
import xgboost as xgb

from sklearn                        import preprocessing
from sklearn.preprocessing          import StandardScaler
from sklearn.preprocessing          import PolynomialFeatures
from sklearn.preprocessing          import OneHotEncoder

import scipy.stats as stats
from scipy.stats import norm

randomstate = 42

In [3]:
def rangos_percentiles(columna, percentil_inferior, percentil_superior):
    # Funcion que devuelve la posicion de los percentiles inferiores y superiores indicados.
    limite_inferior = np.percentile(columna , percentil_inferior)
    limite_superior = np.percentile(columna , percentil_superior)    
    return limite_inferior , limite_superior

def onehotencodificar(df) :
    for c in list(df.columns[df.dtypes == 'object']):
        onehotencoder = preprocessing.OneHotEncoder()
        labels = [str(c)+ "_" + str(a) for a in list(df[c].unique())]
        ohe = onehotencoder.fit_transform(df[c].values.reshape(-1,1)).toarray()
        df[labels] = pd.DataFrame(ohe, index=df.index)
        df.drop(columns=c,inplace=True)

In [4]:
diamonds_train               = pd.read_csv("data/train.csv")
diamonds_X_to_predict        = pd.read_csv("data/test.csv")
diamonds_train.shape

(40455, 11)

In [5]:
d_temp = diamonds_train.drop(['id'], axis=1)
d_temp.loc[d_temp['x']==0, 'x'] = None
d_temp.loc[d_temp['y']==0, 'y'] = None
d_temp.loc[d_temp['z']==0, 'z'] = None

percentiles = {'carat'    : ( 1  , 99 ) , 
               'table'    : ( 1  , 99 ) , 
               'y'        : ( 1  , 99 ) , 
               'z'        : ( 0.25 , 99.75 ) }
for var,val in percentiles.items() :
    lim_lower,lim_upper = rangos_percentiles(d_temp[var],val[0],val[1])
    d_temp.loc[d_temp[var] <= lim_lower, var] = None
    d_temp.loc[d_temp[var] >= lim_upper, var] = None
d_temp.dropna(axis=0, how='any', inplace=True)
d_temp.shape

(38708, 10)

# Generación de X e y de entrenamiento y de test

In [6]:
X = d_temp.drop(['price'], axis=1)
#y = df_to_use['price']               # En caso de no querer el logaritmico
y = np.log(d_temp['price'])  # En caso de SI querer el logaritmico 

X_to_predict = diamonds_X_to_predict.drop(columns="id")


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=randomstate)

# Numerical Scaling & Categorical Encoding

In [8]:
onehotencodificar(X_train)
onehotencodificar(X_test)
onehotencodificar(X_to_predict)

In [10]:
scaler       = StandardScaler()
#scaler       = PolynomialFeatures(degree=2, interaction_only=True) #Me da peores resultados que la Standard.

X_train      = scaler.fit_transform(X_train)
X_test       = scaler.transform(X_test)
X_to_predict = scaler.transform(X_to_predict)

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.



In [15]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.479605,0.176405,1.772299,0.594161,0.672147,0.656620,-0.160218,3.238853,-0.825901,-0.594532,...,-0.32686,4.340194,-0.115549,-0.188175,-0.567480,-0.446746,-0.424920,1.826739,-0.268756,-0.317888
1,-1.072275,0.032500,0.784864,-1.290026,-1.261588,-1.266351,-0.160218,-0.308751,-0.825901,-0.594532,...,-0.32686,-0.230404,-0.115549,-0.188175,-0.567480,-0.446746,-0.424920,-0.547424,3.720844,-0.317888
2,-1.094766,0.967883,-0.696289,-1.373561,-1.335607,-1.266351,-0.160218,3.238853,-0.825901,-0.594532,...,-0.32686,-0.230404,-0.115549,-0.188175,1.762176,-0.446746,-0.424920,-0.547424,-0.268756,-0.317888
3,1.604156,0.248358,0.784864,1.429514,1.366071,1.437827,-0.160218,-0.308751,-0.825901,1.681995,...,-0.32686,-0.230404,-0.115549,-0.188175,-0.567480,-0.446746,-0.424920,1.826739,-0.268756,-0.317888
4,-0.869856,0.967883,0.784864,-0.965166,-1.002523,-0.890770,-0.160218,-0.308751,-0.825901,-0.594532,...,-0.32686,-0.230404,-0.115549,-0.188175,-0.567480,-0.446746,2.353387,-0.547424,-0.268756,-0.317888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30961,0.299677,0.248358,-0.202572,0.501344,0.440839,0.506388,-0.160218,-0.308751,1.210799,-0.594532,...,-0.32686,-0.230404,-0.115549,-0.188175,-0.567480,2.238410,-0.424920,-0.547424,-0.268756,-0.317888
30962,0.929426,-0.543120,0.291146,1.113936,1.032988,1.002154,-0.160218,-0.308751,-0.825901,1.681995,...,-0.32686,-0.230404,-0.115549,-0.188175,1.762176,-0.446746,-0.424920,-0.547424,-0.268756,-0.317888
30963,-0.195126,-0.974835,1.772299,0.009413,0.033737,-0.094541,-0.160218,3.238853,-0.825901,-0.594532,...,-0.32686,-0.230404,-0.115549,-0.188175,1.762176,-0.446746,-0.424920,-0.547424,-0.268756,-0.317888
30964,-0.622455,-0.758978,-0.696289,-0.510362,-0.475141,-0.575283,-0.160218,-0.308751,-0.825901,-0.594532,...,-0.32686,-0.230404,-0.115549,-0.188175,-0.567480,2.238410,-0.424920,-0.547424,-0.268756,-0.317888


In [11]:
X_train.shape

(30966, 26)

In [16]:
print(f"Dimensión dataset original = {diamonds_train.shape}")
print(f"Dimensión de X_train       = {X_train.shape}")
print(f"Dimensión de y_train       = {y_train.shape}")
print(f"Dimensión de X_test        = {X_test.shape}")
print(f"Dimensión de y_test        = {y_test.shape}")
print(f"Dimensión de X_to_predict  = {X_to_predict.shape}")


Dimensión dataset original = (40455, 11)
Dimensión de X_train       = (30966, 26)
Dimensión de y_train       = (30966,)
Dimensión de X_test        = (7742, 26)
Dimensión de y_test        = (7742,)
Dimensión de X_to_predict  = (13485, 26)


## Calculamos los regresores

In [56]:
Models = {"LGBMReg"                     : LGBMRegressor()             ,
          "ExtraTreesReg"               : ExtraTreesRegressor()       ,
          "Random_Forest_Regressor"     : RandomForestRegressor()     ,
          "XGB Regressor"               : xgb.XGBRegressor()          ,
          "Linear_Regression"           : LinearRegression()          ,
          "Gradient_Boosting_Regressor" : GradientBoostingRegressor() ,
          "Bagging_Regressor"           : BaggingRegressor()          ,
          "Ada_Boost_Regressor"         : AdaBoostRegressor()         ,
          "KNeighbors"                  : KNeighborsRegressor()       ,
          "DecisionTreeReg"             : DecisionTreeRegressor()     }

In [57]:
modelos=[]
for name, model in Models.items():
    print(f"---< {name} >-------------------")
    start = time()
    model.fit(X_train, y_train)
    train_time = time()
    print("   Training time: %0.3fs" % (train_time-start))
    y_pred = model.predict(X_test)
    predict_time = time()
    print("   Prediction time: %0.3fs" % (predict_time-train_time))
    y_pred_exp = np.exp(y_pred)
    y_test_exp = np.exp(y_test)
    print('\tR2 - ', r2_score(y_test, y_pred)                      , '\tR2 exp   - ', r2_score(y_test_exp, y_pred_exp))
    print('\tMAE - ', mean_absolute_error(y_test, y_pred)          , '\tMAE exp  - ', mean_absolute_error(y_test_exp, y_pred_exp))
    print('\tMSE - ', mean_squared_error(y_test, y_pred)           , '\tMSE exp  - ', mean_squared_error(y_test_exp, y_pred_exp))
    print('\tRMSE - ', np.sqrt(mean_squared_error(y_test, y_pred)) , '\tRMSE exp - ', np.sqrt(mean_squared_error(y_test_exp, y_pred_exp)))
    modelos.append({"name": name, 
                    "R2"  :r2_score(y_test_exp, y_pred_exp),
                    "MAE" :mean_absolute_error(y_test_exp, y_pred_exp),
                    "MSE" :mean_squared_error(y_test_exp, y_pred_exp),
                    "RMSE":np.sqrt(mean_squared_error(y_test_exp, y_pred_exp))
                   })


---< LGBMReg >-------------------
   Training time: 0.352s
   Prediction time: 0.021s
	R2 -  0.9918005876636407 	R2 exp   -  0.9807751053542346
	MAE -  0.06641342884936413 	MAE exp  -  271.46576516659957
	MSE -  0.008170048194953875 	MSE exp  -  282679.80256954284
	RMSE -  0.09038831890766569 	RMSE exp -  531.6764077609075
---< ExtraTreesReg >-------------------
   Training time: 11.699s
   Prediction time: 0.322s
	R2 -  0.9911809091297263 	R2 exp   -  0.9801710883929873
	MAE -  0.0666665213295926 	MAE exp  -  267.49010427914453
	MSE -  0.00878750750542266 	MSE exp  -  291561.1721947155
	RMSE -  0.09374170632873428 	RMSE exp -  539.9640471315803
---< Random_Forest_Regressor >-------------------
   Training time: 14.228s
   Prediction time: 0.245s
	R2 -  0.9915386469500501 	R2 exp   -  0.9801846241476521
	MAE -  0.06539631339651066 	MAE exp  -  269.34407689352037
	MSE -  0.00843105083349235 	MSE exp  -  291362.1446043565
	RMSE -  0.09182075382772868 	RMSE exp -  539.7797185930169
---< X

In [59]:
#resultados de la Scalar
modelos = pd.DataFrame(modelos)
modelos.sort_values(by="RMSE")

Unnamed: 0,name,R2,MAE,MSE,RMSE
3,XGB Regressor,0.981315,270.69499,274734.0,524.150759
0,LGBMReg,0.980775,271.465765,282679.8,531.676408
2,Random_Forest_Regressor,0.980185,269.344077,291362.1,539.779719
1,ExtraTreesReg,0.980171,267.490104,291561.2,539.964047
6,Bagging_Regressor,0.978831,282.547645,311264.2,557.910586
5,Gradient_Boosting_Regressor,0.965084,370.08311,513398.6,716.518361
9,DecisionTreeReg,0.964583,356.101252,520764.7,721.640319
4,Linear_Regression,0.961835,401.174279,561175.1,749.116203
8,KNeighbors,0.945717,442.374379,798162.3,893.399297
7,Ada_Boost_Regressor,0.859662,797.670853,2063508.0,1436.491659


In [46]:
#Resultados de la polinomica:
#modelos = pd.DataFrame(modelos)
#modelos.sort_values(by="RMSE")

Unnamed: 0,name,R2,MAE,MSE,RMSE
0,LGBMReg,0.981084,268.59756,278134.6,527.384663
2,Random_Forest_Regressor,0.980047,272.08929,293392.1,541.656801
1,ExtraTreesReg,0.980017,266.563693,293822.3,542.053739
3,XGB Regressor,0.979947,277.18227,294852.0,543.00278
6,Bagging_Regressor,0.978723,284.400691,312848.7,559.328791
4,Linear_Regression,0.969838,332.942365,443495.3,665.9544
5,Gradient_Boosting_Regressor,0.967637,356.810714,475864.1,689.82907
9,DecisionTreeReg,0.963018,365.465846,543779.0,737.413704
8,KNeighbors,0.952936,420.413122,692020.4,831.877642
7,Ada_Boost_Regressor,0.85877,802.567014,2076626.0,1441.050167


In [60]:
LGBMRegressor().get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq'])

In [61]:
lgbm_params = {"learning_rate": [0.01, 0.03, 0.05, 0.1, 0.5],
               "n_estimators": [500, 1000, 1500],
               "max_depth":[3,5,8]}

In [62]:
gridLGBM = GridSearchCV(LGBMRegressor(),
                        lgbm_params,
                        cv=10,
                        refit=True,
                        n_jobs=-1).fit(X_train,y_train)

In [63]:
print(gridLGBM.best_params_,'\n')
print(gridLGBM.best_estimator_,'\n')

{'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 1500} 

LGBMRegressor(learning_rate=0.05, max_depth=8, n_estimators=1500) 



In [64]:
BestModelLGBM = LGBMRegressor(**gridLGBM.best_params_)          

BestModelLGBM.fit(X_train, y_train)
y_pred = BestModelLGBM.predict(X_test)
y_pred_exp = np.exp(y_pred)
y_test_exp = np.exp(y_test)
print('\tR2 - ', r2_score(y_test, y_pred)                      , '\tR2 exp   - ', r2_score(y_test_exp, y_pred_exp))
print('\tMAE - ', mean_absolute_error(y_test, y_pred)          , '\tMAE exp  - ', mean_absolute_error(y_test_exp, y_pred_exp))
print('\tMSE - ', mean_squared_error(y_test, y_pred)           , '\tMSE exp  - ', mean_squared_error(y_test_exp, y_pred_exp))
print('\tRMSE - ', np.sqrt(mean_squared_error(y_test, y_pred)) , '\tRMSE exp - ', np.sqrt(mean_squared_error(y_test_exp, y_pred_exp)))


	R2 -  0.9926662521463276 	R2 exp   -  0.981950219210008
	MAE -  0.06109084058339214 	MAE exp  -  256.8475538345334
	MSE -  0.007307483872770599 	MSE exp  -  265401.1147604553
	RMSE -  0.08548382228685494 	RMSE exp -  515.1709568293377


In [65]:
y_predicted = model.predict(X_to_predict)
y_predicted_exp = np.exp(y_predicted)     #por que el precio está transformado logaritmicamente. Hay que deshacerlo


In [73]:
submision = pd.DataFrame(columns=['id','price']) 
submision['id']     = diamonds_X_to_predict['id']
submision['price']  = y_predicted_exp
submision.to_csv(  'data/submision_7_DaniHelguera.csv' , index = False)

In [74]:
submision

Unnamed: 0,id,price
0,0,3188.0
1,1,3095.0
2,2,3763.0
3,3,2797.0
4,4,5556.0
...,...,...
13480,13480,8400.0
13481,13481,15055.0
13482,13482,7509.0
13483,13483,552.0


In [75]:
submision.shape

(13485, 2)

In [None]:
Models = {
    "LR"                          : LinearRegression()                                           ,
    "Decision Tree"               : DecisionTreeRegressor(random_state = randomstate)            ,
    "Elastic Net"                 : ElasticNet(random_state = randomstate, tol=1e-3)             ,#copy_X=True, normalize=False, 
    "Extra Trees"                 : ExtraTreesRegressor(random_state = randomstate)              ,
    "Gradient Boosted"            : GradientBoostingRegressor(random_state = randomstate)        ,
    "KNN"                         : KNeighborsRegressor()                                        ,
    "Lars"                        : Lars(random_state = randomstate, normalize=False)            , #copy_X=True, 
    "Lasso"                       : Lasso(random_state = randomstate, tol=1e-3)                  , #copy_X=True, 
    "LinearSVR"                   : LinearSVR(random_state = randomstate)                        ,
    "MLPRegressor"                : MLPRegressor(random_state = randomstate)                     ,
    "Bayesian Ridge"              : BayesianRidge()                                              , 
    "Gamma Regressor"             : GammaRegressor()                                             ,
    "Lasso Lars"                  : LassoLars(random_state = randomstate)                        , #, copy_X=True, normalize=False
    "Nu SVR"                      : NuSVR()                                                      ,
    "Orthogonal Matching Pursuit" : OrthogonalMatchingPursuit()                                  ,
    "Passive Aggressive"          : PassiveAggressiveRegressor(random_state = randomstate)       ,
    "RANSAC"                      : RANSACRegressor(random_state = randomstate)                  ,
    "Ridge"                       : Ridge(random_state = randomstate, tol=1e-3, normalize=False) ,
    "SVR"                         : SVR()                                                        ,
    "XGB Regressor"               : xgb.XGBRegressor(random_state = randomstate)                 ,
    "Ada Boost"                   : AdaBoostRegressor(random_state=randomstate)                  ,
    "SGD Regressor"               : SGDRegressor(random_state=randomstate)                       ,
    "Random Forest Regressor"     : RandomForestRegressor(random_state=randomstate)              }