In [2]:
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from xgboost import XGBRegressor

pipe = Pipeline([('classifier', KNeighborsClassifier)])

%matplotlib inline

In [21]:
# Load the unlabelled frame (no `price` column in the preview below) that the
# final predictions are made on, and preview its first rows.
diamonds_test = pd.read_csv('diamonds_test_rob_scl.csv')
diamonds_test.head()

Unnamed: 0,city,cut,color,clarity,carat,depth,table
0,-1.0,2,-0.333333,-0.666667,0.140625,0.6,1.0
1,0.666667,0,1.0,0.0,0.78125,-0.533333,0.0
2,-0.5,1,0.333333,-0.666667,1.359375,0.266667,1.333333
3,-0.5,2,-0.333333,-0.666667,0.3125,1.333333,-1.0
4,-1.0,2,-0.333333,0.0,-0.3125,0.733333,0.333333


In [22]:
# Load the labelled frame (features plus the `price` target) and preview it.
diamonds_train = pd.read_csv('diamonds_train_rob_scl.csv')
diamonds_train.head()

Unnamed: 0,city,cut,color,clarity,carat,depth,table,price
0,-0.666667,1,1.0,0.333333,0.796875,0.4,0.333333,4268
1,-0.5,2,0.333333,0.333333,-0.59375,0.8,0.0,505
2,-0.333333,-2,0.0,0.0,0.015625,2.466667,-0.666667,2686
3,-0.5,-1,-1.0,-0.666667,-0.453125,1.333333,-0.333333,738
4,-0.666667,0,0.0,-0.666667,0.5,-0.866667,0.666667,4882


In [23]:
# Target is the `price` column; the features are every remaining column.
y = diamonds_train['price']
X = diamonds_train.drop(columns='price')

In [48]:
# Hold out 5% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible. `train_test_split` is already imported in the notebook's
# first cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Sanity-check the shapes and container types of the four pieces.
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (38432, 7), X_test: (2023, 7), y_train: (38432,), y_test: (2023,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [49]:
# Summary statistics of the training features, rounded to two decimals.
X_train.describe().round(2)

Unnamed: 0,city,cut,color,clarity,carat,depth,table
count,38432.0,38432.0,38432.0,38432.0,38432.0,38432.0,38432.0
mean,-0.04,0.55,-0.13,-0.05,0.15,-0.03,0.15
std,0.63,1.03,0.57,0.58,0.74,0.96,0.74
min,-1.0,-2.0,-1.0,-1.33,-0.78,-12.53,-4.67
25%,-0.5,0.0,-0.67,-0.67,-0.47,-0.53,-0.33
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.5,1.0,0.33,0.33,0.53,0.47,0.67
max,1.0,2.0,1.0,1.0,5.94,11.47,12.67


In [50]:
# Summary statistics of the training target, rounded to two decimals.
y_train.describe().round(2)

count    38432.00
mean      3920.68
std       3983.43
min        326.00
25%        945.00
50%       2394.00
75%       5312.00
max      18823.00
Name: price, dtype: float64

In [51]:
# Summary statistics of the held-out features, rounded to two decimals.
X_test.describe().round(2)

Unnamed: 0,city,cut,color,clarity,carat,depth,table
count,2023.0,2023.0,2023.0,2023.0,2023.0,2023.0,2023.0
mean,-0.04,0.58,-0.14,-0.06,0.17,-0.03,0.16
std,0.63,1.05,0.57,0.58,0.76,0.93,0.75
min,-1.0,-2.0,-1.0,-1.33,-0.77,-4.2,-2.3
25%,-0.5,0.0,-0.67,-0.67,-0.47,-0.53,-0.33
50%,0.0,0.0,0.0,0.0,0.02,0.07,0.0
75%,0.5,1.0,0.33,0.33,0.56,0.47,0.67
max,1.0,2.0,1.0,1.0,4.22,4.47,4.33


In [52]:
# Summary statistics of the held-out target, rounded to two decimals.
y_test.describe().round(2)

count     2023.00
mean      4075.99
std       4157.72
min        326.00
25%        950.00
50%       2442.00
75%       5590.50
max      18791.00
Name: price, dtype: float64

## Models

In [53]:
# Base XGBoost regressor handed to the grid search below; the seed is fixed
# for reproducibility. XGBRegressor is imported in the notebook's import cell.
xgb_model = XGBRegressor(random_state=42)

In [54]:
# Hyperparameter grid to search: every combination of the values below is
# evaluated (3 * 3 * 2 * 4 = 72 candidates).
search_space = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 6, 9],
    gamma=[0.01, 0.1],
    learning_rate=[0.001, 0.01, 0.1, 1],
)

In [55]:
# Build the grid search over `search_space` with 5-fold cross-validation.
# Two metrics are recorded per fold; the final model is refit using the
# candidate that ranks best on r2.
# Valid scorer names can be listed with sklearn.metrics.get_scorer_names().
GS = GridSearchCV(
    estimator=xgb_model,
    param_grid=search_space,
    scoring=['r2', 'neg_root_mean_squared_error'],
    refit='r2',
    cv=5,
    verbose=4,
)

In [56]:
# Run the search on the training split only. This is the expensive cell:
# the log below reports 72 candidates x 5 folds = 360 fits.
GS.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-5010.961) r2: (test=-0.621) total time=   0.1s
[CV 2/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-5099.459) r2: (test=-0.652) total time=   0.1s
[CV 3/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-5205.369) r2: (test=-0.647) total time=   0.1s
[CV 4/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-5033.934) r2: (test=-0.628) total time=   0.1s
[CV 5/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-5104.119) r2: (test=-0.622) total time=   0.1s
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-4569.116) r2: (test=-0.34

[CV 2/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-1339.385) r2: (test=0.886) total time=   0.2s
[CV 3/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-1374.873) r2: (test=0.885) total time=   0.2s
[CV 4/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-1287.652) r2: (test=0.893) total time=   0.2s
[CV 5/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-1333.505) r2: (test=0.889) total time=   0.2s
[CV 1/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=500; neg_root_mean_squared_error: (test=-815.272) r2: (test=0.957) total time=   0.7s
[CV 2/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=500; neg_root_mean_squared_error: (test=-858.615) r2: (test=0.953) total time=   0.7s
[CV 3/5] END gamma=0.01, learning_rate=0.01, max_depth

[CV 5/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=500; neg_root_mean_squared_error: (test=-559.844) r2: (test=0.980) total time=   0.7s
[CV 1/5] END gamma=0.01, learning_rate=0.1, max_depth=6, n_estimators=100; neg_root_mean_squared_error: (test=-538.880) r2: (test=0.981) total time=   0.2s
[CV 2/5] END gamma=0.01, learning_rate=0.1, max_depth=6, n_estimators=100; neg_root_mean_squared_error: (test=-559.019) r2: (test=0.980) total time=   0.3s
[CV 3/5] END gamma=0.01, learning_rate=0.1, max_depth=6, n_estimators=100; neg_root_mean_squared_error: (test=-550.588) r2: (test=0.982) total time=   0.3s
[CV 4/5] END gamma=0.01, learning_rate=0.1, max_depth=6, n_estimators=100; neg_root_mean_squared_error: (test=-535.862) r2: (test=0.982) total time=   0.4s
[CV 5/5] END gamma=0.01, learning_rate=0.1, max_depth=6, n_estimators=100; neg_root_mean_squared_error: (test=-525.709) r2: (test=0.983) total time=   0.3s
[CV 1/5] END gamma=0.01, learning_rate=0.1, max_depth=6, n_estim

[CV 3/5] END gamma=0.01, learning_rate=1, max_depth=6, n_estimators=200; neg_root_mean_squared_error: (test=-686.476) r2: (test=0.971) total time=   0.5s
[CV 4/5] END gamma=0.01, learning_rate=1, max_depth=6, n_estimators=200; neg_root_mean_squared_error: (test=-668.910) r2: (test=0.971) total time=   0.5s
[CV 5/5] END gamma=0.01, learning_rate=1, max_depth=6, n_estimators=200; neg_root_mean_squared_error: (test=-662.862) r2: (test=0.973) total time=   0.5s
[CV 1/5] END gamma=0.01, learning_rate=1, max_depth=6, n_estimators=500; neg_root_mean_squared_error: (test=-707.216) r2: (test=0.968) total time=   1.6s
[CV 2/5] END gamma=0.01, learning_rate=1, max_depth=6, n_estimators=500; neg_root_mean_squared_error: (test=-705.024) r2: (test=0.968) total time=   1.5s
[CV 3/5] END gamma=0.01, learning_rate=1, max_depth=6, n_estimators=500; neg_root_mean_squared_error: (test=-705.233) r2: (test=0.970) total time=   1.5s
[CV 4/5] END gamma=0.01, learning_rate=1, max_depth=6, n_estimators=500; neg

[CV 1/5] END gamma=0.1, learning_rate=0.001, max_depth=9, n_estimators=100; neg_root_mean_squared_error: (test=-4989.897) r2: (test=-0.607) total time=   0.4s
[CV 2/5] END gamma=0.1, learning_rate=0.001, max_depth=9, n_estimators=100; neg_root_mean_squared_error: (test=-5076.949) r2: (test=-0.637) total time=   0.4s
[CV 3/5] END gamma=0.1, learning_rate=0.001, max_depth=9, n_estimators=100; neg_root_mean_squared_error: (test=-5180.806) r2: (test=-0.632) total time=   0.4s
[CV 4/5] END gamma=0.1, learning_rate=0.001, max_depth=9, n_estimators=100; neg_root_mean_squared_error: (test=-5016.167) r2: (test=-0.617) total time=   0.4s
[CV 5/5] END gamma=0.1, learning_rate=0.001, max_depth=9, n_estimators=100; neg_root_mean_squared_error: (test=-5080.928) r2: (test=-0.607) total time=   0.4s
[CV 1/5] END gamma=0.1, learning_rate=0.001, max_depth=9, n_estimators=200; neg_root_mean_squared_error: (test=-4526.922) r2: (test=-0.323) total time=   1.0s
[CV 2/5] END gamma=0.1, learning_rate=0.001, m

[CV 4/5] END gamma=0.1, learning_rate=0.01, max_depth=9, n_estimators=200; neg_root_mean_squared_error: (test=-956.848) r2: (test=0.941) total time=   1.1s
[CV 5/5] END gamma=0.1, learning_rate=0.01, max_depth=9, n_estimators=200; neg_root_mean_squared_error: (test=-980.260) r2: (test=0.940) total time=   1.1s
[CV 1/5] END gamma=0.1, learning_rate=0.01, max_depth=9, n_estimators=500; neg_root_mean_squared_error: (test=-543.095) r2: (test=0.981) total time=   2.9s
[CV 2/5] END gamma=0.1, learning_rate=0.01, max_depth=9, n_estimators=500; neg_root_mean_squared_error: (test=-555.330) r2: (test=0.980) total time=   2.9s
[CV 3/5] END gamma=0.1, learning_rate=0.01, max_depth=9, n_estimators=500; neg_root_mean_squared_error: (test=-548.958) r2: (test=0.982) total time=   2.9s
[CV 4/5] END gamma=0.1, learning_rate=0.01, max_depth=9, n_estimators=500; neg_root_mean_squared_error: (test=-541.915) r2: (test=0.981) total time=   3.0s
[CV 5/5] END gamma=0.1, learning_rate=0.01, max_depth=9, n_estim

[CV 2/5] END gamma=0.1, learning_rate=1, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-624.969) r2: (test=0.975) total time=   0.1s
[CV 3/5] END gamma=0.1, learning_rate=1, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-607.117) r2: (test=0.978) total time=   0.1s
[CV 4/5] END gamma=0.1, learning_rate=1, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-590.608) r2: (test=0.978) total time=   0.1s
[CV 5/5] END gamma=0.1, learning_rate=1, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-579.325) r2: (test=0.979) total time=   0.1s
[CV 1/5] END gamma=0.1, learning_rate=1, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-576.304) r2: (test=0.979) total time=   0.3s
[CV 2/5] END gamma=0.1, learning_rate=1, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-612.315) r2: (test=0.976) total time=   0.3s
[CV 3/5] END gamma=0.1, learning_rate=1, max_depth=3, n_estimators=200; neg_root_m

In [57]:
# Show the refit best model with its full parameter listing.
print(GS.best_estimator_) # To get the complete details of the best model

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.01, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...)


In [58]:
# Show only the searched hyperparameters of the winning candidate.
print(GS.best_params_) # to get only the best hyperparameter values that we searched for

{'gamma': 0.01, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200}


In [59]:
# Mean cross-validated score of the winner for the refit metric ('r2').
print(GS.best_score_) # score according to the metric we passed in refit

0.9817200001483405


In [36]:
# Tabulate every candidate's cross-validation results, best r2 rank first,
# and persist the table for later inspection.
df = pd.DataFrame(GS.cv_results_).sort_values(by='rank_test_r2')
df.to_csv('cv_results.csv')

In [37]:
# Keep the raw cross-validation results dictionary at hand.
results = GS.cv_results_

# Report the best hyperparameter combination found by the search.
best_params = GS.best_params_
print("Mejores hiperparámetros:", best_params)


Mejores hiperparámetros: {'gamma': 0.01, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200}


In [38]:
# Take the estimator that the grid search refit with the winning parameters.
best_model = GS.best_estimator_

# Score it on the held-out split: predict, then derive RMSE from the MSE.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print("RMSE en datos de prueba:", rmse)


RMSE en datos de prueba: 540.9210137299019


In [39]:
# Predict prices for the unlabelled diamonds_test frame with the best model
# from the grid search, then display the resulting array.
y_prediction = best_model.predict(diamonds_test)
y_prediction

array([2853.467 , 5742.2393, 9678.099 , ..., 3133.5227, 2004.9507,
        827.8709], dtype=float32)

In [40]:
# Pair every prediction with its row position, which serves as the id.
solution = list(enumerate(y_prediction))

In [41]:
# Turn the (position, prediction) pairs into a two-column frame and show it.
# The columns are still the default 0 and 1 at this point.
submission = pd.DataFrame(solution)
submission

Unnamed: 0,0,1
0,0,2853.467041
1,1,5742.239258
2,2,9678.098633
3,3,3978.034180
4,4,1635.021729
...,...,...
13480,13480,1670.307007
13481,13481,2437.840576
13482,13482,3133.522705
13483,13483,2004.950684


In [42]:
# Give the two columns their submission names.
submission.columns=["id", "price"]

In [43]:
# Display only: the re-indexed copy is not assigned, so `submission` itself
# is left unchanged by this cell.
submission.reset_index(drop=True)

Unnamed: 0,id,price
0,0,2853.467041
1,1,5742.239258
2,2,9678.098633
3,3,3978.034180
4,4,1635.021729
...,...,...
13480,13480,1670.307007
13481,13481,2437.840576
13482,13482,3133.522705
13483,13483,2004.950684


## Tuning

In [62]:
# Rebuild an XGBoost regressor with explicit hyperparameters and fit it on
# the training split.
xgb_model = XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    gamma=0.01,
).fit(X_train, y_train)

# Evaluate on the held-out split: mean squared error, then its square root.
y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

# Report the result.
print("Root Mean Squared Error:", rmse)


Root Mean Squared Error: 546.6108546046999


In [66]:
# Predict prices for the unlabelled diamonds_test frame with the refit model
# and display the resulting array.
y_prediction = xgb_model.predict(diamonds_test)
y_prediction

array([2908.215 , 5707.517 , 9634.991 , ..., 3171.6257, 2051.3054,
        813.3031], dtype=float32)

In [67]:
# Pair every prediction with its row position, which serves as the id.
solution = list(enumerate(y_prediction))

# Build the two-column submission frame and name its columns.
submission = pd.DataFrame(solution)
submission.columns = ["id", "price"]

# Display only: the re-indexed copy is not assigned back.
submission.reset_index(drop=True)

Unnamed: 0,id,price
0,0,2908.215088
1,1,5707.517090
2,2,9634.991211
3,3,3947.952148
4,4,1644.677246
...,...,...
13480,13480,1627.846069
13481,13481,2367.106934
13482,13482,3171.625732
13483,13483,2051.305420


In [68]:
# Write the current submission frame to CSV without the index column.
# `index=False` belongs to `to_csv`, not to `Path`: Path defines no such
# keyword, and passing keywords to it is deprecated and slated for removal.
# Path is imported in the notebook's import cell.
filepath = Path('rob_scl.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists
submission.to_csv(filepath, index=False, encoding='utf-8')

In [61]:
# Alternative configuration: same depth and learning rate, but a larger gamma
# and fewer trees. Fit it on the training split.
xgb_model = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    gamma=0.1,
).fit(X_train, y_train)

# Evaluate on the held-out split: mean squared error, then its square root.
y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

# Report the result.
print("Root Mean Squared Error:", rmse)


Root Mean Squared Error: 559.1038725671015


In [38]:
# Write the current submission frame to CSV without the index column.
# `index=False` belongs to `to_csv`, not to `Path`: Path defines no such
# keyword, and passing keywords to it is deprecated and slated for removal.
# Path is imported in the notebook's import cell.
# NOTE(review): `submission` is not rebuilt after the model in the cell above
# is refit, so this writes whatever frame an earlier cell left in the kernel.
# Confirm that is the intended content.
filepath = Path('outputrob_xgb.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists
submission.to_csv(filepath, index=False, encoding='utf-8')

## Scaling

### _RobustScaler()_

In [None]:
# Learn the robust-scaling statistics from the training split only.
scaler = RobustScaler().fit(X_train)
X_train_rob_scaled = scaler.transform(X_train)
print('X_lab_train_rob_scaled', X_train_rob_scaled)
print('\n')

# Reuse the training-fitted scaler for the held-out split. Fitting a second
# scaler on X_test would scale the two sets with different statistics, so the
# same raw value would map to different inputs at fit and predict time.
X_test_rob_scaled = scaler.transform(X_test)
print('X_lab_test_rob_scaled', X_test_rob_scaled)
print('\n')

### _Robust Scaled to Random Forest (test-in-training)_

In [None]:
%%time

# Model definition

model = RandomForestRegressor()
print(type(model))

In [None]:
%%time

# Model training
# Fits on the full X / y, i.e. the frame X_test was split from, so the
# held-out rows are part of the training data. That matches this section's
# "test-in-training" heading (presumably a deliberate leakage baseline).
# scikit-learn's fit returns the estimator itself, so `weights` is `model`.

weights = model.fit(X, y)
print(type(weights))

In [None]:
%%time

# Model predictions
# Predicts the held-out X_test rows with the model fitted in the cell above.

predictions = model.predict(X_test)
print(type(predictions))

In [None]:
# RMSE
# Root mean squared error of the hold-out predictions, rounded to 5 decimals.
# The message reports the target's actual mean and standard deviation for
# context; the previous text described speeds in m/s, not this price target.

tricky_error = round(mean_squared_error(y_test, predictions)**0.5, 5)
print(
    f"Price predictions error is: +/- {tricky_error} "
    f"(Mean price is around: {y_test.mean():.2f} and Std is around: {y_test.std():.2f})"
)

In [None]:
# Predict prices for the unlabelled diamonds_test frame with the current model.
y_prediction = model.predict(diamonds_test)

In [None]:
# Display the predicted values.
y_prediction

In [None]:
# Pair every prediction with its row position, which serves as the id.
solution = list(enumerate(y_prediction))

In [None]:
# Turn the (position, prediction) pairs into a two-column frame and show it.
# The columns are still the default 0 and 1 at this point.
submission = pd.DataFrame(solution)
submission

In [None]:
# Give the two columns their submission names.
submission.columns=["id", "price"]

In [None]:
# Display only: the re-indexed copy is not assigned, so `submission` itself
# is left unchanged by this cell.
submission.reset_index(drop=True)

In [None]:
# Write the current submission frame to CSV without the index column.
# `index=False` belongs to `to_csv`, not to `Path`: Path defines no such
# keyword, and passing keywords to it is deprecated and slated for removal.
# Path is imported in the notebook's import cell.
filepath = Path('./predictions/randomforest_rob_203.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)  # create ./predictions if it is missing
submission.to_csv(filepath, index=False, encoding='utf-8')

### _Robust Scaled to Random Forest (the-real-stuff)_

In [None]:
%%time

# Model definition
# Ordinary least-squares linear model with default settings.
# NOTE(review): this section's heading mentions Random Forest while the cell
# builds a LinearRegression. Confirm which model is intended.

model = LinearRegression()
print(type(model))

In [None]:
%%time

# Model training
# Fits on the robust-scaled training split only (no held-out rows).
# scikit-learn's fit returns the estimator itself, so `weights` is `model`.

weights = model.fit(X_train_rob_scaled, y_train)
print(type(weights))

In [None]:
%%time

# Model predictions
# Predicts the robust-scaled held-out rows with the model fitted above.

predictions = model.predict(X_test_rob_scaled)
print(type(predictions))

In [None]:
# RMSE
# Root mean squared error of the hold-out predictions, rounded to 5 decimals.
# The message reports the target's actual mean and standard deviation for
# context; the previous text described speeds in m/s, not this price target.

real_error = round(mean_squared_error(y_test, predictions)**0.5, 5)
print(
    f"Price predictions error is: +/- {real_error} "
    f"(Mean price is around: {y_test.mean():.2f} and Std is around: {y_test.std():.2f})"
)

In [None]:
# RMSE comparison
# Ratio of the two hold-out RMSE values computed above: `real_error` (model
# fitted on the training split) over `tricky_error` (model fitted on all rows).

print(f"The real_error is {round(real_error/tricky_error, 2)} times the tricky_error")

### _StandardScaler()_

In [None]:
# Learn the standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
print(scaler.n_features_in_)
X_train_std_scaled = scaler.transform(X_train)
print('X_train_std_scaled', X_train_std_scaled)
print('\n')

# Reuse the training-fitted scaler for the held-out split. Fitting a second
# scaler on X_test would scale the two sets with different statistics, so the
# same raw value would map to different inputs at fit and predict time.
print(scaler.n_features_in_)
X_test_std_scaled = scaler.transform(X_test)
print('X_test_std_scaled', X_test_std_scaled)
print('\n')

### _Standard Scaled to Random Forest (test-in-training)_

In [None]:
%%time

# Model definition
# Ordinary least-squares linear model with default settings.
# NOTE(review): this section's heading mentions Random Forest while the cell
# builds a LinearRegression. Confirm which model is intended.

model = LinearRegression()
print(type(model))

In [None]:
%%time

# Model training
# Fits on the full X / y, i.e. the frame X_test was split from, so the
# held-out rows are part of the training data ("test-in-training").
# Note that X here has not been passed through the StandardScaler.
# scikit-learn's fit returns the estimator itself, so `weights` is `model`.

weights = model.fit(X, y)
print(type(weights))

In [None]:
%%time

# Model predictions

predictions = model.predict(X_test_std_scaled)
print(type(predictions))

In [None]:
# RMSE
# Root mean squared error of the hold-out predictions, rounded to 5 decimals.
# The message reports the target's actual mean and standard deviation for
# context; the previous text described speeds in m/s, not this price target.

tricky_error = round(mean_squared_error(y_test, predictions)**0.5, 5)
print(
    f"Price predictions error is: +/- {tricky_error} "
    f"(Mean price is around: {y_test.mean():.2f} and Std is around: {y_test.std():.2f})"
)

### _Standard Scaled to Random Forest (the-real-stuff)_

In [None]:
%%time

# Model definition
# Ordinary least-squares linear model with default settings.
# NOTE(review): this section's heading mentions Random Forest while the cell
# builds a LinearRegression. Confirm which model is intended.

model = LinearRegression()
print(type(model))

In [None]:
%%time

# Model training
# Fits on the standard-scaled training split only (no held-out rows).
# scikit-learn's fit returns the estimator itself, so `weights` is `model`.

weights = model.fit(X_train_std_scaled, y_train)
print(type(weights))

In [None]:
%%time

# Model predictions
# Predicts the standard-scaled held-out rows with the model fitted above.

predictions = model.predict(X_test_std_scaled)
print(type(predictions))

In [None]:
# RMSE
# Root mean squared error of the hold-out predictions, rounded to 5 decimals.
# The message reports the target's actual mean and standard deviation for
# context; the previous text described speeds in m/s, not this price target.

real_error = round(mean_squared_error(y_test, predictions)**0.5, 5)
print(
    f"Price predictions error is: +/- {real_error} "
    f"(Mean price is around: {y_test.mean():.2f} and Std is around: {y_test.std():.2f})"
)

In [None]:
# RMSE comparison
# Ratio of the two hold-out RMSE values computed above: `real_error` (model
# fitted on the training split) over `tricky_error` (model fitted on all rows).

print(f"The real_error is {round(real_error/tricky_error, 2)} times the tricky_error")