In [1]:
#Librerías
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from time import time
from pandas.plotting import scatter_matrix
from prettytable import PrettyTable

# === 

# Scikit-learn packages
#SEVERAL
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
#Metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score
from sklearn.metrics import mean_squared_error
#Models
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from lazypredict.Supervised import LazyRegressor
# Hide warnings
import warnings
warnings.filterwarnings('ignore')
# Setting up max columns displayed to 100
pd.options.display.max_columns = 100


In [2]:
# Load the processed training set into df_train.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only runs on the
# author's machine. A later cell reads '../data/processed/train.csv' with index_col=0 —
# presumably the same file; TODO confirm and use one relative path for both.
df_train = pd.read_csv(r"C:\Users\Usuario\Documents\Entrega_ML\Data\Processed\train.csv")

In [4]:
df_train.head(5)

Unnamed: 0,Hydrogen,Oxigen,Nitrogen,Methane,CO,CO2,Ethylene,Ethane,Acethylene,DBDS,Power factor,Interfacial V,Dielectric rigidity,Water content,Health index,Life expectation
0,2845,5860,27842,7406,32,1344,16684,5467,7,19.0,1.0,45,55,0,95.2,19.0
1,12886,61,25041,877,83,864,4,305,0,45.0,1.0,45,55,0,85.5,19.0
2,2820,16400,56300,144,257,1080,206,11,2190,1.0,1.0,39,52,11,85.3,19.0
3,1099,70,37520,545,184,1402,6,230,0,87.0,4.58,33,49,5,85.3,6.0
4,3210,3570,47900,160,360,2130,4,43,4,1.0,0.77,44,55,3,85.2,6.0


In [12]:
df_train.columns

Index(['H2', 'O2', 'N2', 'CH4', 'CO', 'CO2', 'C2H4', 'C2H6', 'C2H2', 'DBDS',
       'Power factor', 'Interfacial V', 'Dielectric rigidity', 'Water content',
       'Health index', 'Life expectation'],
      dtype='object')

### LIST OF REGRESSION ALGORITHMS IN MACHINE LEARNING
- Linear Regression
- Lasso Regression 
- Ridge Regression
- Polynomial Regression 
- Support Vector Machines (SVM)
- Gaussian Regression
- Random Forest
- Decision Tree Regression 
- KNN Model
- Neural Network Regression 

### Primera Aproximación: "Multiple Models – Regression"

In [20]:
X = df_train[['H2', 'O2', 'N2', 'CH4', 'CO', 'CO2', 'C2H4', 'C2H6', 'C2H2', 'DBDS', 'Interfacial V', 'Dielectric rigidity', 'Water content', 'Life expectation', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6']]
y1 = df_train['Health index']

In [22]:
df_train.columns

Index(['H2', 'O2', 'N2', 'CH4', 'CO', 'CO2', 'C2H4', 'C2H6', 'C2H2', 'DBDS',
       'Power factor', 'Interfacial V', 'Dielectric rigidity', 'Water content',
       'Health index', 'Life expectation', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6'],
      dtype='object')

In [23]:
# Split first, then scale.
# The scaler is fitted on the training rows only, so the held-out rows never
# influence the mean/std used for scaling (fitting on all of X leaks test-set
# statistics into training and makes the test scores optimistic).
X_train, X_test, y_train, y_test = train_test_split(X, y1, random_state=3, test_size=0.25)

sc = StandardScaler()  # StandardScaler is already imported in the first cell
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Scaled copy of the full feature matrix, kept under its original name and
# produced with the training-set statistics.
X_sc = sc.transform(X)


In [24]:
regressors = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    LinearRegression(),
    Lasso(),
    Ridge()
]

In [25]:
# Fit every candidate regressor on the training split, then report its
# fit/predict wall-clock time and its metrics on the test split.
head = 9  # slice bound; `regressors` holds 8 estimators, so the slice keeps all of them
for model in regressors[:head]:
    # Time the fit on the training split.
    start = time()
    model.fit(X_train, y_train)
    train_time = time() - start
    # Time the prediction on the test split.
    start = time()
    y_pred = model.predict(X_test)
    predict_time = time()-start    
    # The estimator's repr is printed as the header of its report.
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tPrediction time: %0.3fs" % predict_time)
    print("\tExplained variance:", explained_variance_score(y_test, y_pred))
    print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
    print("\tR2 score:", r2_score(y_test, y_pred))
    print()

KNeighborsRegressor()
	Training time: 0.007s
	Prediction time: 0.007s
	Explained variance: 0.4369400324581272
	Mean absolute error: 8.941186440677965
	R2 score: 0.38904697848953185

GradientBoostingRegressor()
	Training time: 0.369s
	Prediction time: 0.000s
	Explained variance: 0.7331377233288412
	Mean absolute error: 5.6052363790061595
	R2 score: 0.7328121483574479

ExtraTreesRegressor()
	Training time: 0.263s
	Prediction time: 0.012s
	Explained variance: 0.8047816567766422
	Mean absolute error: 3.986127118644075
	R2 score: 0.8047488764219812

RandomForestRegressor()
	Training time: 0.388s
	Prediction time: 0.012s
	Explained variance: 0.7332016112228048
	Mean absolute error: 4.583127118644075
	R2 score: 0.7303801615113417

DecisionTreeRegressor()
	Training time: 0.004s
	Prediction time: 0.000s
	Explained variance: 0.38314226590260514
	Mean absolute error: 5.851694915254247
	R2 score: 0.37717241325835227

LinearRegression()
	Training time: 0.032s
	Prediction time: 0.000s
	Explained var

### Segunda Aproximación:  'LazyRegressor'

Usamos LazyRegressor con el objetivo de entender qué modelos podrían funcionar mejor sin ajustar ni modificar ningún parámetro ('parameter tuning').

In [26]:
# pip install lazypredict 

In [28]:
# Call train_test_split on the data and capture the results
X_train, X_test, y_train, y_test = train_test_split(X, y1, random_state=3,test_size=0.25)

reg = LazyRegressor(ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
print(models)


 79%|███████▊  | 33/42 [00:03<00:00,  9.68it/s]

RANSACRegressor model failed to execute
RANSAC could not find a valid consensus set. All `max_trials` iterations were skipped because each randomly chosen sub-sample failed the passing criteria. See estimator attributes for diagnostics (n_skips*).


100%|██████████| 42/42 [00:05<00:00,  8.20it/s]

                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
ExtraTreesRegressor                          0.74       0.78  7.48        0.29
HistGradientBoostingRegressor                0.69       0.74  8.17        0.97
GradientBoostingRegressor                    0.69       0.74  8.18        0.19
RandomForestRegressor                        0.68       0.73  8.32        0.55
LGBMRegressor                                0.67       0.73  8.33        0.20
BaggingRegressor                             0.64       0.70  8.74        0.07
XGBRegressor                                 0.62       0.69  8.95        0.25
AdaBoostRegressor                            0.46       0.55 10.77        0.20
LassoLarsIC                                  0.45       0.54 10.87        0.03
LassoCV                                      0.45       0.54 10.87        0.24
Lasso                                        0.45   




### List of Regression Algorithms we are going to study                                                                     
- LGBMRegressor                        
- HistGradientBoostingRegressor            
- ExtraTreesRegressor                         
- GradientBoostingRegressor                 
- RandomForestRegressor                                                                                       

In [None]:
# Report Performance (Funciones)

def best_model(search):
    """Print the outcome of a fitted hyper-parameter search.

    Args:
        search: object exposing ``best_estimator_`` and ``best_params_``
            (for example a fitted ``GridSearchCV``).

    Prints the best estimator first and its parameter dictionary second;
    returns nothing.
    """
    for attribute in ("best_estimator_", "best_params_"):
        print(getattr(search, attribute))

def test_train_scores(X_train, y_train, X_test, y_test, model):
    print("Accuracy train", model.score(X_train, y_train))
    print("Accuracy test", model.score(X_test, y_test))
    
def explain_scores(X_test, y_test, model):
    """Print MAE, MSE, RMSE and R^2 of a fitted regressor on a held-out set.

    This is the single definition of ``explain_scores``. The notebook used to
    define the name three times in a row, so only the last (this) signature
    was ever callable.

    Args:
        X_test: feature matrix of the held-out split.
        y_test: true target values of the held-out split.
        model: fitted estimator exposing ``predict(X)``.
    """
    # Predict once and reuse the result for every metric.
    y_pred = model.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)

    print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))
    print('R^2(accuracy):', metrics.r2_score(y_test, y_pred))

# MAPE y SMAPE
def mape(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Args:
        y_true: true values (list, NumPy array or pandas Series).
        y_pred: predicted values, same length as ``y_true``.

    Returns:
        ``mean(|y_pred - y_true| / |y_true|) * 100`` as a float.

    Note:
        The metric is undefined where ``y_true`` is 0; such entries yield
        ``inf``/``nan`` exactly as a plain division would.
    """
    # Convert to plain float arrays so values are paired by position: plain
    # lists are accepted, and two pandas Series with different index labels
    # are not silently re-aligned by label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_pred - y_true) / y_true)) * 100
 
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Args:
        y_true: true values (list, NumPy array or pandas Series).
        y_pred: predicted values, same length as ``y_true``.

    Returns:
        ``2 * mean(|y_pred - y_true| / (|y_pred| + |y_true|)) * 100`` as a
        float. A term where both values are 0 counts as zero error.
    """
    # Convert to plain float arrays so values are paired by position, not by
    # pandas index label, and so plain lists are accepted.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_pred) + np.abs(y_true)

    # scale == 0 only when prediction and truth are both 0, i.e. a perfect
    # prediction: keep that term at 0 instead of letting 0/0 produce NaN.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(scale), where=scale != 0)
    return 2.0 * np.mean(ratio) * 100

def sweet_table(X_test, y_test, *arbitrarios):
    """Build a table comparing already-fitted regressors on the same test set.

    Expected workflow before calling:
      1) X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)
      2) define the models, e.g. ``model = LGBMRegressor()`` and
         ``model1 = LinearRegression()``
      3) fit each of them: ``model.fit(X_train, y_train)``

    Args:
        X_test: feature matrix of the test split.
        y_test: true target values of the test split.
        *arbitrarios: one or more fitted estimators exposing ``predict(X)``.

    Returns:
        A ``PrettyTable`` with one column per model (headed by ``str(model)``)
        and one row per metric: MAE, MSE, RMSE and R^2. Every value is stored
        as its ``str`` representation.
    """
    names = ['Metrics']
    maes = ['MAE']
    mses = ['MSE']
    rmses = ['RMSE']
    score_test = ['Accuracy (R^2)']

    for model in arbitrarios:
        # Predict once per model and reuse the result for every metric.
        y_pred = model.predict(X_test)
        mse = metrics.mean_squared_error(y_test, y_pred)

        names.append(str(model))
        maes.append(str(metrics.mean_absolute_error(y_test, y_pred)))
        mses.append(str(mse))
        rmses.append(str(np.sqrt(mse)))
        score_test.append(str(metrics.r2_score(y_test, y_pred)))

    table = PrettyTable()
    table.field_names = names
    for row in (maes, mses, rmses, score_test):
        table.add_row(row)

    return table

### 1ª LGBMRegressor  

In [29]:
#Extra
import lightgbm as lgb
from lightgbm import LGBMRegressor

Paso 1)

In [None]:
'''Lo hacemos pese a no apreciar grandes mejoras al escalar los datos'''

scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

X1_train, X1_test, y1_train, y1_test = train_test_split(X_scaled, y1, test_size=0.18, random_state=45, shuffle=True)

params = {
    'boosting_type': ['dart', 'gbdt', 'goss'],
    'num_leaves': [7, 14, 21, 28, 31, 50],
    'max_depth': [-1, 3, 5],
    'learning_rate': [0.01, 0.1, 1.0],
    'n_estimators': [10, 50, 100, 200, 500],
         }

grid = GridSearchCV(lgb.LGBMRegressor(random_state=0), params, scoring='r2', cv=5, n_jobs=-1)
grid.fit(X1_train, y1_train)

In [None]:
best_model(grid)

Paso 2)\
       --> Ejecutar el modelo escogido.

In [None]:
# Hold-out split used to fit and score the chosen LightGBM configuration.
# NOTE(review): this split uses random_state=42 while the grid search above was fitted on a
# random_state=45 split, so rows of X2_test may already have been seen during tuning — TODO confirm.
X2_train, X2_test, y2_train, y2_test = train_test_split(X_scaled, y1, test_size=0.18, random_state=42)

# Define the model
model = LGBMRegressor(learning_rate=0.01, n_estimators=500, num_leaves=14,
              random_state=0)
# Train the model on the training split (evaluation happens in the following cells)
model.fit(X2_train, y2_train)

In [None]:
explain_scores(X2_test, y2_test, model)

Paso 3)\
   --> Comprobar si existe overfitting.

In [None]:
test_train_scores(X2_train, y2_train, X2_test, y2_test, model)

In [None]:
scores = cross_val_score(model, X_scaled, y1, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores1 = np.sqrt(-scores)
print(lin_rmse_scores1)
print('\n')
print('Media de los resultados:', lin_rmse_scores1.mean())

In [None]:
#Plot Figure
y_pred1 = model.predict(X2_test)
sns.distplot(y_pred1 - y2_test)
plt.title('LGBMRegressor')
plt.savefig('hist_model1.png', orientation = 'horizontal', dpi=300);

### 2ª HistGradientBoostingRegressor 

In [31]:
#Extra
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

Paso 1)


In [None]:
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

X1_train, X1_test, y1_train, y1_test = train_test_split(X_scaled, y1, test_size=0.18, random_state=45, shuffle=True)

params = {
    'loss' : ['least_squares', 'least_absolute_deviation', 'poisson'],
    'learning_rate': [0.05, 0.1, 0.2, 0.3],
    'max_iter': [25, 30, 50, 60, 70, 100],
    'max_bins': [80, 100, 125, 135, 180, 255],
         }

grid = GridSearchCV(HistGradientBoostingRegressor(), params, scoring='r2', cv=5)
grid.fit(X1_train, y1_train)

In [None]:
best_model(grid)

Paso 2)

In [32]:
# Hold-out split used to fit and score the chosen HistGradientBoostingRegressor configuration.
X2_train, X2_test, y2_train, y2_test = train_test_split(X_scaled, y1, test_size=0.18, random_state=42, shuffle=True)

# Define the model
model1 = HistGradientBoostingRegressor(learning_rate=0.05, max_bins=100, max_iter=70)
# Train the model on the training split (evaluation happens in the following cells)
model1.fit(X2_train, y2_train)

HistGradientBoostingRegressor(learning_rate=0.05, max_bins=100, max_iter=70)

In [None]:
# Score the HistGradientBoostingRegressor fitted in this section (model1);
# `model` still refers to the estimator of the previous section.
explain_scores(X2_test, y2_test, model1)

Paso 3)

In [None]:
# Compare train vs. test score of the HistGradientBoostingRegressor (model1),
# not of the estimator left in `model` by the previous section.
test_train_scores(X2_train, y2_train, X2_test, y2_test, model1)

In [None]:
# 10-fold cross-validated RMSE of the HistGradientBoostingRegressor (model1);
# `model` would cross-validate the estimator of the previous section instead.
scores = cross_val_score(model1, X_scaled, y1, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-scores)  # the scorer returns negated MSE
print(lin_rmse_scores)
print('\n')
print('Media de los resultados:', lin_rmse_scores.mean())

In [None]:
#Plot Figure
# Residual distribution (prediction - truth) of the HistGradientBoostingRegressor
# fitted in this section (model1), matching the plot title and file name.
y_pred2 = model1.predict(X2_test)
sns.distplot(y_pred2 - y2_test)
plt.title('HistGradientBoostingRegressor');
plt.savefig('hist_model2.png', orientation = 'horizontal', dpi=300);

### 3ª ExtraTreesRegressor

Paso 1)

In [None]:
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

X1_train, X1_test, y1_train, y1_test = train_test_split(X_scaled, y1, test_size=0.18, random_state=45, shuffle=True)

model = ExtraTreesRegressor()
                            
grid = GridSearchCV(
    estimator=model,
    param_grid={'n_estimators':[50,100,150],
        'max_depth':[2,4,5,8],
        'min_samples_split':[2,3,5,6],
        'min_samples_leaf':[2,3,4,6],
        'min_impurity_decrease':[0.01,0,1]},
    scoring='r2',
    cv=5
)

'''param_grid={'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': range(20, 50, 5),
                         'min_samples_split': range(15, 36, 5),
                         'n_estimators': range(50, 126, 25)},
             scoring='r2')'''

grid.fit(X1_train, y1_train)

In [None]:
best_model(grid)

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X_scaled, y1, test_size=0.18, random_state=42)

# Define the model
model = ExtraTreesRegressor(max_depth=8, min_impurity_decrease=0.01, min_samples_leaf=2,
                    min_samples_split=6, n_estimators=50)

'''ExtraTreesRegressor(min_samples_leaf=20, min_samples_split=35, n_estimators=50,
                    n_jobs=4)'''
#Evaluate the Model
model.fit(X2_train, y2_train)

In [None]:
explain_scores(X2_test, y2_test, model)

Paso 3)

In [None]:
test_train_scores(X2_train, y2_train, X2_test, y2_test, model)

In [None]:
scores = cross_val_score(model, X_scaled, y1, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-scores)
print(lin_rmse_scores)
print('\n')
print('Media de los resultados:', lin_rmse_scores.mean())

In [None]:
#Plot Figure
# Residual distribution (prediction - truth) of the ExtraTreesRegressor.
y_pred3 = model.predict(X2_test)
sns.set_theme()
sns.distplot(y_pred3 - y2_test)
# The fitted estimator is ExtraTreesRegressor; 'ExtraTreeRegressor' is a
# different scikit-learn class, so the title must carry the plural name.
plt.title('ExtraTreesRegressor');
# Third model of the study: numbered like hist_model1/2/4/5.
plt.savefig('hist_model3.png', orientation = 'horizontal', dpi=300);

### 4ª GradientBoostingRegressor

Paso 1)

In [None]:
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

X1_train, X1_test, y1_train, y1_test = train_test_split(X_scaled, y1, test_size=0.18, random_state=45, shuffle=True)

model = GradientBoostingRegressor()
                            
grid = GridSearchCV(
    estimator=model,
    param_grid = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [100,500,1000, 1500],
                  'max_depth'    : [4,6,8,10]
                 },
    scoring='r2',
    cv=5
)

grid.fit(X1_train, y1_train)

In [None]:
best_model(grid)

Paso 2)

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X_scaled, y1, test_size=0.18, random_state=42, shuffle=True)

# Define the model
model = GradientBoostingRegressor(learning_rate=0.01, max_depth=6, n_estimators=500,
                          subsample=0.5)

'''GradientBoostingRegressor(learning_rate=0.04, max_depth=6, subsample=0.5)
   GradientBoostingRegressor(learning_rate=0.05, loss='lad', max_features='auto')'''
   
#Evaluate the model
model.fit(X2_train, y2_train)

In [None]:
explain_scores(X2_test, y2_test, model)

Paso 3)

In [None]:
test_train_scores(X2_train, y2_train, X2_test, y2_test, model)

In [None]:
scores = cross_val_score(model, X_scaled, y1, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-scores)
print(lin_rmse_scores)
print('\n')
print('Media de los resultados:', lin_rmse_scores.mean())

In [None]:
y_pred4 = model.predict(X2_test)
sns.set_theme()
sns.distplot(y_pred4 - y2_test)
plt.title('GradientBoostingRegressor');
# plt.savefig('quesito.png', dpi = 300, orientation = 'horizontal')
plt.savefig('hist_model4.png', orientation = 'horizontal', dpi=300);

### 5ª RandomForestRegressor                        

Paso 1)

In [None]:
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

X1_train, X1_test, y1_train, y1_test = train_test_split(X_scaled, y1, test_size=0.18, random_state=45, shuffle=True)

model = RandomForestRegressor()
                            
grid = GridSearchCV(
    estimator=model,
    param_grid={'max_depth': [10, 20, 30, 40, 50],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600]},
    scoring='r2',
    cv=5
)

grid.fit(X1_train, y1_train)

In [None]:
best_model(grid)

Paso 2)

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X_scaled, y1, test_size=0.18, random_state=42, shuffle=True)

#  define the model
model = RandomForestRegressor(max_depth=10, min_samples_leaf=2, n_estimators=200)
model.fit(X2_train, y2_train)

In [None]:
explain_scores(X2_test, y2_test, model)

Paso 3)

In [None]:
test_train_scores(X2_train, y2_train, X2_test, y2_test, model)

In [None]:
scores = cross_val_score(model, X_scaled, y1, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-scores)
print(lin_rmse_scores)
print('\n')
print('Media de los resultados:', lin_rmse_scores.mean())

In [None]:
y_pred = model.predict(X2_test)
sns.set_theme()
sns.distplot(y_pred - y2_test)
plt.title('RandomForestRegressor');
# plt.savefig('quesito.png', dpi = 300, orientation = 'horizontal')
plt.savefig('hist_model5.png', orientation = 'horizontal', dpi=300);

Elegimos nuestro modelo más óptimo.

 - Función 'SAVE' en Pickle

In [None]:
train = pd.read_csv('../data/processed/train.csv', index_col=0)
train

In [None]:
X = train.drop(columns='Health index')
y = train['Health index']

In [None]:
# Refit the chosen LightGBM configuration on the full training frame (X, y from the previous cell).
# NOTE(review): assigning to `best_model` rebinds the name of the `best_model(search)` helper
# defined earlier in this notebook; re-running any `best_model(grid)` cell after this one would
# fail because an estimator instance is not callable. Consider renaming this variable (together
# with its use in the pickle cell below).
# NOTE(review): X here is every column except 'Health index' and is not scaled, whereas tuning
# used a scaled, hand-picked column list — TODO confirm this is intended.
best_model = LGBMRegressor(learning_rate=0.01, n_estimators=500, num_leaves=14,
              random_state=0)
# The result of fit() is also kept under the name train_model.
train_model = best_model.fit(X, y)

In [None]:
import pickle

with open('my_model.pkl', 'wb') as archivo_salida:
    pickle.dump(best_model, archivo_salida)

In [None]:
# Load the model back from the file written by the save cell, which is
# 'my_model.pkl' (opening 'my_model.sav' raised FileNotFoundError because no
# cell writes that file).
# NOTE: pickle.load can execute arbitrary code; only unpickle files you created yourself.
with open('my_model.pkl', 'rb') as archivo_entrada:
    loaded_model = pickle.load(archivo_entrada)

print(loaded_model)