# Implementando LGBM como regresor
- Alvarado Morán Óscar
- Bermúdez Marbán Dante

In [None]:
import numpy as np
import pandas as pd

#!conda install -c conda-forge lightgbm -y
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [None]:
def aplicar_lgbm(X_train, y_train, X_test, y_test, params=None, num_boost_round=100):
    """Train a LightGBM regressor and return its MAE on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and targets; wrapped in an ``lgb.Dataset``.
    X_test, y_test
        Features to predict on and the true targets the predictions are
        scored against.
    params : dict, optional
        Parameter dict handed to ``lgb.train``. When omitted, the notebook's
        baseline configuration (defined below) is used.
    num_boost_round : int, optional
        Number of boosting rounds handed to ``lgb.train``. Defaults to 100,
        the value this function always used before it became a parameter.

    Returns
    -------
    The mean absolute error between ``y_test`` and the model's predictions
    for ``X_test``, as computed by ``mean_absolute_error``.
    """
    if params is None:
        # Baseline configuration of the notebook.
        params = {
            'learning_rate': 0.0095,
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'mae',
            'sub_feature': 0.5,
            'num_leaves': 30,
            'min_data': 20,
            'max_depth': 10,
        }

    d_train = lgb.Dataset(X_train, label=y_train)
    model = lgb.train(params, d_train, num_boost_round)
    y_pred = model.predict(X_test)

    # scikit-learn's signature is (y_true, y_pred): ground truth goes first.
    return mean_absolute_error(y_test, y_pred)

In [None]:
# Load the base per-file statistics and preview the first rows.
datos = pd.read_csv("../csvs/stats_per_file2.csv")
datos.head()

In [None]:
# Replace every missing value with 0 (rebinding the name, no in-place mutation).
datos = datos.fillna(0)

In [None]:
# Load the training table and preview the first rows; it is joined on its
# "segment_id" column further down.
# NOTE(review): absolute, machine-specific path; this cell only runs where that directory exists.
train = pd.read_csv("/home/oscar/Escritorio/predict-volcanic-eruptions-ingv-oe/train.csv")
train.head()

In [None]:
# Attach the columns of `train` to the statistics, matching rows on "segment_id".
datos_time = datos.join(train.set_index("segment_id"), on = "segment_id")
datos_time.head()

In [None]:
# Features: every column except the first and the last. Target: the last column.
# NOTE(review): presumably column 0 is the segment id and the last column is the
# single target brought in by the join; verify against the CSV schemas.
X, y = datos_time.iloc[:, 1:-1].values, datos_time.iloc[:, -1].values

# Hold out 25% of the rows for evaluation; random_state is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
aplicar_lgbm(X_train, y_train, X_test, y_test) # With the previous statistics

In [None]:
# Using the signal statistics
datos = pd.read_csv("../csvs/stats_per_file_signal.csv")
train = pd.read_csv("/home/oscar/Escritorio/predict-volcanic-eruptions-ingv-oe/train.csv")
datos_time = datos.join(train.set_index("segment_id"), on="segment_id")

# Features: every column except the first and the last. Target: the last column.
X, y = datos_time.iloc[:, 1:-1].values, datos_time.iloc[:, -1].values

# Hold out 25% of the rows for evaluation; random_state is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
# Preview the first rows of the statistics table loaded in the previous cell.
datos.head()

In [None]:
# Score the baseline LightGBM configuration on the current train/test split.
aplicar_lgbm(X_train, y_train, X_test, y_test)

## Búsqueda aleatoria de hiperparámetros (random search)

MAE:  2580211.5837444784

{'learning_rate': 0.24842943048773125, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.6330313130909996, 'num_leaves': 48, 'min_data': 21, 'max_depth': 27}

MAE: 2553720.985319375

{'learning_rate': 0.1275200906747731,
 'boosting_type': 'dart',
 'objective': 'regression',
 'metric': 'mae',
 'sub_feature': 0.4067123592984939,
 'num_leaves': 287,
 'min_data': 19,
 'max_depth': 94}

In [None]:
def random_search(X_train, y_train, X_test, y_test, its=50):
    """Randomly sample LightGBM hyperparameters and keep the lowest-MAE trial.

    Every trial draws a parameter set and a number of boosting rounds at
    random, trains on the training split and scores the predictions for
    ``X_test`` against ``y_test``. A trial that raises an ordinary exception
    is reported and skipped; ``KeyboardInterrupt`` and ``SystemExit`` are not
    caught, so a long search can still be interrupted.

    Parameters
    ----------
    X_train, y_train
        Training features and targets.
    X_test, y_test
        Evaluation features and true targets.
    its : int, optional
        Number of trials to run (default 50).

    Returns
    -------
    maes : list
        MAE of every successful trial, in trial order.
    pars : list of dict
        Parameter dict of every successful trial, aligned with ``maes``.
    minimo : float
        Lowest MAE found; ``float('inf')`` when no trial succeeded.
    pp : dict
        Parameter dict of the best trial; empty when no trial succeeded.

    Notes
    -----
    The number of boosting rounds drawn for a trial is not stored in the
    returned parameter dicts.
    """
    minimo = float('inf')
    pp = {}
    maes = []
    pars = []
    for trial in range(its):
        print('iteration number', trial)
        params = {}
        try:
            # A fresh Dataset per trial, as before. Presumably needed because
            # Dataset construction can depend on the sampled params; verify
            # before hoisting it out of the loop.
            d_train = lgb.Dataset(X_train, label=y_train)
            params['learning_rate'] = np.random.uniform(0, 1)
            params['boosting_type'] = np.random.choice(['gbdt', 'dart', 'goss'])
            params['objective'] = 'regression'
            params['metric'] = 'mae'
            params['sub_feature'] = np.random.uniform(0, 1)
            params['num_leaves'] = np.random.randint(20, 300)
            params['min_data'] = np.random.randint(10, 100)
            params['max_depth'] = np.random.randint(5, 200)
            # Own name for the round count, so it no longer shadows the loop bound.
            num_rounds = np.random.randint(10, 10000)
            clf = lgb.train(params, d_train, num_rounds)
            y_pred = clf.predict(X_test)
            # scikit-learn's signature is (y_true, y_pred): ground truth goes first.
            mae = mean_absolute_error(y_test, y_pred)
            maes.append(mae)
            pars.append(params)
            if mae < minimo:
                minimo = mae
                pp = params
        except Exception as exc:
            # Best-effort search: report the failing configuration and move on.
            print('failed with')
            print(params)
            print('error:', repr(exc))
    return maes, pars, minimo, pp

In [None]:
%%time
# Using the signal statistics
datos = pd.read_csv("../csvs/stats_per_file_signal.csv")
train = pd.read_csv("/home/oscar/Escritorio/predict-volcanic-eruptions-ingv-oe/train.csv")
datos_time = datos.join(train.set_index("segment_id"), on="segment_id")

# Features: every column except the first and the last. Target: the last column.
X, y = datos_time.iloc[:, 1:-1].values, datos_time.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
# 200 random-search trials; print the best MAE and show its parameters.
maes1, pars1, minimo1, pp1 = random_search(
    X_train, y_train, X_test, y_test, its=200
)
print(minimo1)
pp1

In [None]:
%%time
# Using the base statistics
datos = pd.read_csv("../csvs/stats_per_file2.csv")
train = pd.read_csv("/home/oscar/Escritorio/predict-volcanic-eruptions-ingv-oe/train.csv")
datos_time = datos.join(train.set_index("segment_id"), on="segment_id")

# Features: every column except the first and the last. Target: the last column.
X, y = datos_time.iloc[:, 1:-1].values, datos_time.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
# 100 random-search trials; print the best MAE and show its parameters.
maes2, pars2, minimo2, pp2 = random_search(
    X_train, y_train, X_test, y_test, its=100
)
print(minimo2)
pp2

# Probando con los datos de prueba

In [None]:
# Training: fit on every row of the base statistics (no hold-out split here).
datos = pd.read_csv("../csvs/stats_per_file2.csv")
train = pd.read_csv("/home/oscar/Escritorio/predict-volcanic-eruptions-ingv-oe/train.csv")
datos_time = datos.join(train.set_index("segment_id"), on = "segment_id")

# Features: every column except the first and the last. Target: the last column.
X = datos_time.iloc[:,1:-1].values
y = datos_time.iloc[:,-1].values

d_train = lgb.Dataset(X, label = y)
# Build the dict here: no notebook-level `params` exists (the name is only a
# local inside the functions above), so item assignment raised NameError.
# The values are the second configuration recorded in the random-search notes.
params = {
    'learning_rate': 0.1275200906747731,
    'boosting_type': 'dart',
    'objective': 'regression',
    'metric': 'mae',
    'sub_feature': 0.4067123592984939,
    'num_leaves': 287,
    'min_data': 19,
    'max_depth': 94,
}
iterations = 10000
clf = lgb.train(params, d_train, iterations)

# Test
# TODO: load the real test features into X_test. Until then this line uses
# whatever X_test an earlier cell left in the kernel, i.e. rows that are also
# part of the training matrix X above.
#X_test = 
y_pred = clf.predict(X_test) # Create predictions on test set

