In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pylab as plt
import sys
import random

from itertools import product

from Data_Loading import *
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer, r2_score
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Load the attribute table, derive a "<database>/<filename>" path column and
# index the rows by the (database, filename) pair.
main_df = pd.read_csv('Banco de Dados/atributos/atributos_main.csv', index_col=0)
main_df = (
    main_df
    .assign(filepath=main_df['database'] + '/' + main_df['filename'])
    .set_index(['database', 'filename'])
)
main_df

In [None]:
# Snap every tempo to the nearest half BPM.
main_df['bpm'] = main_df['bpm'].map(lambda tempo: round(2 * tempo) / 2)

# Features: every column up to and including 'v_sd_D1'; target: 'bpm'.
X = main_df.loc[:, :'v_sd_D1']
y = main_df[['bpm']]
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Flatten the single-column targets to 1-D arrays.
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

# Target distribution of each subset, one panel per subset.
for panel, targets in enumerate((y_train, y_test), start=1):
    plt.subplot(2, 1, panel)
    plt.hist(targets, bins=273)
plt.show()

In [None]:
# One sub-frame per distinct BPM value, rows kept in their original order.
normal_dict = {
    tempo: main_df[main_df['bpm'] == tempo]
    for tempo in main_df['bpm'].unique()
}

# Same grouping, built from a row-shuffled copy of the table.
random_df = main_df.sample(frac=1)
rnd_dict = {
    tempo: random_df[random_df['bpm'] == tempo]
    for tempo in random_df['bpm'].unique()
}

   

In [None]:
# Per-BPM train / CV / test split: every BPM group is cut separately so that each
# tempo with enough examples appears in all three subsets.
dicio = rnd_dict  # Choose dictionary between random or not

MIN_GROUP_SIZE = 10    # smaller groups go entirely to the training set
TRAIN_FRACTION = 0.7
CV_FRACTION = 0.15     # whatever is left after train + CV becomes the test set

train_parts, cv_parts, test_parts = [], [], []

for bpm_list in dicio.values():
    if len(bpm_list) < MIN_GROUP_SIZE:
        train_parts.append(bpm_list)
        continue

    train_size = int(TRAIN_FRACTION * len(bpm_list))
    cv_end = train_size + int(CV_FRACTION * len(bpm_list))

    # Contiguous half-open slices. The previous bounds started CV at
    # `train_size-1` and test at `train_size+cv_size-1`, so the last training
    # row was also a CV row and the last CV row was also a test row (leakage).
    train_parts.append(bpm_list.iloc[:train_size])
    cv_parts.append(bpm_list.iloc[train_size:cv_end])
    test_parts.append(bpm_list.iloc[cv_end:])

# Concatenate once instead of growing a DataFrame inside the loop.
train = pd.concat(train_parts, axis=0) if train_parts else pd.DataFrame()
cv = pd.concat(cv_parts, axis=0) if cv_parts else pd.DataFrame()
test = pd.concat(test_parts, axis=0) if test_parts else pd.DataFrame()

# Features: every column up to and including 'v_sd_D1'; target: 'bpm'.
X_train = train.loc[:, :'v_sd_D1']
y_train = train.loc[:, ['bpm']]

X_cv = cv.loc[:, :'v_sd_D1']
y_cv = cv.loc[:, ['bpm']]

X_test = test.loc[:, :'v_sd_D1']
y_test = test.loc[:, ['bpm']]

# Flatten the single-column targets to 1-D arrays.
y_train = y_train.values.reshape(-1,)
y_cv = y_cv.values.reshape(-1,)
y_test = y_test.values.reshape(-1,)

# Target distribution of each subset, one panel per subset.
for panel, targets in enumerate((y_train, y_cv, y_test), start=1):
    plt.subplot(3, 1, panel)
    plt.hist(targets, bins=273)
plt.show()

In [None]:
# Standardise the features. The scaler is fitted on the training set ONLY and
# then applied to CV and test: fitting a separate scaler on each subset (as was
# done before) maps the three sets with different means/variances, so the model
# sees CV/test inputs on a different scale than it was trained on.
feature_scaler = StandardScaler().fit(X_train)
X_train_std = feature_scaler.transform(X_train)
X_cv_std = feature_scaler.transform(X_cv)
X_test_std = feature_scaler.transform(X_test)


# Empty results table; one row is appended per hyperparameter combination.
results_df = pd.DataFrame({"Espaço":[],
                           "Train_loss_array":[], "Train_loss":[],
                           "CV_loss_array":[], "CV_loss":[],
                           "Test_loss_array":[], "Test_loss":[],
                           "num_iter":[], "time_elapsed":[]})



# Alternative grid (disabled):
#alphas = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3]
#learning_rates = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3]
#layer_sizes = [(3,), (3,3), (5,), (5,5), (7,), (7,7), (9,), (9,9)]

alphas = [1, 3, 10, 30, 100, 300]
learning_rates = [0.0003, 0.001, 0.003, 0.01, 0.03]
layer_sizes = [(5,), (5,5), (7,), (7,7), (9,), (9,9)]

# The per-configuration step budget is iter_divisor / learning_rate.
iter_divisor = 24

# Cartesian product of the three grids: (alpha, learning_rate, layer_sizes).
params_space = list(product(alphas, learning_rates, layer_sizes))
params_num = len(params_space)
cur_param = 0

In [1]:
# Manual grid search: for every (alpha, learning rate, layer sizes) combination,
# train one MLPRegressor step by step with partial_fit and record the
# train / CV / test MSE after each step.
for space_i, space in enumerate(params_space):
    start = time.time()
    mlp = MLPRegressor(alpha = space[0], learning_rate_init = space[1], hidden_layer_sizes = space[2], warm_start = True)
    train_error = []
    cv_error = []
    test_error = []
    
    # Early-stopping counters: times_up counts steps whose training loss rose,
    # times_down counts steps that improved by less than 0.001.
    # NOTE: both are reset only when a step improves by >= 0.001 (the final
    # `else` below), so rises and small improvements may be interleaved.
    times_down = 0
    times_up = 0
    # Step budget is inversely proportional to the learning rate.
    max_iter = int(iter_divisor/space[1])
    for i in range(max_iter):
        mlp.partial_fit(X_train_std, y_train)
        train_loss = mean_squared_error(mlp.predict(X_train_std), y_train)
        
        if i>0:
            # (previous recorded loss - current loss): negative means the loss went up.
            if(train_error[-1] - train_loss)<0:
                times_up =  times_up+1
                if times_up >=5:
                    sys.stdout.flush()
                    # Message: "training stopped for lack of progress".
                    print(f'Espaço de Parâmetros {space_i}: {space} - Treinamento encerrado por falta de evolução: {i} iterações | erro={round(train_loss,3)}!')
                    break
            elif (train_error[-1] - train_loss)<0.001:
                times_down  = times_down + 1
                if times_down >=5:
                    sys.stdout.flush()
                    # Message: "training stopped, progress is acceptable".
                    print(f'Espaço de Parâmetros {space_i}: {space} - Treinamento encerrado por evolução aceitável: {i} iterações | erro={round(train_loss,3)}!')
                    break                
            else:
                times_down=0
                times_up=0
                    
        
        # NOTE: when one of the `break`s above fires, the losses of the current
        # step are NOT appended, so the arrays end at the previous step.
        train_error.append(train_loss)
        cv_error.append(mean_squared_error(mlp.predict(X_cv_std), y_cv))
        test_error.append(mean_squared_error(mlp.predict(X_test_std), y_test))
        
        # Progress line, overwritten in place via the carriage return.
        print(f'Espaço de Parâmetros {space_i} {space} - Iterações realizadas: {i}/{max_iter} - Train_loss: {train_loss}', end='\r')
        
        if i==max_iter-1:
            sys.stdout.flush()
            # Message: "training stopped, maximum number of iterations reached".
            print(f'Espaço de Parâmetros {space_i}: {space} - Treinamento encerrado por máximo de iterações: {max_iter}!')
    
    end = time.time()
    
    
    # One result row for this configuration. List/tuple-valued fields are wrapped
    # in a one-element list; the remaining keys hold plain scalars.
    this_dict = {"Espaço":[], "Layers":[], "Train_loss_array":[], "CV_loss_array":[], "Test_loss_array":[]}
    this_dict["Espaço"].append(space)
    this_dict["Alpha"] = space[0]
    this_dict["Learning Rate"] = space[1]
    this_dict["Layers"].append(space[2])
    this_dict["Train_loss_array"].append(train_error)
    # Scalar losses are the last recorded values of each curve.
    this_dict["Train_loss"] = float(train_error[-1])
    this_dict["CV_loss_array"].append(cv_error)
    this_dict["CV_loss"] = float(cv_error[-1])
    this_dict["Test_loss_array"].append(test_error)
    this_dict["Test_loss"] = float(test_error[-1])
    # Last value taken by the loop index (the step at which training stopped).
    this_dict["num_iter"] = i
    # Wall-clock training time in minutes.
    this_dict["time_elapsed"] = (end-start)/60
    
    
    results_df = pd.concat([results_df, pd.DataFrame(this_dict)], ignore_index = True)

SyntaxError: invalid syntax (113909657.py, line 30)

In [None]:
# Persist the grid-search results: lowest CV loss first, ties broken by the
# larger number of iterations.
ranked_results = results_df.sort_values(by=['CV_loss', 'num_iter'], ascending=[True, False])
ranked_results.to_csv('result_of_48_iterations_v3.csv')



In [None]:
# Reload the saved grid-search results and show them ordered by CV loss.
three = pd.read_csv('result_of_48_iterations_v3.csv')
three.sort_values(by='CV_loss')

In [None]:
def _summarise_cv_loss(results, by):
    """Aggregate ``CV_loss`` per value of the hyperparameter column ``by``.

    Returns a frame indexed by the hyperparameter value with columns
    ``mean``, ``amin``, ``amax`` and ``score`` (average of ``mean`` and ``amin``).

    Named aggregation is used so the column labels are fixed here: with
    ``agg([np.mean, np.min, np.max])`` the labels come from the NumPy functions'
    ``__name__`` (``amin`` vs ``min`` depending on the NumPy version), and
    passing NumPy callables to ``agg`` is deprecated in recent pandas.
    """
    summary = results.groupby(by)['CV_loss'].agg(mean='mean', amin='min', amax='max')
    summary['score'] = (summary['mean'] + summary['amin']) / 2
    return summary


def _print_best(summary):
    """Print the best (lowest) row of ``summary`` by score, by mean and by minimum."""
    for criterion in ('score', 'mean', 'amin'):
        print(summary.sort_values(criterion).iloc[0])


print('Alpha')
alpha = _summarise_cv_loss(three, 'Alpha')
_print_best(alpha)
print()
print('Layers')
layers = _summarise_cv_loss(three, 'Layers')
_print_best(layers)
print()
print('Learning Rate')
learning_rate = _summarise_cv_loss(three, 'Learning Rate')
_print_best(learning_rate)

In [None]:
# 'database' is an index level of main_df (it was moved there by set_index when
# the table was loaded), so it must be read from the index: main_df['database']
# raises KeyError.
database_labels = main_df.index.get_level_values('database')
test_df = main_df.loc[database_labels != 'extended_ballroom']

# Databases that remain after dropping 'extended_ballroom'.
test_df.index.get_level_values('database').unique().to_numpy()

In [None]:
# Keep every database except 'extended_ballroom'. 'database' is an index level
# of main_df (moved there by set_index at load time), so it is read from the
# index; main_df['database'] raises KeyError.
database_labels = main_df.index.get_level_values('database')
test_df = main_df.loc[database_labels != 'extended_ballroom']

# Positional selection: first 66 columns as features, column 66 as target.
# NOTE(review): presumably column 66 is 'bpm' - confirm against the CSV layout.
X = test_df.iloc[:, :66]
y = test_df.iloc[:, [66]]
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Flatten the single-column targets to 1-D arrays.
y_train = y_train.values.reshape(-1,)
y_test = y_test.values.reshape(-1,)

# Target distribution of each subset, one panel per subset.
for panel, targets in enumerate((y_train, y_test), start=1):
    plt.subplot(2, 1, panel)
    plt.hist(targets, bins=273)
plt.show()

In [None]:
from sklearn.linear_model import Lasso

# `Lasso(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2
# (it now raises TypeError). Its documented effect - subtract each column's mean
# and divide by the column's L2 norm before fitting - is reproduced explicitly.
X_centered = X - X.mean()
col_norms = np.sqrt((X_centered ** 2).sum())
col_norms = col_norms.where(col_norms > 0, 1.0)  # constant columns: avoid 0/0
X_normalized = X_centered / col_norms

lasso = Lasso(alpha = 0.4)
lasso.fit(X_normalized, y)

# Express the coefficients per unit of the ORIGINAL features, as the old
# normalize=True option reported them. `lasso.coef_` itself stays in the
# normalised feature space.
lasso_coefs = np.ravel(lasso.coef_) / col_norms.to_numpy()
print(lasso_coefs)

# One tick per feature on the y axis, coefficient value on the x axis.
plt.plot(lasso_coefs, range(len(X.columns)))
plt.yticks(range(len(X.columns)), X.columns, rotation = -15)
plt.show()




#import seaborn as sns
#sns.heatmap(main_df.corr(), square=True, cmap='RdYlGn')

# Summary statistics of each column's correlation with 'bpm'. main_df also
# holds the string column 'filepath'; DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, so they are dropped explicitly first.
# NOTE(review): iloc[:-1] drops the last row, presumably bpm's correlation with
# itself - confirm that 'bpm' is the last numeric column.
main_df.select_dtypes(include=['number', 'bool']).corr().loc[:, ['bpm']].iloc[:-1].describe()

In [None]:
mlp = MLPRegressor(max_iter=2000)

norm = StandardScaler()
quantile = QuantileTransformer()

# Scorers; the two error metrics are registered as "lower is better".
mse = make_scorer(mean_squared_error, greater_is_better=False)
mae = make_scorer(mean_absolute_error, greater_is_better=False)
r2 = make_scorer(r2_score)

# Hyperparameter grid, addressed through the pipeline step name 'model'.
param_grid_mlp = dict(
    model__hidden_layer_sizes=[(5,), (5, 5)],
    model__learning_rate_init=[0.01, 0.03, 0.1],
    model__alpha=[0.003, 0.01, 0.03, 0.1],
)


def _mlp_grid_search(scaler):
    """Return (pipeline, grid search) for `scaler` followed by the shared MLP."""
    pipe = Pipeline(steps=[('scale', scaler), ('model', mlp)])
    search = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid_mlp,
        scoring={'mse': mse, 'mae': mae, 'r2': r2},
        refit='r2',
        cv=10,
        n_jobs=-1,
    )
    return pipe, search


# MLP with Standard Scaler
mlp_scale_pipe, mlp_scale = _mlp_grid_search(norm)

# MLP with Quantile Transform
mlp_quant_pipe, mlp_quant = _mlp_grid_search(quantile)



In [None]:
# Run the StandardScaler grid search and report the wall-clock time in minutes.
start = time.time()
mlp_scale.fit(X_train, y_train)
end = time.time()

elapsed_minutes = (end - start) / 60
print(f'MLP com Standard Scaler Treinado em {elapsed_minutes} minutos')
print()

In [None]:
# Run the QuantileTransformer grid search and report the wall-clock time in minutes.
start = time.time()
mlp_quant.fit(X_train, y_train)
end = time.time()

elapsed_minutes = (end - start) / 60
print(f'MLP com Quantile Transformer Treinado em {elapsed_minutes} minutos')
print()


In [None]:
# Best score and parameters of each grid search, separated by a blank line.
for position, search in enumerate((mlp_scale, mlp_quant)):
    if position:
        print()
    print(search.best_score_)
    print(search.best_params_)


In [None]:
# Mean test score and rank for each metric of the StandardScaler grid search.
metric_columns = [
    f'{stat}_test_{metric}'
    for metric in ('mse', 'mae', 'r2')
    for stat in ('mean', 'rank')
]
pd.DataFrame(mlp_scale.cv_results_).loc[:, metric_columns]

In [None]:
# Fit the scaler on the training set only and reuse it for the test set.
# Fitting a second scaler on the test set (as before) standardises it with its
# own mean/variance, so the model is evaluated on inputs scaled differently
# from the ones it was trained on.
split_scaler = StandardScaler().fit(X_train)
X_train_std = split_scaler.transform(X_train)
X_test_std = split_scaler.transform(X_test)

In [None]:
# Degree-2 polynomial expansion of the raw (unscaled) features.
poly_expander = PolynomialFeatures(degree=2)
X_train_poly = poly_expander.fit_transform(X_train)
X_test_poly = poly_expander.transform(X_test)

In [None]:
# Train a single MLP one step at a time, recording the train/test MSE after
# every step, and stop early once the training loss has plateaued.
train = []   # training MSE per completed step
test = []    # test MSE per completed step

MAX_STEPS = 5500          # hard cap on partial_fit steps
MIN_EVOLUTION = 0.001     # a step changing the loss by less than this is "flat"
PATIENCE = 10             # stop once more than this many consecutive steps are flat

mlp_alone = MLPRegressor(max_iter = 1000, alpha = 0.3, hidden_layer_sizes = (5,), learning_rate_init = 0.001, warm_start = True)

times = 0    # consecutive flat steps so far

for i in range(MAX_STEPS):
    mlp_alone.partial_fit(X_train_std, y_train)
    train_loss = mean_squared_error(y_train, mlp_alone.predict(X_train_std))
    test_loss = mean_squared_error(y_test, mlp_alone.predict(X_test_std))

    # Change in training loss since the previous step (0 on the first step).
    evolution = train[-1] - train_loss if train else 0

    # Record the losses BEFORE deciding whether to stop, so both lists always
    # hold exactly i + 1 entries when the loop ends (early stop or not).
    train.append(train_loss)
    test.append(test_loss)

    if i > 0:
        # The reset belongs to the "flat step" test. It used to hang off the
        # `times > 10` test, which zeroed the counter after every flat step
        # and made the early stop unreachable.
        if np.abs(evolution) < MIN_EVOLUTION:
            times = times + 1
        else:
            times = 0

        if times > PATIENCE:
            sys.stdout.flush()
            print(f'Treinamento encerrado por evolução aceitável: {evolution} com {i} iterações!')
            break

    print(f'Iterações realizadas: {i} ---- Evolução: {evolution}', end='\r')
else:
    # Runs only when the loop was not interrupted by `break`. Replaces the old
    # `if i==1199` check, which no longer matched the 5500-step loop.
    print('Treinamento encerrado por máximo de iterações!')

In [None]:
# Loss curves of the step-by-step training. The x axis is derived from the
# recorded lists themselves rather than from the leftover loop variable `i`,
# whose value does not always equal len(list) - 1 (e.g. after an early stop
# that skipped the final append) and would make plt.plot raise.
plt.plot(range(len(test)), test, label='teste')
plt.plot(range(len(train)), train, label='train')
plt.legend()
plt.show()

In [None]:
# Final test-set MSE of the step-by-step trained model.
test_predictions = mlp_alone.predict(X_test_std)
mean_squared_error(test_predictions, y_test)

In [None]:
# Split the table into one frame per source database. 'database' is an index
# level of main_df (moved there by set_index at load time), so it is read from
# the index; main_df['database'] raises KeyError.
database_labels = main_df.index.get_level_values('database')
bancos_names = database_labels.unique().to_numpy()

# Keep the first 67 columns of each database's rows.
# NOTE(review): presumably the features plus 'bpm' - confirm against the CSV layout.
bancos = {
    banco: main_df.loc[database_labels == banco].iloc[:, :67]
    for banco in bancos_names
}

In [None]:
# Display the distinct database names collected when `bancos` was built.
bancos_names


In [None]:
# BPM distribution of the database stored under the key 'banco1'.
banco1_bpm = bancos['banco1']['bpm']
plt.hist(banco1_bpm, bins=150);
plt.show()

In [None]:
for banco in bancos:
    