Regression Machine Learning Methods
The aim is to predict the energy of each song, using its Mel-Frequency Cepstral Coefficients (MFCCs) as input.

### Libraries

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import mean
from numpy import std
from sklearn import tree
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

### Dataset path and needed parameters
Be careful and change the paths before executing

In [ ]:
# Data
# Path of the CSV file holding the regression dataset (read with pd.read_csv)
rdata_path = 'df_energy.csv'

# Parameters
# When True, the model cells draw their plots and save them to disk
plots = True

### Data loading

In [ ]:
# Read the CSV: ';' separates the fields and ',' is the decimal mark
data = pd.read_csv(rdata_path, sep=';', decimal=',', index_col=None)

# Positional column ranges: column 6 is the regression target (presumably the
# song energy - confirm against the CSV header), columns 7 to 27 are the inputs
target_columns = slice(6, 7)
feature_columns = slice(7, 28)
y = data.iloc[:, target_columns]
X = data.iloc[:, feature_columns]

# Hold-out split: 20 % of the rows go to the test set, with a fixed seed
holdout = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout

## Models
The regression models available in this project are:
1. Ridge regression
2. K-Nearest Neighbours (KNN)
3. Regression Tree

### Ridge Regression
Step-by-step Ridge regression process:
1. Load the data
2. Execute the cross validation method over the data
3. Define the model and its possible alphas
4. Train the models and return the best one
5. Test the model and save the results

In [ ]:
# Ridge regression evaluated with a 10-fold outer cross-validation.
# On every fold RidgeCV picks the best alpha on the training part, the fitted
# model is scored on the held-out part and (optionally) a scatter plot of
# real vs. predicted values is saved.

# Result list (one dict with the MSE and R2 of each outer fold)
outer_results = []

if plots:
    # savefig does not create missing folders, so make sure the target exists
    os.makedirs('Ridge', exist_ok=True)

cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# Outer loop (the folds only depend on the number of rows)
for train, test in cv_outer.split(X):
    # Split data. X only holds the feature columns, so all of them are used.
    # NOTE(review): the KNN and regression tree cells only use the first 20
    # columns of X - confirm which feature set is intended.
    X_train, X_test = X.iloc[train], X.iloc[test]
    y_train, y_test = y.iloc[train], y.iloc[test]

    # Model selection: RidgeCV selects alpha with its built-in cross-validation
    model = RidgeCV(alphas=[0.1, 0.3, 0.5, 0.7, 1])

    # Search
    model.fit(X_train, y_train)

    # Model Evaluation
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store result
    outer_results.append({'Model': 'Ridge', 'MSE': mse, 'R2': r2})

    # Plot results
    if plots:
        real_values = np.asarray(y_test).ravel()
        predicted_values = np.asarray(y_pred).ravel()
        # Use one common range for both axes so that the reference line is the
        # y = x diagonal, i.e. the place where a perfect prediction would fall
        low = min(real_values.min(), predicted_values.min())
        high = max(real_values.max(), predicted_values.max())

        plt.clf()
        plt.plot([low, high], [low, high], linestyle='--',
                 color='black',
                 label='Perfect Prediction')
        plt.scatter(real_values, predicted_values, color='purple')
        plt.xlabel('Real Values')
        plt.ylabel('Predicted Values')
        plt.title('Scatter Matrix - Ridge Regression')
        # Without a legend the label of the reference line is never shown
        plt.legend()
        # NOTE(review): the file name carries the R2 of the fold, not alpha
        plt.savefig(f'Ridge/Alpha_{r2}.png')

    # Report progress
    print('- mse=%.3f,r2=%.3f, alpha=%.3f' % (mse, r2, model.alpha_,))

# Summarize the estimated performance of the model: mean (std) over the folds
mse_values = [item['MSE'] for item in outer_results]
r2_values = [item['R2'] for item in outer_results]
print('MSE: %.3f (%.3f)' % (mean(mse_values), std(mse_values)))
print('R2: %.3f (%.3f)' % (mean(r2_values), std(r2_values)))

### K-Nearest Neighbours (KNN)
Step-by-step KNN regression process:
1. Load the data
2. Execute the cross validation method over the data
3. Define the model and its space
4. Apply the search definition for the cross validation
5. Train the models and return the best one
6. Test the model and save the results

In [ ]:
# KNN regression evaluated with nested cross-validation.
# The 10-fold outer loop estimates the performance; inside each outer fold a
# 3-fold grid search picks the number of neighbours on the training part only.

# Result list (one dict with the MSE and R2 of each outer fold)
outer_results = []

# Other variables
# Counts the saved plots; appended to the file name to keep it unique per fold
num_knn = 0

# Search Variables definition: number of neighbours from 2 to 99
space = {'n_neighbors': list(range(2, 100))}

# Configure the inner cross-validation procedure (fixed seed for
# reproducible inner splits)
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)

if plots:
    # savefig does not create missing folders, so make sure the target exists
    os.makedirs('KNN', exist_ok=True)

# Enumerate the splits
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# Outer loop (the folds only depend on the number of rows)
for train, test in cv_outer.split(X):
    # Split data
    # NOTE(review): only the first 20 columns of X are used here, whereas the
    # Ridge cell uses every column of X - confirm which feature set is intended.
    X_train, X_test = X.iloc[train, 0:20].astype(float), X.iloc[test, 0:20].astype(float)
    y_train, y_test = y.iloc[train].astype(float), y.iloc[test].astype(float)

    # Model selection
    model = KNeighborsRegressor()

    # Search definition: no scoring is given, so the candidates are ranked
    # with the default score of the estimator; refit=True retrains the best
    # candidate on the whole outer training fold
    search = GridSearchCV(model, space, cv=cv_inner, refit=True)

    # Search
    result = search.fit(X_train, y_train)

    # Save the best model
    best_model = result.best_estimator_

    # Model Evaluation
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store result for the outer one
    outer_results.append({'Model': 'KNN', 'MSE': mse, 'R2': r2})

    # Plot results
    if plots:
        num_knn += 1
        real_values = np.asarray(y_test).ravel()
        predicted_values = np.asarray(y_pred).ravel()
        # Use one common range for both axes so that the reference line is the
        # y = x diagonal, i.e. the place where a perfect prediction would fall
        low = min(real_values.min(), predicted_values.min())
        high = max(real_values.max(), predicted_values.max())

        plt.clf()
        plt.plot([low, high], [low, high], linestyle='--', color='black',
                 label='Perfect Prediction')
        plt.scatter(real_values, predicted_values, color='blue')
        plt.xlabel('Real Values')
        plt.ylabel('Predicted Values')
        plt.title('Scatter Matrix - KNN')
        # Without a legend the label of the reference line is never shown
        plt.legend()
        plt.savefig(f'KNN/{best_model}_{num_knn}.png')

    # Report progress
    print('- mse=%.3f,r2=%.3f, K_neightbours=%s' % (mse, r2, result.best_params_))

In [ ]:
# Collect the per-fold scores and report each metric as "mean (std)"
mse_values, r2_values = [], []
for fold_scores in outer_results:
    mse_values.append(fold_scores['MSE'])
    r2_values.append(fold_scores['R2'])

mse_summary = (mean(mse_values), std(mse_values))
r2_summary = (mean(r2_values), std(r2_values))
print('MSE: %.3f (%.3f)' % mse_summary)
print('R2: %.3f (%.3f)' % r2_summary)

### Regression Tree
Step-by-step regression tree process:
1. Load the data
2. Execute the cross validation method over the data
3. Define the model and its space
4. Apply the search definition for the cross validation
5. Train the models and return the best one
6. Test the model and save the results

In [ ]:
# Regression tree evaluated with nested cross-validation.
# The 10-fold outer loop estimates the performance; inside each outer fold a
# 3-fold grid search picks max_leaf_nodes and min_samples_split on the
# training part only.

# Result list (one dict with the MSE and R2 of each outer fold)
outer_results = []

# Search Variables definition
space = {
    'max_leaf_nodes': list(range(2, 100)),
    'min_samples_split': list(range(2, 20)),
}

# Configure the inner cross-validation procedure (fixed seed for
# reproducible inner splits)
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)

if plots:
    # savefig does not create missing folders, so make sure the target exists
    os.makedirs('RegressTree', exist_ok=True)

# Enumerate splits
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)

# Outer loop (the folds only depend on the number of rows)
for train, test in cv_outer.split(X):
    # Split data
    # NOTE(review): only the first 20 columns of X are used here, whereas the
    # Ridge cell uses every column of X - confirm which feature set is intended.
    X_train, X_test = X.iloc[train, 0:20].astype(float), X.iloc[test, 0:20].astype(float)
    y_train, y_test = y.iloc[train].astype(float), y.iloc[test].astype(float)

    # Model selection
    # NOTE(review): no random_state is set on the tree, so results may vary
    # slightly between runs - consider fixing it like the KFold seeds.
    model = DecisionTreeRegressor()

    # Search definition: no scoring is given, so the candidates are ranked
    # with the default score of the estimator; refit=True retrains the best
    # candidate on the whole outer training fold
    search = GridSearchCV(model, space, cv=cv_inner, refit=True)

    # Search
    result = search.fit(X_train, y_train)

    # Save the best model
    best_model = result.best_estimator_

    # Model Evaluation
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store result for the outer one
    outer_results.append({'Model': 'Regression_Tree', 'MSE': mse, 'R2': r2})

    # Plot results
    if plots:
        fig = plt.figure(figsize=(25, 20))
        # A plain list of names is accepted by every scikit-learn release,
        # which is not the case for a pandas Index
        tree.plot_tree(best_model,
                       feature_names=list(X_train.columns),
                       filled=True)
        # NOTE(review): the file name only depends on the selected
        # hyper-parameters, so two folds that select the same ones overwrite
        # each other's plot.
        plt.savefig(f'RegressTree/{best_model}.png')
        # Release the large figure; otherwise one stays open per fold
        plt.close(fig)

    # Report progress
    print('- mse=%.3f,r2=%.3f, Tree_Params=%s' % (mse, r2, result.best_params_))

# Summarize the estimated performance of the model: mean (std) over the folds
mse_values = [item['MSE'] for item in outer_results]
r2_values = [item['R2'] for item in outer_results]
print('MSE: %.3f (%.3f)' % (mean(mse_values), std(mse_values)))
print('R2: %.3f (%.3f)' % (mean(r2_values), std(r2_values)))
