# Import the Libs

In [2]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
from sklearn.model_selection import ParameterGrid

# Data Separation

In [4]:
# Load the raw dataset from disk, then wrap the result in a DataFrame
# that is kept under its own name.
data = pd.read_csv(filepath_or_buffer='input.csv')
df_data = pd.DataFrame(data=data)

In [5]:
from sklearn.model_selection import train_test_split

# Function to create a new train/validation/test split on every run
def data_shufle(df_data):
    """Split a dataset into fresh train / validation / test partitions.

    The ``NSP`` column is treated as the prediction target: it is removed
    from the feature matrix and returned separately. No ``random_state`` is
    passed to ``train_test_split``, so each call produces a different split.

    Args:
        df_data: DataFrame (or anything ``pd.DataFrame`` accepts) that
            contains an ``NSP`` column.

    Returns:
        Tuple ``(x_train, x_validation, x_test, y_train, y_validation,
        y_test)``. Half of the rows go to training; the remaining half is
        split evenly between validation and test.

    Raises:
        KeyError: If ``df_data`` has no ``NSP`` column.
    """
    # Use the argument that was passed in rather than a module-level global.
    frame = pd.DataFrame(df_data)

    # Separate the target from the features so the label cannot leak into
    # the model inputs.
    target = frame["NSP"]
    features = frame.drop(columns=["NSP"])

    # train_test_split shuffles the rows by default, so no separate
    # shuffling step is required before splitting.
    x_train, x_temp, y_train, y_temp = train_test_split(features, target, test_size=0.5)
    x_validation, x_test, y_validation, y_test = train_test_split(x_temp, y_temp, test_size=0.5)

    return x_train, x_validation, x_test, y_train, y_validation, y_test

# KNR

In [6]:
from sklearn.neighbors import KNeighborsRegressor

def KNN_model(x_train, x_validation, y_train, y_validation):
    """Select the k-nearest-neighbours regressor with the lowest validation RMSE.

    Every ``n_neighbors`` value from 1 to 49 (capped at the number of
    training samples) is combined with both weighting schemes
    (``"uniform"`` and ``"distance"``). Each candidate is fitted on the
    training data and scored on the validation data.

    Args:
        x_train: Training features.
        x_validation: Validation features.
        y_train: Training targets.
        y_validation: Validation targets.

    Returns:
        Tuple ``(best_KNN, best_k, best_dist)``: the fitted best model, its
        ``n_neighbors`` value and its ``weights`` value.

    Raises:
        ValueError: If no candidate could be evaluated (for example, the
            training set is empty or every RMSE was NaN).
    """
    best_rmse = float('inf')
    best_KNN = None
    best_k = None
    best_dist = None

    # Asking for more neighbours than there are fitted samples makes
    # prediction fail, so the search stops at the training set size.
    max_k = min(49, len(x_train))

    # Find the best configuration
    for k in range(1, max_k + 1):
        # Iterate over the two weighting schemes by name; range() only
        # accepts integers.
        for weights in ("uniform", "distance"):
            knn_inst = KNeighborsRegressor(n_neighbors=k, weights=weights)
            knn_inst.fit(x_train, y_train)
            knn_validation = knn_inst.predict(x_validation)
            mse = sk.metrics.mean_squared_error(y_validation, knn_validation)
            rmse = np.sqrt(mse)

            # Keep this candidate if it beats the best one seen so far
            if rmse < best_rmse:
                best_rmse = rmse
                best_k = k
                best_dist = weights
                best_KNN = knn_inst

    if best_KNN is None:
        raise ValueError("KNN_model: no candidate model could be evaluated")

    return best_KNN, best_k, best_dist


def KNN(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Tune a KNN regressor on the validation set and score it on the test set.

    Args:
        x_train, y_train: Training features and targets.
        x_validation, y_validation: Data used by ``KNN_model`` to pick the
            best configuration.
        x_test, y_test: Held-out data used for the final error.

    Returns:
        Tuple ``(best_KNN, best_params, rmse)`` where ``best_params`` is a
        dict with keys ``"k"`` and ``"dist"`` and ``rmse`` is the root mean
        squared error on the test data.
    """
    # Model selection is delegated to KNN_model
    model, chosen_k, chosen_weights = KNN_model(
        x_train, x_validation, y_train, y_validation
    )

    # Score the selected model on the held-out test data
    test_predictions = model.predict(x_test)
    test_rmse = np.sqrt(sk.metrics.mean_squared_error(y_test, test_predictions))

    # Report the winning configuration
    chosen_params = {"k": chosen_k, "dist": chosen_weights}

    return model, chosen_params, test_rmse
    
    
     

# SVR

In [None]:
from sklearn.svm import SVR

def SVM_model(x_train, x_validation, y_train, y_validation):
    """Select the support-vector regressor with the lowest validation RMSE.

    Every combination of kernel and ``C`` in the search space is fitted on
    the training data and scored on the validation data.

    Args:
        x_train: Training features.
        x_validation: Validation features.
        y_train: Training targets.
        y_validation: Validation targets.

    Returns:
        Tuple ``(best_SVM, best_params)``: the fitted best model and the
        parameter dict that produced it.
    """
    # Search space explored by the grid
    search_space = {
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "C": [0.1, 1, 10, 100, 1000],
    }

    lowest_rmse = float('inf')

    for candidate in ParameterGrid(search_space):
        regressor = SVR(**candidate)
        regressor.fit(x_train, y_train)
        predictions = regressor.predict(x_validation)
        candidate_rmse = np.sqrt(
            sk.metrics.mean_squared_error(y_validation, predictions)
        )

        # Skip candidates that do not strictly improve on the best so far
        if not (candidate_rmse < lowest_rmse):
            continue

        lowest_rmse = candidate_rmse
        winning_params = candidate
        winning_model = regressor

    return winning_model, winning_params

def SVM(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Tune an SVR on the validation set and score it on the test set.

    Args:
        x_train, y_train: Training features and targets.
        x_validation, y_validation: Data used by ``SVM_model`` to pick the
            best configuration.
        x_test, y_test: Held-out data used for the final error.

    Returns:
        Tuple ``(best_SVM, best_params, rmse)`` where ``rmse`` is the root
        mean squared error on the test data.
    """
    # Model selection is delegated to SVM_model
    model, chosen_params = SVM_model(x_train, x_validation, y_train, y_validation)

    # Score the selected model on the held-out test data
    test_predictions = model.predict(x_test)
    test_rmse = np.sqrt(sk.metrics.mean_squared_error(y_test, test_predictions))

    return model, chosen_params, test_rmse

# MLP

In [None]:
from sklearn.neural_network import MLPRegressor

def MLP_model(x_train, x_validation, y_train, y_validation):
    """Select the multi-layer perceptron regressor with the lowest validation RMSE.

    Every combination in the search space is fitted on the training data
    and scored on the validation data.

    Args:
        x_train: Training features.
        x_validation: Validation features.
        y_train: Training targets.
        y_validation: Validation targets.

    Returns:
        Tuple ``(best_MLP, best_params)``: the fitted best model and the
        parameter dict that produced it.
    """
    # Search space explored by the grid.
    # NOTE(review): the solver is left at its default here; check whether
    # the "learning_rate" schedule has any effect with that solver.
    search_space = {
        "hidden_layer_sizes": [(100, 50, 25), (21, 14, 1), (100, 100, 100)],
        "activation": ["identity", "logistic", "tanh", "relu"],
        "max_iter": [1000, 2000],
        "learning_rate": ["constant", "invscaling", "adaptive"],
    }

    lowest_rmse = float('inf')

    for candidate in ParameterGrid(search_space):
        network = MLPRegressor(**candidate)
        network.fit(x_train, y_train)
        predictions = network.predict(x_validation)
        candidate_rmse = np.sqrt(
            sk.metrics.mean_squared_error(y_validation, predictions)
        )

        # Skip candidates that do not strictly improve on the best so far
        if not (candidate_rmse < lowest_rmse):
            continue

        lowest_rmse = candidate_rmse
        winning_params = candidate
        winning_model = network

    return winning_model, winning_params


def MLP(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Tune an MLP regressor on the validation set and score it on the test set.

    Args:
        x_train, y_train: Training features and targets.
        x_validation, y_validation: Data used by ``MLP_model`` to pick the
            best configuration.
        x_test, y_test: Held-out data used for the final error.

    Returns:
        Tuple ``(best_MLP, best_params, rmse)`` where ``rmse`` is the root
        mean squared error on the test data.
    """
    # Model selection is delegated to MLP_model
    model, chosen_params = MLP_model(x_train, x_validation, y_train, y_validation)

    # Score the selected model on the held-out test data
    test_predictions = model.predict(x_test)
    test_rmse = np.sqrt(sk.metrics.mean_squared_error(y_test, test_predictions))

    return model, chosen_params, test_rmse

# RF

In [None]:
from sklearn.ensemble import RandomForestRegressor

def RF_model(x_train, x_validation, y_train, y_validation):
    """Select the random-forest regressor with the lowest validation RMSE.

    Every combination in the parameter grid is fitted on the training data
    and scored on the validation data.

    Args:
        x_train: Training features.
        x_validation: Validation features.
        y_train: Training targets.
        y_validation: Validation targets.

    Returns:
        Tuple ``(best_model, best_params)``: the fitted best model and the
        parameter dict that produced it.

    Raises:
        ValueError: If no candidate produced a comparable RMSE (for example,
            every score was NaN).
    """
    best_rmse = float('inf')
    best_model = None
    best_params = None

    # Parameters to be used. These must be RandomForestRegressor
    # constructor arguments; neural-network options such as
    # "hidden_layer_sizes" or "activation" are rejected with a TypeError.
    parameters_grid = {
        "n_estimators": [10, 50, 100],
        "max_depth": [None, 10, 50],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    }

    # Find the best config
    for params in ParameterGrid(parameters_grid):
        rf_inst = RandomForestRegressor(**params)
        rf_inst.fit(x_train, y_train)
        rf_validation = rf_inst.predict(x_validation)
        mse = sk.metrics.mean_squared_error(y_validation, rf_validation)
        rmse = np.sqrt(mse)

        # Save the best model
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = rf_inst
            best_params = params

    if best_model is None:
        raise ValueError("RF_model: no candidate model could be evaluated")

    return best_model, best_params

def RF(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Tune a random-forest regressor on the validation set and score it on the test set.

    Args:
        x_train, y_train: Training features and targets.
        x_validation, y_validation: Data used by ``RF_model`` to pick the
            best configuration.
        x_test, y_test: Held-out data used for the final error.

    Returns:
        Tuple ``(best_RF, best_params, rmse)`` where ``rmse`` is the root
        mean squared error on the test data.
    """
    # Model selection is delegated to RF_model
    model, chosen_params = RF_model(x_train, x_validation, y_train, y_validation)

    # Score the selected model on the held-out test data
    test_predictions = model.predict(x_test)
    test_rmse = np.sqrt(sk.metrics.mean_squared_error(y_test, test_predictions))

    return model, chosen_params, test_rmse
    

# GB

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

def GB_model(x_train, x_validation, y_train, y_validation):
    """Select the gradient-boosting regressor with the lowest validation RMSE.

    Every combination in the search space is fitted on the training data
    and scored on the validation data.

    Args:
        x_train: Training features.
        x_validation: Validation features.
        y_train: Training targets.
        y_validation: Validation targets.

    Returns:
        Tuple ``(best_model, best_params)``: the fitted best model and the
        parameter dict that produced it.
    """
    # Search space explored by the grid
    search_space = {
        "n_estimators": [10, 50, 100],
        "loss": ["squared_error", "quantile", "absolute_error", "huber"],
        "max_depth": [None, 10, 50],
        "learning_rate": [0.1, 0.5, 1.0],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    }

    lowest_rmse = float('inf')

    for candidate in ParameterGrid(search_space):
        booster = GradientBoostingRegressor(**candidate)
        booster.fit(x_train, y_train)
        predictions = booster.predict(x_validation)
        candidate_rmse = np.sqrt(
            sk.metrics.mean_squared_error(y_validation, predictions)
        )

        # Skip candidates that do not strictly improve on the best so far
        if not (candidate_rmse < lowest_rmse):
            continue

        lowest_rmse = candidate_rmse
        winning_model = booster
        winning_params = candidate

    return winning_model, winning_params

def GB(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Tune a gradient-boosting regressor on the validation set and score it on the test set.

    Args:
        x_train, y_train: Training features and targets.
        x_validation, y_validation: Data used by ``GB_model`` to pick the
            best configuration.
        x_test, y_test: Held-out data used for the final error.

    Returns:
        Tuple ``(best_GB, best_params, rmse)`` where ``rmse`` is the root
        mean squared error on the test data.
    """
    # Model selection is delegated to GB_model
    model, chosen_params = GB_model(x_train, x_validation, y_train, y_validation)

    # Score the selected model on the held-out test data
    test_predictions = model.predict(x_test)
    test_rmse = np.sqrt(sk.metrics.mean_squared_error(y_test, test_predictions))

    return model, chosen_params, test_rmse