# Import the Libraries

In [21]:

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import sklearn as sk
from sklearn.model_selection import ParameterGrid

# Data Separation

In [22]:
# Read the raw CSV, then wrap it in the DataFrame used by the rest of the notebook
data = pd.read_csv(filepath_or_buffer="input.csv")
df_data = pd.DataFrame(data=data)

# Preview the first five rows
df_data.head(n=5)

Unnamed: 0,Sex,Lengh,Diameter,Height,W-Height,S-Height,V-Height,S-Height.1,Class
0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,2,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [23]:
from sklearn.model_selection import train_test_split

# Function to create a new train/validation/test split on every run
def data_shufle(df_data, random_state=None):
    """Shuffle the dataset and split it into train/validation/test subsets.

    The rows are shuffled, the "Class" column is separated out as the target,
    and the remaining columns are split 50% train / 25% validation / 25% test.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Full dataset; must contain a "Class" column, which is used as target.
    random_state : int, RandomState instance or None, default=None
        Forwarded to the shuffle and to both splits. Leave as None to get a
        different split on every call (the original behaviour); pass an int
        to make the split reproducible.

    Returns
    -------
    tuple
        (x_train, x_validation, x_test, y_train, y_validation, y_test)
    """
    # Shuffle the data
    shuffled = sk.utils.shuffle(df_data, random_state=random_state)
    features = shuffled.drop(columns=["Class"])
    target = shuffled["Class"]

    # First split: half of the rows for training, half held out
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=0.5, random_state=random_state
    )
    # Second split: the held-out half is divided evenly into validation and test
    x_validation, x_test, y_validation, y_test = train_test_split(
        x_holdout, y_holdout, test_size=0.5, random_state=random_state
    )

    return x_train, x_validation, x_test, y_train, y_validation, y_test

# KNR

In [24]:
from sklearn.neighbors import KNeighborsRegressor

def KNR_model(x_train, x_validation, y_train, y_validation):
    """Select the k-nearest-neighbours regressor with the lowest validation RMSE.

    Every combination of k in 1..49 and weighting scheme ("uniform",
    "distance") is fitted on the training data and scored on the validation
    data.

    Returns
    -------
    tuple
        (fitted best model, its k, its weighting scheme)
    """
    weight_schemes = ("uniform", "distance")
    # k varies slowest, the weighting scheme fastest
    candidates = ((n, scheme) for n in range(1, 50) for scheme in weight_schemes)

    lowest_rmse = float('inf')
    for n, scheme in candidates:
        regressor = KNeighborsRegressor(n_neighbors=n, weights=scheme)
        regressor.fit(x_train, y_train)
        predicted = regressor.predict(x_validation)
        score = np.sqrt(sk.metrics.mean_squared_error(y_validation, predicted))

        # Strict comparison: on ties the earliest candidate is kept
        if not score < lowest_rmse:
            continue
        lowest_rmse = score
        best_KNR, best_k, best_dist = regressor, n, scheme

    return best_KNR, best_k, best_dist


def KNR(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Tune a k-nearest-neighbours regressor and score it on the test split.

    Returns
    -------
    tuple
        (best model, best k, best weighting scheme, test RMSE)
    """
    # Model selection uses only the training and validation data
    selected = KNR_model(x_train, x_validation, y_train, y_validation)
    best_KNR, best_k, best_dist = selected

    # The test split is used once, for the final error estimate
    test_error = sk.metrics.mean_squared_error(y_test, best_KNR.predict(x_test))

    return best_KNR, best_k, best_dist, np.sqrt(test_error)
    
    
     

# SVR

In [25]:
from sklearn.svm import SVR

def SVM_model(x_train, x_validation, y_train, y_validation):
    """Select the support-vector regressor with the lowest validation RMSE.

    Every kernel / C combination of the grid below is fitted on the training
    data and scored on the validation data.

    Returns
    -------
    tuple
        (fitted best model, dict with its "kernel" and "C" settings)
    """
    search_space = ParameterGrid({
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "C": [0.1, 1, 10, 100, 1000],
    })

    lowest_rmse = float('inf')
    for config in search_space:
        candidate = SVR(**config)
        candidate.fit(x_train, y_train)
        predicted = candidate.predict(x_validation)
        score = np.sqrt(sk.metrics.mean_squared_error(y_validation, predicted))

        # Strict comparison: on ties the earliest configuration is kept
        if score < lowest_rmse:
            lowest_rmse, best_params, best_SVM = score, config, candidate

    return best_SVM, best_params

def SVM(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Tune a support-vector regressor and score it on the test split.

    Returns
    -------
    tuple
        (best model, kernel, C, test RMSE)
    """
    # Create the model
    best_SVM, best_params = SVM_model(x_train, x_validation, y_train, y_validation)

    # Predict the test data
    svm_test = best_SVM.predict(x_test)

    # Calculate the error
    mse = sk.metrics.mean_squared_error(y_test, svm_test)
    rmse = np.sqrt(mse)

    # best_params is a dict: unpacking it with `*` would yield its keys (the
    # parameter names). Return the values instead, in the column order of the
    # SVM_params results table ("kernel", "C").
    reported = ("kernel", "C")
    return (best_SVM, *(best_params[name] for name in reported), rmse)

# MLP

In [26]:
from sklearn.neural_network import MLPRegressor

def MLP_model(x_train, x_validation, y_train, y_validation):
    """Select the multi-layer-perceptron regressor with the lowest validation RMSE.

    Every combination of the grid below is fitted on the training data and
    scored on the validation data.

    Returns
    -------
    tuple
        (fitted best model, dict with its hyper-parameter settings)
    """
    search_space = ParameterGrid({
        "hidden_layer_sizes": [(100, 50, 25), (21, 14, 1), (100, 100, 100)],
        "activation": ["identity", "logistic", "tanh", "relu"],
        "max_iter": [1000, 2000],
        "learning_rate": ["constant", "invscaling", "adaptive"],
    })

    lowest_rmse = float('inf')
    for config in search_space:
        candidate = MLPRegressor(**config)
        candidate.fit(x_train, y_train)
        predicted = candidate.predict(x_validation)
        score = np.sqrt(sk.metrics.mean_squared_error(y_validation, predicted))

        # Strict comparison: on ties the earliest configuration is kept
        if score < lowest_rmse:
            lowest_rmse, best_params, best_MLP = score, config, candidate

    return best_MLP, best_params


def MLP(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Tune a multi-layer-perceptron regressor and score it on the test split.

    Returns
    -------
    tuple
        (best model, hidden_layer_sizes, activation, max_iter, learning_rate,
        test RMSE)
    """
    # Create the model
    best_MLP, best_params = MLP_model(x_train, x_validation, y_train, y_validation)

    # Predict the data
    mlp_test = best_MLP.predict(x_test)

    # Calculate the error
    mse = sk.metrics.mean_squared_error(y_test, mlp_test)
    rmse = np.sqrt(mse)

    # best_params is a dict: unpacking it with `*` would yield its keys (the
    # parameter names). Return the values instead, in the column order of the
    # MLP_params results table.
    reported = ("hidden_layer_sizes", "activation", "max_iter", "learning_rate")
    return (best_MLP, *(best_params[name] for name in reported), rmse)

# RF

In [27]:
from sklearn.ensemble import RandomForestRegressor

def RF_model(x_train, x_validation, y_train, y_validation):
    """Select the random-forest regressor with the lowest validation RMSE.

    Every combination of the grid below is fitted on the training data and
    scored on the validation data.

    Returns
    -------
    tuple
        (fitted best model, dict with its hyper-parameter settings)
    """
    search_space = ParameterGrid({
        "n_estimators": [10, 50],
        "criterion": ["friedman_mse", "absolute_error", "squared_error"],
        "max_depth": [None, 10],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
    })

    lowest_rmse = float('inf')
    for config in search_space:
        candidate = RandomForestRegressor(**config)
        candidate.fit(x_train, y_train)
        predicted = candidate.predict(x_validation)
        score = np.sqrt(sk.metrics.mean_squared_error(y_validation, predicted))

        # Strict comparison: on ties the earliest configuration is kept
        if score < lowest_rmse:
            lowest_rmse, best_model, best_params = score, candidate, config

    return best_model, best_params

def RF(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Tune a random-forest regressor and score it on the test split.

    Returns
    -------
    tuple
        (best model, n_estimators, criterion, max_depth, min_samples_split,
        min_samples_leaf, test RMSE)
    """
    # Create the model
    best_RF, best_params = RF_model(x_train, x_validation, y_train, y_validation)

    # Predict the data
    rf_test = best_RF.predict(x_test)

    # Calculate the error
    mse = sk.metrics.mean_squared_error(y_test, rf_test)
    rmse = np.sqrt(mse)

    # best_params is a dict: unpacking it with `*` would yield its keys (the
    # parameter names). Return the values instead, in the column order of the
    # RF_params results table.
    reported = ("n_estimators", "criterion", "max_depth",
                "min_samples_split", "min_samples_leaf")
    return (best_RF, *(best_params[name] for name in reported), rmse)
    

# GB

In [28]:
from sklearn.ensemble import GradientBoostingRegressor

def GB_model(x_train, x_validation, y_train, y_validation):
    """Select the gradient-boosting regressor with the lowest validation RMSE.

    Every combination of the grid below is fitted on the training data and
    scored on the validation data.

    Returns
    -------
    tuple
        (fitted best model, dict with its hyper-parameter settings)
    """
    search_space = ParameterGrid({
        "n_estimators": [10, 50],
        "loss": ["squared_error", "quantile", "absolute_error", "huber"],
        "max_depth": [None, 10],
        "learning_rate": [0.1, 0.5],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
    })

    lowest_rmse = float('inf')
    for config in search_space:
        candidate = GradientBoostingRegressor(**config)
        candidate.fit(x_train, y_train)
        predicted = candidate.predict(x_validation)
        score = np.sqrt(sk.metrics.mean_squared_error(y_validation, predicted))

        # Strict comparison: on ties the earliest configuration is kept
        if score < lowest_rmse:
            lowest_rmse, best_model, best_params = score, candidate, config

    return best_model, best_params

def GB(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Tune a gradient-boosting regressor and score it on the test split.

    Returns
    -------
    tuple
        (best model, n_estimators, loss, max_depth, learning_rate,
        min_samples_split, min_samples_leaf, test RMSE)
    """
    # Create the model
    best_GB, best_params = GB_model(x_train, x_validation, y_train, y_validation)

    # Predict the data
    gb_test = best_GB.predict(x_test)

    # Calculate the error
    mse = sk.metrics.mean_squared_error(y_test, gb_test)
    rmse = np.sqrt(mse)

    # best_params is a dict: unpacking it with `*` would yield its keys (the
    # parameter names). Return the values instead, in the column order of the
    # GB_params results table.
    reported = ("n_estimators", "loss", "max_depth", "learning_rate",
                "min_samples_split", "min_samples_leaf")
    return (best_GB, *(best_params[name] for name in reported), rmse)

# MLR

In [29]:
from sklearn.linear_model import LinearRegression

def MLR(x_train, x_validation, x_test, y_train, y_validation, y_test):
    """Fit a multiple linear regression and score it on the test split.

    The model has no hyper-parameters to tune, so the validation split is
    accepted only to keep the same signature as the other model runners and
    is not used. The error is measured on the test split so that it is
    comparable with the test RMSE reported for the other models.

    Returns
    -------
    float
        Test RMSE of the fitted model.
    """
    # Create the model
    mlr_inst = LinearRegression()
    mlr_inst.fit(x_train, y_train)

    # Predict the held-out test data (not the validation data)
    mlr_test = mlr_inst.predict(x_test)

    # Calculate the error
    rmse = np.sqrt(np.mean((mlr_test - y_test) ** 2))

    return rmse

# Main

In [30]:
import os

# Number of independent shuffle / tune / evaluate repetitions
N_RUNS = 20

# The parameter CSVs are written into "data/"; create it up front so that a
# missing directory cannot discard the results after the long-running loop.
os.makedirs("data", exist_ok=True)

# Per-run records; the DataFrames are built once after the loop instead of
# being grown row by row.
rmse_rows = []
KNR_rows, SVM_rows, MLP_rows, RF_rows, GB_rows = [], [], [], [], []

# Run every model on a fresh split, N_RUNS times
for i in range(N_RUNS):

    # Create the sets
    data_shufled = data_shufle(df_data)

    # Run the models and save the selected hyper-parameters
    best_KNR, *best_params_KNR, rmse_KNR = KNR(*data_shufled)
    KNR_rows.append(best_params_KNR)

    best_SVM, *best_params_SVM, rmse_SVM = SVM(*data_shufled)
    SVM_rows.append(best_params_SVM)

    best_MLP, *best_params_MLP, rmse_MLP = MLP(*data_shufled)
    MLP_rows.append(best_params_MLP)

    best_RF, *best_params_RF, rmse_RF = RF(*data_shufled)
    RF_rows.append(best_params_RF)

    best_GB, *best_params_GB, rmse_GB = GB(*data_shufled)
    GB_rows.append(best_params_GB)

    rmse_MLR = MLR(*data_shufled)

    # Show the RMSE of each model for this run
    print("============Run: ", i, "============")
    print("KNN: ", rmse_KNR)
    print("SVM: ", rmse_SVM)
    print("MLP: ", rmse_MLP)
    print("RF: ", rmse_RF)
    print("GB: ", rmse_GB)
    print("MLR: ", rmse_MLR)
    print("================================")

    # Save the results
    rmse_rows.append([rmse_KNR, rmse_SVM, rmse_MLP, rmse_RF, rmse_GB, rmse_MLR])


# Create the output dataframes
output = pd.DataFrame(
    rmse_rows,
    columns=["KNN", "SVM", "MLP", "RF", "GB", "MLR"])

KNR_params = pd.DataFrame(
    KNR_rows,
    columns=["k",
             "dist"])

SVM_params = pd.DataFrame(
    SVM_rows,
    columns=["kernel",
             "C"])

MLP_params = pd.DataFrame(
    MLP_rows,
    columns=["hidden_layer_sizes",
             "activation",
             "max_iter",
             "learning_rate"])

RF_params = pd.DataFrame(
    RF_rows,
    columns=["n_estimators",
             "criterion",
             "max_depth",
             "min_samples_split",
             "min_samples_leaf"])

GB_params = pd.DataFrame(
    GB_rows,
    columns=["n_estimators",
             "loss",
             "max_depth",
             "learning_rate",
             "min_samples_split",
             "min_samples_leaf"])

# Generate the output
output.to_csv("output.csv")

# Generate the parameters
KNR_params.to_csv("data/KNR_params.csv")
SVM_params.to_csv("data/SVM_params.csv")
MLP_params.to_csv("data/MLP_params.csv")
RF_params.to_csv("data/RF_params.csv")
GB_params.to_csv("data/GB_params.csv")
  
    

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(p

KNN:  2.167271327839433
SVM:  2.1240922886554756
MLP:  2.0393498204899414
RF:  2.1351930316242256
GB:  2.128748282828197
MLR:  2.206463405245315
