In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import QuantileRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.feature_selection import r_regression
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error

In [2]:
eng_in = pd.read_csv("engineInputs.csv")
eng_in.head()

Unnamed: 0,In1,In2
0,17.5,576.2
1,35.9,598.6
2,31.4,612.1
3,23.7,624.2
4,20.2,635.2


In [3]:
eng_tar = pd.read_csv("engineTargets.csv")
eng_tar.head()

Unnamed: 0,T1,T2
0,57.6,848
1,24.8,905
2,22.3,578
3,15.6,382
4,16.5,298


In [4]:
eng_in.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1199 entries, 0 to 1198
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   In1     1199 non-null   float64
 1   In2     1199 non-null   float64
dtypes: float64(2)
memory usage: 18.9 KB


In [5]:
eng_in.isna().sum()

In1    0
In2    0
dtype: int64

In [6]:
eng_tar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1199 entries, 0 to 1198
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   T1      1199 non-null   float64
 1   T2      1199 non-null   int64  
dtypes: float64(1), int64(1)
memory usage: 18.9 KB


In [7]:
eng_tar.isna().sum()

T1    0
T2    0
dtype: int64

In [8]:
engine = pd.concat([eng_in.reset_index(drop=True),eng_tar.reset_index(drop=True)],axis=1)
engine.head()

Unnamed: 0,In1,In2,T1,T2
0,17.5,576.2,57.6,848
1,35.9,598.6,24.8,905
2,31.4,612.1,22.3,578
3,23.7,624.2,15.6,382
4,20.2,635.2,16.5,298


In [9]:
X = engine.drop(['T1', 'T2'], axis=1).values
y1 = ((engine['T1']).values).reshape(-1, 1)
y2 = ((engine['T2']).values).reshape(-1, 1)

scaler = StandardScaler()
y1 = scaler.fit_transform(y1)
y2 = scaler.fit_transform(y2)

In [10]:
modelos = [('QR',QuantileRegressor()),
           ('Decision Tree',DecisionTreeRegressor()),
           ('KNN',KNeighborsRegressor())]

In [11]:
resultado_R = []
resultado_R2 = []
resultado_SSE = []
resultado_MAE = []
resultado_MSE = []
resultado_RMSE = []
resultado_RMSLE = []

In [12]:
#Hold-Out (60/40)
X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.40)

for nombre,modelo in modelos:
    modelo.fit(X_train,y_train)
    y_pred = modelo.predict(X_test)
    sse = ((y_test - y_pred)**2).sum
    try:
        RMSLE = mean_squared_log_error(y_test,y_pred,squared=False)
    except:
        RMSLE = 1.0
    
    #R (Pearson correlation coefficient)
    resultado_R.append((nombre,r_regression(y_test,y_pred)))
    #R2 (coefficient of determination)
    resultado_R2.append((nombre,r_regression(y_test,y_pred)**2))
    #SSE (sum of squared errors)
    resultado_SSE.append((nombre,np.sum((y_test-y_pred)**2)))
    #MAE (mean absolute error)
    resultado_MAE.append((nombre,mean_absolute_error(y_test,y_pred)))
    #MSE (mean squared error)
    resultado_MSE.append((nombre,mean_squared_error(y_test,y_pred)))
    resultado_RMSE.append((nombre,mean_squared_error(y_test,y_pred,squared=False)))
    resultado_RMSLE.append((nombre,RMSLE))

resultado_R.sort(key=lambda k:k[1], reverse=True)
resultado_R2.sort(key=lambda k:k[1], reverse=True)
#resultado_SSE.sort(key=lambda k:k[1], reverse=True)
resultado_MAE.sort(key=lambda k:k[1], reverse=True)
resultado_MSE.sort(key=lambda k:k[1], reverse=True)
resultado_RMSE.sort(key=lambda k:k[1], reverse=True)
resultado_RMSLE.sort(key=lambda k:k[1], reverse=True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [13]:
print("R (Pearson correlation coefficient)")
resultado_R

R (Pearson correlation coefficient)


[('KNN', array([0.99492462])),
 ('Decision Tree', array([0.99051192])),
 ('QR', array([0.76195903]))]

In [14]:
print("R2 (coefficient of determination)")
resultado_R2

R2 (coefficient of determination)


[('KNN', array([0.989875])),
 ('Decision Tree', array([0.98111386])),
 ('QR', array([0.58058157]))]

In [15]:
print("SSE (sum of squared errors)")
resultado_SSE

SSE (sum of squared errors)


[('QR', 369532.2782280231),
 ('Decision Tree', 484604.36541516613),
 ('KNN', 5.327679923917815)]

In [16]:
print("MAE (mean absolute error)")
resultado_MAE

MAE (mean absolute error)


[('QR', 0.5466491401268222),
 ('Decision Tree', 0.07764868236580846),
 ('KNN', 0.05286513309613191)]

In [17]:
print("MSE (mean squared error)")
resultado_MSE

MSE (mean squared error)


[('QR', 0.43530032287055614),
 ('Decision Tree', 0.02026052626035389),
 ('KNN', 0.011099333174828781)]

In [18]:
print("RMSE (root mean squared error)")
resultado_RMSE

RMSE (root mean squared error)


[('QR', 0.6597729328114),
 ('Decision Tree', 0.1423394754112642),
 ('KNN', 0.10535337286878281)]

In [19]:
print("RMSLE (Root Mean Squared Logaritmic Error)")
resultado_RMSLE

RMSLE (Root Mean Squared Logaritmic Error)


[('QR', 1.0), ('Decision Tree', 1.0), ('KNN', 1.0)]

In [20]:
resultado_R.clear()
resultado_R2.clear()
resultado_SSE.clear()
resultado_MAE.clear()
resultado_MSE.clear()
resultado_RMSE.clear()
resultado_RMSLE.clear()