In [1]:
from calendar import c
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from mlr import MLR


Define an evaluation method

In [2]:
def _firstColumn(values):
    """Return `values` as a 1-D float array, keeping only the first column.

    Accepts anything `np.asarray` can convert (lists, ndarrays, pandas
    Series / DataFrames). For inputs with more than one dimension only the
    first entry along every trailing axis is kept, so an (m, 1) column
    becomes a length-m vector.
    """
    arr = np.atleast_1d(np.asarray(values, dtype=float))
    if arr.ndim > 1:
        arr = arr[(slice(None),) + (0,) * (arr.ndim - 1)]
    return arr


def calculateEvalMeasures(pred, targets, verbose = True):
    """Compute regression quality measures for predictions against targets.

    Both inputs are reduced to 1-D float arrays and compared positionally
    (row i of `pred` against row i of `targets`); pandas index / column
    labels are ignored, so mismatched labels cannot introduce NaNs. When an
    input has several columns only the first one is evaluated.

    Parameters
    ----------
    pred : array-like
        Predicted values, shape (m,) or (m, 1).
    targets : array-like
        Observed values, same number of rows as `pred`.
    verbose : bool, default True
        When true, print the four measures.

    Returns
    -------
    tuple
        (r2, mae, mse, rmse)

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of rows.
    """
    predValues = _firstColumn(pred)
    targetValues = _firstColumn(targets)
    if predValues.shape != targetValues.shape:
        raise ValueError(
            "pred and targets must have the same number of rows, got %d and %d"
            % (predValues.size, targetValues.size)
        )
    m = predValues.size
    if m == 0:
        raise ValueError("Cannot evaluate empty predictions")

    residuals = targetValues - predValues
    rss = (residuals ** 2).sum()
    mse = rss / m
    mae = np.abs(residuals).sum() / m
    rmse = np.sqrt(mse)
    tss = ((targetValues - targetValues.mean()) ** 2).sum()
    # NOTE: tss is 0 for constant targets, in which case r2 is inf/nan.
    r2 = 1 - (rss/tss)
    if verbose:
        print("R^2:", r2)
        print("Mean Absolute Error:", mae)
        print("MSE:", mse)
        print("RMSE:", rmse)
    return r2, mae, mse, rmse

Training performance using Autoscaling and Range-scaling

In [3]:
dataset = pd.read_csv("data/Folds5x2_pp.csv")
targetVarName = "PE"

# Fit one model per normalization method on the full dataset and report how
# well each reproduces its own training targets.
for scaling, banner in (
    ("AUTO", "== Training results with Autoscaling =="),
    ("MINMAX", "== Training results with Range-scaling (min-max) =="),
):
    mlr = MLR(dataset, targetVarName, scaling)
    # 0.1 is forwarded to learnParams as-is; presumably a learning rate -- confirm in mlr.MLR.
    pred, params = mlr.learnParams(0.1)
    # Training evaluation
    targets = mlr.target
    print(banner)
    r2, mae, mse, rmse = calculateEvalMeasures(pred, targets, True)

== Training results with Autoscaling ==
R^2: 0.9283692873775184
Mean Absolute Error: 3.6382751908541873
MSE: 20.862579354385677
RMSE: 4.567557263394262
== Training results with Range-scaling (min-max) ==
R^2: 0.9212096264923988
Mean Absolute Error: 3.8122422861293357
MSE: 22.94784401109133
RMSE: 4.79039079941202


Cross-validation performance

In [4]:
def crossValidate(dataset, groupNumber, targetVarName, normMethod, verbose, randomState=None):
    """Run k-fold cross-validation of an MLR model and print the averaged measures.

    The rows of `dataset` are shuffled, split into `groupNumber` contiguous
    folds, and each fold is used once as the test set while an MLR model is
    trained on the remaining rows. Every row belongs to exactly one test
    fold: when the row count is not divisible by `groupNumber`, the first
    `rows % groupNumber` folds receive one extra row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Full data set, including the target column.
    groupNumber : int
        Number of folds; must be at least 2 and at most the number of rows.
    targetVarName : str
        Name of the target column in `dataset`.
    normMethod : str
        Normalization method forwarded to MLR ("AUTO" and "MINMAX" get a
        readable label in the printed output).
    verbose : bool
        When true, also print the measures of every individual fold.
    randomState : optional
        Forwarded to `DataFrame.sample(random_state=...)`; pass an int to
        make the shuffle (and therefore the results) reproducible. The
        default `None` keeps the shuffle unseeded.

    Raises
    ------
    ValueError
        If `groupNumber` is smaller than 2 or larger than the number of rows.
    """
    # Validate before shuffling so bad arguments fail fast.
    if groupNumber <= 1:
        raise ValueError("Set at least 2 cross-validation groups")
    rowCount = dataset.shape[0]
    if groupNumber > rowCount:
        raise ValueError(
            "Cannot split %d rows into %d cross-validation groups" % (rowCount, groupNumber)
        )

    dataset = dataset.sample(frac=1, random_state=randomState).reset_index(drop=True)

    # The label only depends on normMethod, so compute it once.
    strOut = ""
    if normMethod == "AUTO":
        strOut = "Autoscaling"
    elif normMethod == "MINMAX":
        strOut = "Range scaling (min-max)"

    baseSize, remainder = divmod(rowCount, groupNumber)
    sumR2, sumMae, sumMse, sumRmse = 0, 0, 0, 0
    finalRow = 0
    for i in range(groupNumber):
        # Spread the remainder over the first folds so no row is left untested.
        initialRow = finalRow
        finalRow = initialRow + baseSize + (1 if i < remainder else 0)
        testSet = dataset.iloc[initialRow:finalRow].reset_index(drop=True)
        trainingSet = dataset.drop(dataset.index[initialRow:finalRow], axis = 0).reset_index(drop=True)
        testTargets = testSet[[targetVarName]]
        testSet = testSet.drop(targetVarName, axis=1)
        cvMlr = MLR(trainingSet, targetVarName, normMethod)
        # Return value (training predictions, params) is not needed here.
        # 0.1 is presumably a learning rate -- confirm in mlr.MLR.
        cvMlr.learnParams(0.1)
        testPred = cvMlr.test(testSet)
        if verbose:
            print("== Cross-validation results with", strOut, " for group", i,"==")
        r2, mae, mse, rmse = calculateEvalMeasures(testPred, testTargets, verbose)
        sumR2 += r2
        sumMae += mae
        sumMse += mse
        sumRmse += rmse

    # Unweighted mean of the per-fold measures; printed regardless of `verbose`.
    r2 = sumR2 / groupNumber
    mae = sumMae / groupNumber
    mse = sumMse / groupNumber
    rmse = sumRmse / groupNumber
    print("== Cross-validation results (average) with", strOut, "==")
    print("R^2:", r2)
    print("Mean Absolute Error:", mae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    
    
# 10-fold cross-validation for each normalization method; only the averaged
# measures are printed (verbose disabled).
for scalingMethod in ("AUTO", "MINMAX"):
    crossValidate(dataset, 10, targetVarName, scalingMethod, False)

== Cross-validation results (average) with Autoscaling ==
R^2: 0.9274085425596912
Mean Absolute Error: 3.6630715574681334
MSE: 21.131805527517184
RMSE: 4.592558731911547
== Cross-validation results (average) with Range scaling (min-max) ==
R^2: 0.9139552270565888
Mean Absolute Error: 3.914255229731881
MSE: 25.001343660031853
RMSE: 4.99561392392383
