[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/ANDREWTOLUTAIWO/poor_people_water_consumption/blob/main/PhD%20Thesis%20MLR%20code.ipynb)

### Training, Validation, Testing and Evaluation Functions

In [86]:
import numpy as np
import pandas as pd
import math

# Training the model
def training(x1, y1):
    print('TRAINING DATA')
    X_train_prediction = model.predict(x1)
    #Evaluating the trained model
    evaluation(y_train, X_train_prediction)
    
#Validating the model
def validation(x1, y1):
    print('VALIDATION DATA')
    X_valid_prediction = model.predict(x1)
    #Evaluating the validated model
    evaluation(y1, X_valid_prediction)

#Testing the model
def testing(x1, y1):
    print('TESTING DATA')
    X_test_prediction = model.predict(x1)
    #Evaluating the tested model
    evaluation(y1, X_test_prediction)
    

# Predicting with the complete data
def complete_prediction(x1, y1):
    print('COMPLETE DATA')
    X_complete_prediction = model.predict(x1)
    evaluation(y, X_complete_prediction)
    #Convert to dataframe
    pred = pd.DataFrame(X_complete_prediction, columns=['Predicted volume'])
    # Join original data table and pred
    complete_data = pd.DataFrame(pd.concat([data, pred], axis=1))
    #Get first 100 records
    data100 = complete_data.iloc[:100]
    #print into excel csv file
    karu_trained_result = complete_data.to_csv('mararaba_tested_result_MLR.csv')
    
# Model evaluation with Mean Absolute Error, Root Mean Square Error and Root Meas Square Percentage Error
def evaluation(x, y):
    import sys
    np.set_printoptions(precision=None, threshold=sys.maxsize, edgeitems=7)
    mae = np.abs(np.subtract(x, np.asarray(y))).mean()
    print('MAE = ', mae)
    rmse = math.sqrt(np.square(np.subtract(x, y)).mean())
    print('RMSE = ', rmse)
    n = (np.subtract(x, x.mean())*np.subtract(y, y.mean())).sum()
    d = math.sqrt(np.square(np.subtract(x, x.mean())).sum()*np.square(np.subtract(y, y.mean())).sum())
    r2s = np.ceil(np.square(n/d)*100.0)
    print('Rsquared =', r2s, "\n")


### Normal Equation Function

In [88]:
def normal_equation(X, y): 
    # X = Coefficient matrix, 
    # y = observation vector, or the vector of target variable
       
    from numpy.linalg import inv
    import numpy as np
    import sys
    np.set_printoptions(precision=None, threshold=sys.maxsize, edgeitems=7)
    # set bias term to 1 for each sample and concatenate with A  
    A = np.c_[np.ones((len(X), 1)), X]
    #print("Coefficient Matrix, A: ", "\n", A, "\n")    # Print design marix A
    # Solution to normal equation  # theta = (A.T * X)^(-1) * A.T * y 
    A_transpose = A.T  
    params = inv(A_transpose.dot(A)).dot(A_transpose).dot(y) 
    print('Here are the parameters; intercept is first in the list: ',"\n", params, "\n")
    # test prediction  
    A2 = np.c_[np.ones((len(X), 1)), X]  
    prediction = A2.dot(params) 
    #print("Prediction: ", "\n", prediction, "\n")   # Print predictions
    #Evaluating the output by calling the function Evaluation
    evaluation(y, prediction)

### Modelling Multilinear Regression with Karu Dataset

In [96]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
import pandas as pd

# Read the file
data = pd.read_csv("karu_data2.csv")

# Modeling with Predictor Features
X = data.drop(columns=['ID',
            'Volume'], axis=1)

# Target Feature
y = data['Volume']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

# Developing the model with Pipeline that infuses StandardScaler with SDGRegressor
model = Pipeline([('scaler', StandardScaler()), ('sgd', SGDRegressor(max_iter=10000, tol=1e-3))])

# Fitting the model with training data so that it becomes a trained model
model.fit(X_train, y_train)

#Testing with Karu_dataset
training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)


TRAINING DATA
MAE =  36.994300786460784
RMSE =  100.16007696898056
Rsquared = 97.0 

VALIDATION DATA
MAE =  25.083840010664666
RMSE =  56.09338873155501
Rsquared = 100.0 

TESTING DATA
MAE =  25.161171631039924
RMSE =  60.89491015593286
Rsquared = 99.0 

COMPLETE DATA
MAE =  34.61994179333909
RMSE =  93.33326657186359
Rsquared = 98.0 



## Modelling Multilinear Regression with Nyanya_Mararaba_test

In [98]:
import pandas as pd
import numpy as np

# Read the file
data = pd.read_csv("Nyanya_Mararaba_test2.csv")

# Modeling with Selected Features
X = data.drop(columns=['ID',
            'Volume'], axis=1)

y = data['Volume']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

model.fit(X_train, y_train)

training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)


TRAINING DATA
MAE =  86.3140649381325
RMSE =  110.1233602538128
Rsquared = 52.0 

VALIDATION DATA
MAE =  82.26976916202494
RMSE =  108.54506112310673
Rsquared = 59.0 

TESTING DATA
MAE =  86.0506073025355
RMSE =  108.2448527871488
Rsquared = 58.0 

COMPLETE DATA
MAE =  85.88328959696203
RMSE =  109.77987713504656
Rsquared = 53.0 



### Modelling Random  Forest with Selected Features

In [61]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd

# Read the file
data = pd.read_csv("karu_data2.csv")

# Modeling with Selected Features
X = data.drop(columns=['ID',
            'Volume'], axis=1)

y = data['Volume']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

#Modeling with Random Forest Regressor
model = RandomForestRegressor(random_state=42, n_jobs=-1, max_depth=5,
                                       n_estimators=100, oob_score=True)

#Fitting the model
RF_Model = model.fit(X_train, y_train)

# checking the oob score
print('Out of bag score = ', model.oob_score_, '\n')


# Testing the MLP code
training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)


Out of bag score =  0.9925276081345047 

TRAINING DATA
MAE =  19.516570854215935
RMSE =  43.7170672276976
Rsquared = 100.0 

VALIDATION DATA
MAE =  31.164480908254223
RMSE =  72.81351166753917
Rsquared = 99.0 

TESTING DATA
MAE =  27.875522906090453
RMSE =  84.61757019650723
Rsquared = 99.0 

COMPLETE DATA
MAE =  21.517257064807215
RMSE =  52.67959415489836
Rsquared = 100.0 



### Modelling Random  Forest with Nyanya_Mararaba_test

In [63]:
import pandas as pd

# Read the file
data = pd.read_csv("Nyanya_Mararaba_test2.csv")

# Modeling with Selected Features
X = data.drop(columns=['ID',
            'Volume'], axis=1)

y = data['Volume']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

model.fit(X_train, y_train)

training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)


TRAINING DATA
MAE =  63.62965571834088
RMSE =  83.07506987237005
Rsquared = 73.0 

VALIDATION DATA
MAE =  74.28269301960167
RMSE =  94.24887511750377
Rsquared = 69.0 

TESTING DATA
MAE =  70.33076822337851
RMSE =  92.41188364690142
Rsquared = 64.0 

COMPLETE DATA
MAE =  65.36507069897067
RMSE =  85.22590250298116
Rsquared = 72.0 



## Modelling Support Vector Regression with Selected Features

In [66]:
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Read the file
data = pd.read_csv("karu_data2.csv")

# Modeling with Selected Features
X = data.drop(columns=['ID',
            'Volume'], axis=1)

y = data['Volume']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)


#Modeling with Support Vector Regressor
#model = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
model = Pipeline([('scaler', StandardScaler()), ('svr', LinearSVR(C=1.0, epsilon=0.2))])

#Fitting the model
SVR_Model = model.fit(X_train, y_train)


# Testing the MLP code
training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)

#Testing model with Nyanya_Mararaba_dataset
data2 = pd.read_csv("Nyanya_Mararaba_test2.csv")

# Modeling with Predictor Features
X2 = data2.drop(columns=['ID',
            'Volume'], axis=1)

# Target Feature
y2 = data2['Volume']

#Testing with Nyanya_Marataba_dataset
print("TESTING WITH NYANYA-MARARABA DATASET WHERE THERE IS NO WDN")
testing(X2, y2)

TRAINING DATA
MAE =  275.1868447032512
RMSE =  341.1694713758969
Rsquared = 98.0 

VALIDATION DATA
MAE =  263.5476091413879
RMSE =  324.55102746680035
Rsquared = 99.0 

TESTING DATA
MAE =  264.5231879104702
RMSE =  327.3046465381083
Rsquared = 97.0 

COMPLETE DATA
MAE =  272.956555467787
RMSE =  338.1766648759881
Rsquared = 98.0 

TESTING WITH NYANYA-MARARABA DATASET WHERE THERE IS NO WDN
TESTING DATA
MAE =  1329.5509909406328
RMSE =  1488.2855641630592
Rsquared = 53.0 



## Modelling Support Vector Regression with Nyanya_Mararaba_test

In [68]:
import pandas as pd

# Read the file
data = pd.read_csv("Nyanya_Mararaba_test2.csv")

# Modeling with Selected Features
X = data.drop(columns=['ID',
            'Volume'], axis=1)

y = data['Volume']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

model.fit(X_train, y_train)

training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)

TRAINING DATA
MAE =  93.51842773575494
RMSE =  126.0159537034477
Rsquared = 52.0 

VALIDATION DATA
MAE =  81.99190230973717
RMSE =  110.7623908408303
Rsquared = 54.0 

TESTING DATA
MAE =  102.33170867013213
RMSE =  128.22813298583634
Rsquared = 58.0 

COMPLETE DATA
MAE =  93.24710328659094
RMSE =  124.80021074346257
Rsquared = 53.0 



## Modelling Multilayer Perceptron ANN with Selected Features

In [100]:
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Read the file
data = pd.read_csv("karu_data2.csv")

# Modeling with Selected Features
X = data.drop(columns=['ID',
            'Volume'], axis=1)

y = data['Volume']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

model = Pipeline([('scaler', StandardScaler()), ('sgd', MLPRegressor(hidden_layer_sizes=(32,),
                   activation="relu", 
                   solver='adam',
                   learning_rate_init=0.1,
                   random_state=1, 
                   warm_start=True,
                   max_iter=2000))])
   
# Fitting the model
ANN_Model = model.fit(X_train, y_train)

# Testing the MLP code
training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)

#Calling Normal Equation
print("TRAINING WITH NORMAL EQUATION")
normal_equation(X_train, y_train)
print("VALIDATING WITH NORMAL EQUATION")
normal_equation(X_valid, y_valid)
print("TESTING WITH NORMAL EQUATION")
normal_equation(X_test, y_test)

TRAINING DATA
MAE =  37.7952004252488
RMSE =  96.51036510309063
Rsquared = 98.0 

VALIDATION DATA
MAE =  35.10788581752829
RMSE =  92.91735217378621
Rsquared = 98.0 

TESTING DATA
MAE =  28.027534148417498
RMSE =  61.23991515718835
Rsquared = 99.0 

COMPLETE DATA
MAE =  36.54970233679362
RMSE =  93.21907858485943
Rsquared = 98.0 

TRAINING WITH NORMAL EQUATION
Here are the parameters; intercept is first in the list:  
 [ 9.20546098e+01  8.72676912e-01  1.58182685e-02 -1.27003886e+00
  8.78830959e+00] 

MAE =  35.16629932028874
RMSE =  96.83074255909101
Rsquared = 97.0 

VALIDATING WITH NORMAL EQUATION
Here are the parameters; intercept is first in the list:  
 [-3.05062676e+02  6.77661874e+00 -6.97215057e-02  2.12127544e+00
  8.84334860e+00] 

MAE =  35.52609965307019
RMSE =  90.27756440394964
Rsquared = 98.0 

TESTING WITH NORMAL EQUATION
Here are the parameters; intercept is first in the list:  
 [-1.52131279e+02  1.58167778e+00 -4.73988945e-02  1.53031464e+00
  9.10457869e+00] 

MAE

## Modelling Multilayer Perceptron ANN with Nyanya_Mararaba_test

In [105]:
import pandas as pd

# Read the file
data = pd.read_csv("Nyanya_Mararaba_test2.csv")

# Modeling with Selected Features
X = data.drop(columns=['ID',
            'Volume'], axis=1)

y = data['Volume']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

model = Pipeline([('scaler', StandardScaler()), ('sgd', MLPRegressor(hidden_layer_sizes=(32,),
                   activation="relu", 
                   solver='adam',
                   learning_rate_init=0.1,
                   random_state=1, 
                   warm_start=True,
                   max_iter=2000))])

model.fit(X_train, y_train)

training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)

#Calling Normal Equation
print("TRAINING WITH NORMAL EQUATION")
normal_equation(X_train, y_train)
print("VALIDATING WITH NORMAL EQUATION")
normal_equation(X_valid, y_valid)
print("TESTING WITH NORMAL EQUATION")
normal_equation(X_test, y_test)

TRAINING DATA
MAE =  75.99743735239618
RMSE =  96.70147637670564
Rsquared = 64.0 

VALIDATION DATA
MAE =  72.14901531706879
RMSE =  100.78081141568599
Rsquared = 53.0 

TESTING DATA
MAE =  78.67229576020421
RMSE =  101.13567977220255
Rsquared = 63.0 

COMPLETE DATA
MAE =  75.8800809896442
RMSE =  97.56772106596827
Rsquared = 63.0 

TRAINING WITH NORMAL EQUATION
Here are the parameters; intercept is first in the list:  
 [-5.47561994e+02  1.28519662e+01  3.75884477e-03  3.24735030e+00
  8.27053745e-01] 

MAE =  86.71148593746958
RMSE =  109.60446859673384
Rsquared = 54.0 

VALIDATING WITH NORMAL EQUATION
Here are the parameters; intercept is first in the list:  
 [71.74329864  9.7746765  -0.18060333 -3.40322325  0.8058308 ] 

MAE =  76.47310392102946
RMSE =  103.22469880914628
Rsquared = 49.0 

TESTING WITH NORMAL EQUATION
Here are the parameters; intercept is first in the list:  
 [-3.18971266e+01 -9.38036190e-01 -3.05890918e-02  1.96795568e+00
  8.97940364e-01] 

MAE =  88.87245340224