[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/ANDREWTOLUTAIWO/poor_people_water_consumption/blob/main/PhD%20Thesis%20MLR%20code.ipynb)

### Training, Validation, Testing, Prediction and Evaluation Functions

In [36]:
import numpy as np
import pandas as pd
import math

# Training the model
def training(x1, y1):
    print('TRAINING DATA')
    X_train_prediction = model.predict(x1)
    #Evaluating the trained model
    evaluation(y_train, X_train_prediction)
    
#Validating the model
def validation(x1, y1):
    print('VALIDATION DATA')
    X_valid_prediction = model.predict(x1)
    #Evaluating the validated model
    evaluation(y1, X_valid_prediction)

#Testing the model
def testing(x1, y1):
    print('TESTING DATA')
    X_test_prediction = model.predict(x1)
    #Evaluating the tested model
    evaluation(y1, X_test_prediction)
    

# Predicting with the complete data
def complete_prediction(x1, y1):
    print('COMPLETE DATA')
    X_complete_prediction = model.predict(x1)
    evaluation(y, X_complete_prediction)
    #Convert to dataframe
    pred = pd.DataFrame(X_complete_prediction, columns=['Predicted volume'])
    # Join original data table and pred
    complete_data = pd.DataFrame(pd.concat([data_new, pred], axis=1))
    #Get first 100 records
    data100_without_GSF = complete_data.iloc[:100]
    #print into excel csv file
    data100_without_GSF = data100_without_GSF.to_csv('data100_without_GSF.csv')
    #Complete_Data_With_Prediction = complete_data.to_csv('poor_people_water_data_predicted_february.csv')
    
# Model evaluation with Mean Absolute Error, Root Mean Square Error and Rsquare Score
def evaluation(x, y):
    mae = np.abs(np.subtract(x, np.asarray(y))).mean()
    print('Mean absolute error = ', mae)
    rmse = math.sqrt(np.square(np.subtract(x, y)).mean())
    print('Root mean square error = ', rmse)
    n = (np.subtract(x, x.mean())*np.subtract(y, y.mean())).sum()
    d = math.sqrt(np.square(np.subtract(x, x.mean())).sum()*np.square(np.subtract(y, y.mean())).sum())
    r2s = np.square(n/d)
    print('Rsquare score =', r2s, "\n")

### Pearson Correlation function

In [37]:
# Pearson Correlation
def pearson_correlation(data):
    import matplotlib.pyplot as plt
    import seaborn as sns
    corr = data.corr()
    #Plotting heatmap
    plt.figure(figsize = (10,6))
    return corr
    #sns.heatmap(corr, annot=True)

In [38]:
# One-Hot Encoding
data = pd.read_csv("dry_season_data_without_GSF.csv")

encoded_data = pd.get_dummies(data, columns = ['Gender', 'Method', 'Availability', 'Quality'])

data_new = pd.DataFrame(encoded_data)

data_pearson = data_new.drop(columns=['ID', 
            'Education',
            'Rainfall',
            'Ave temp',
            'Kitchen Sink',
            'ToiletWC',
            'Garden',
            'Car',
            'Gender_male',
            'Gender_female',
            'Method_delivered',
            'Method_borehole',
            'Method_carried',
            'Method_well',
            'Availability_not_often',
            'Availability_often',
            'Quality_poor',
            'Quality_fair',
            'Quality_good',
            'Quality_very good'], axis=1)

# Testing the feature selection code
pearson_correlation(data_pearson)

Unnamed: 0,Household income,Household size,Travel time,Amount spent,Willingness to pay,Volume in lpcd
Household income,1.0,0.333805,-0.118495,0.159673,0.165434,0.147813
Household size,0.333805,1.0,0.189469,-0.261648,-0.116366,-0.298999
Travel time,-0.118495,0.189469,1.0,-0.707457,-0.686565,-0.851267
Amount spent,0.159673,-0.261648,-0.707457,1.0,0.775729,0.903938
Willingness to pay,0.165434,-0.116366,-0.686565,0.775729,1.0,0.69071
Volume in lpcd,0.147813,-0.298999,-0.851267,0.903938,0.69071,1.0


<Figure size 720x432 with 0 Axes>

### Modelling Multilinear Regression with Selected Featuers

In [39]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# One-Hot Encoding
data = pd.read_csv("dry_season_data_without_GSF.csv")

encoded_data = pd.get_dummies(data, columns = ['Gender', 'Method', 'Availability', 'Quality'])

data_new = pd.DataFrame(encoded_data)
# Modeling with Selected Features
X = data_new.drop(columns=['ID',
            'Household income',
            'Education',
            'Rainfall',
            'Kitchen Sink',
            'ToiletWC',
            'Garden',
            'Car',
            'Volume in lpcd',
            'Gender_male',
            'Gender_female',
            'Method_delivered',
            'Availability_not_often',
            'Availability_often',
            'Quality_fair',
            'Quality_good',
            'Quality_very good'], axis=1)

y = data_new['Volume in lpcd']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

# Developing the model with Pipeline that infuses StandardScaler with SDGRegressor
model = Pipeline([('scaler', StandardScaler()), ('sgd', SGDRegressor(max_iter=10000, tol=1e-3))])

# Fitting the model
model.fit(X_train, y_train)


#Testing the MLR code
training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)

TRAINING DATA
Mean absolute error =  7.255551449773989
Root mean square error =  11.390650558960186
Rsquare score = 0.9183084394125011 

VALIDATION DATA
Mean absolute error =  6.922964534894492
Root mean square error =  8.868308816056931
Rsquare score = 0.9430175649288389 

TESTING DATA
Mean absolute error =  6.586496212488465
Root mean square error =  9.223483205704737
Rsquare score = 0.9367061631421686 

COMPLETE DATA
Mean absolute error =  7.155387234557482
Root mean square error =  10.962184567656157
Rsquare score = 0.9220759905312288 



### Modelling Random  Forest with Selected Features

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# One-Hot Encoding
data = pd.read_csv("dry_season_data_without_GSF.csv")

encoded_data = pd.get_dummies(data, columns = ['Gender', 'Method', 'Availability', 'Quality'])

data_new = pd.DataFrame(encoded_data)
# Modeling with Selected Features
X = data_new.drop(columns=['ID',
            'Household income',
            'Education',
            'Rainfall',
            'Kitchen Sink',
            'ToiletWC',
            'Garden',
            'Car',
            'Volume in lpcd',
            'Gender_male',
            'Gender_female',
            'Method_delivered',
            'Availability_not_often',
            'Availability_often',
            'Quality_fair',
            'Quality_good',
            'Quality_very good'], axis=1)

y = data_new['Volume in lpcd']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

#Modeling with Random Forest Regressor
model = RandomForestRegressor(random_state=42, n_jobs=-1, max_depth=5,
                                       n_estimators=100, oob_score=True)

#Fitting the model
model.fit(X_train, y_train)

# checking the oob score
print('Out of bag score = ', model.oob_score_, '\n')


# Testing the MLP code
training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)

Out of bag score =  0.9641282693319799 

TRAINING DATA
Mean absolute error =  2.2552474529815063
Root mean square error =  4.987680612392865
Rsquare score = 0.9843639741912371 

VALIDATION DATA
Mean absolute error =  2.616031648406002
Root mean square error =  5.214332996431043
Rsquare score = 0.9805448659728101 

TESTING DATA
Mean absolute error =  2.33876124976671
Root mean square error =  4.35998846102998
Rsquare score = 0.9862913723628463 

COMPLETE DATA
Mean absolute error =  2.2996772522024607
Root mean square error =  4.951913075851612
Rsquare score = 0.984219521750785 



## Modelling Support Vector Regression with Selected Features

In [25]:
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# One-Hot Encoding
data = pd.read_csv("dry_season_data_without_GSF.csv")

encoded_data = pd.get_dummies(data, columns = ['Gender', 'Method', 'Availability', 'Quality'])

data_new = pd.DataFrame(encoded_data)
# Modeling with Selected Features
X = data_new.drop(columns=['ID',
            'Household income',
            'Education',
            'Rainfall',
            'Kitchen Sink',
            'ToiletWC',
            'Garden',
            'Car',
            'Volume in lpcd',
            'Gender_male',
            'Gender_female',
            'Method_delivered',
            'Availability_not_often',
            'Availability_often',
            'Quality_fair',
            'Quality_good',
            'Quality_very good'], axis=1)

y = data_new['Volume in lpcd']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)


#Modeling with Support Vector Regressor
#model = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
model = Pipeline([('scaler', StandardScaler()), ('svr', LinearSVR(C=1.0, epsilon=0.2))])

#Fitting the model
model.fit(X_train, y_train)


# Testing the MLP code
training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)

TRAINING DATA
Mean absolute error =  6.31192675666225
Root mean square error =  13.21990846129287
Rsquare score = 0.8893282859008069 

VALIDATION DATA
Mean absolute error =  4.927546638894555
Root mean square error =  8.326081941937838
Rsquare score = 0.9387860658989177 

TESTING DATA
Mean absolute error =  5.145149308476594
Root mean square error =  10.546550345033229
Rsquare score = 0.9341454163660883 

COMPLETE DATA
Mean absolute error =  6.056811000066896
Root mean square error =  12.564558100004472
Rsquare score = 0.8979543930125268 



## Modelling Multilayer Perceptron ANN with Selected Features

In [26]:
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# One-Hot Encoding
data = pd.read_csv("dry_season_data_without_GSF.csv")

encoded_data = pd.get_dummies(data, columns = ['Gender', 'Method', 'Availability', 'Quality'])

data_new = pd.DataFrame(encoded_data)
# Modeling with Selected Features
X = data_new.drop(columns=['ID',
            'Household income',
            'Education',
            'Rainfall',
            'Kitchen Sink',
            'ToiletWC',
            'Garden',
            'Car',
            'Volume in lpcd',
            'Gender_male',
            'Gender_female',
            'Method_delivered',
            'Availability_not_often',
            'Availability_often',
            'Quality_fair',
            'Quality_good',
            'Quality_very good'], axis=1)

y = data_new['Volume in lpcd']

# Splitting data_var into training-validation-test set in ratio 80-10-10
# We first split the data into training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8)

# Now we divide the remaining data equally between valid and test
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

model = Pipeline([('scaler', StandardScaler()), ('sgd', MLPRegressor(hidden_layer_sizes=(32,),
                   activation="relu", 
                   solver='adam',
                   learning_rate_init=0.01,
                   random_state=1, 
                   warm_start=True,
                   max_iter=2000))])
   
# Fitting the model
model.fit(X_train, y_train)

# Testing the MLP code
training(X_train, y_train)
validation(X_valid, y_valid)
testing(X_test, y_test)
complete_prediction(X, y)

TRAINING DATA
Mean absolute error =  4.87639974486996
Root mean square error =  7.982866400097984
Rsquare score = 0.9583919568429035 

VALIDATION DATA
Mean absolute error =  5.495508873261042
Root mean square error =  7.2593522517766615
Rsquare score = 0.9688061914235934 

TESTING DATA
Mean absolute error =  5.016408151768464
Root mean square error =  8.262159676077832
Rsquare score = 0.9582222896770582 

COMPLETE DATA
Mean absolute error =  4.95231149839893
Root mean square error =  7.942107562372143
Rsquare score = 0.9591918109556836 

