In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the mock survey dataset from the repository's data directory
# (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="../data/mockdata.csv")

# Bare expression so the notebook renders a preview of the frame.
df

Unnamed: 0,age,housingtype,yearly income,cpf balance,yearly expenditure,savings,quality of life,disaster preparedness,retirement readiness
0,39,3-Room Flat,176846,120000,63000,9475,8,3,2
1,27,5-Room,83068,150000,34000,6280,8,2,3
2,53,5-Room,99755,300000,24000,116916,7,9,1
3,25,Landed Property,198879,70000,177000,8050,7,4,5
4,34,4-Room Flat,97753,180000,57000,557,7,5,4
...,...,...,...,...,...,...,...,...,...
4995,49,Apartment,25227,300000,11000,473221,10,10,5
4996,59,1&2-Room Flat,124741,210000,20000,230207,9,7,1
4997,34,1&2-Room Flat,51688,150000,13000,8264,8,5,1
4998,58,Apartment,106048,200000,23000,145291,6,8,4


In [3]:
# Convert the textual "housingtype" column into numeric category codes
# so that it can be fed to the numeric-only model further down.
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()

# Learn the category -> code mapping first, then overwrite the column with the codes.
# The encoder expects a 2-D input, hence the double brackets.
encoder.fit(df[["housingtype"]])
df[["housingtype"]] = encoder.transform(df[["housingtype"]])

# Show the encoded column to verify the result.
df["housingtype"]

0       1.0
1       3.0
2       3.0
3       7.0
4       2.0
       ... 
4995    4.0
4996    0.0
4997    0.0
4998    4.0
4999    0.0
Name: housingtype, Length: 5000, dtype: float64

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column;
# used to compare the scales of the features before modelling.
df.describe()

Unnamed: 0,age,housingtype,yearly income,cpf balance,yearly expenditure,savings,quality of life,disaster preparedness,retirement readiness
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,47.4742,3.5006,112094.8948,263110.0,60965.8,199050.2208,7.0054,6.1618,3.2674
std,13.351097,2.273551,50913.879436,142376.690945,41907.401213,186064.809204,2.235883,2.814108,1.793137
min,25.0,0.0,24021.0,10000.0,10000.0,500.0,1.0,1.0,1.0
25%,36.0,2.0,68346.25,140000.0,26000.0,7217.75,6.0,4.0,2.0
50%,47.0,3.0,111664.0,270000.0,50000.0,171698.5,7.0,7.0,3.0
75%,59.0,5.0,156897.5,380000.0,86000.0,359344.75,9.0,9.0,4.0
max,70.0,7.0,199929.0,500000.0,195000.0,657783.0,10.0,10.0,10.0


In [5]:
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target (y).
# All three survey scores are removed from X so that no target leaks into the features.
X = df.drop(columns=["quality of life", "disaster preparedness", "retirement readiness"])

# Target currently being modelled. To model another score, swap in one of the
# alternatives below.
y = df["quality of life"]
# y = df["disaster preparedness"]
# y = df["retirement readiness"]

# Hold out 20% of the rows for testing. The fixed random_state makes the shuffle
# deterministic, so the split (and therefore every downstream metric and the
# selected hyperparameters) is reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
# The features live on very different scales (e.g. age vs. cpf balance), so they are
# standardised to zero mean and unit variance before being passed to the model.
# Standardisation is used rather than min-max normalisation because the data
# contains outliers.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the mean and standard deviation from the training split only, then apply
# the same transformation to both splits so the test set does not influence the fit.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Fit a support vector regressor, tuning its hyperparameters with a
# cross-validated grid search.
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Log-spaced candidates: C over 1e-2 .. 1e4 (7 values), gamma over 1e-3 .. 1e2 (6 values).
param_grid = dict(
    C=np.logspace(-2, 4, num=7),
    gamma=np.logspace(-3, 2, num=6),
)

svr = svm.SVR()

# The scorer is the negated mean absolute error, so the "best" candidate
# is the one with the lowest MAE.
grid = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
)

# Run the search on the standardised training split.
grid.fit(X_train, y_train)

TypeError: Parameter grid for parameter 'C' needs to be a list or a numpy array, but got 10000.0 (of type float) instead. Single values need to be wrapped in a list with one element.

In [None]:
# Hyperparameter combination that scored best in the grid search
# (scoring is negative MAE, so this is the candidate with the lowest MAE).
grid.best_params_

{'C': 10000.0, 'gamma': 0.01}

In [None]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, accuracy_score

# Predict the target for the held-out test split using the grid search's best estimator.
y_pred = grid.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE:", rmse)

# Mean of the target over the whole dataset, printed as a yardstick for the error sizes.
mean_y = y.mean()
print("Mean of target value:", mean_y)

# accuracy_score is a classification metric and raises ValueError on the continuous
# predictions of a regressor, so it is not used here. As a rough gauge (not a true
# metric), report the RMSE as a percentage of the target mean instead.
relative_rmse_pct = rmse / mean_y * 100
print("Relative RMSE (% of target mean):", relative_rmse_pct)

MAE 1.6167373601819408
RMSE 1.914967263649654
Mean of target value: 7.0054
27.3355877415944
