In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the mock dataset. Only the first rows are previewed so the cell output
# stays small no matter how many rows the CSV holds.
df = pd.read_csv("../data/mockdata.csv")
df.head()

In [None]:
# Encode the housingtype
from sklearn.preprocessing import OrdinalEncoder

# Spell the category order out explicitly. With the default categories="auto"
# the encoder orders the labels lexicographically ("Apartment" < "Condominium"
# < "Executive Flat"), which disagrees with the housing ordering used by the
# regression cells later in this notebook.
housingtype_categories = [
    "1&2-Room Flat",
    "3-Room Flat",
    "4-Room Flat",
    "5-Room Flat",
    "Executive Flat",
    "Condominium",
    "Apartment",
    "Landed Property",
]
encoder = OrdinalEncoder(categories=[housingtype_categories])

# NOTE: this overwrites the string column of `df` in place, so reload `df`
# before re-running this cell (the encoder only knows the string labels).
df[["housingtype"]] = encoder.fit_transform(df[["housingtype"]])
df.describe()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into features (X) and target (y).
# All three score columns are removed from X; exactly one is chosen as the target.
X = df.drop(columns=["quality_of_life", "disaster_preparedness", "retirement_readiness"])
# y = df["quality_of_life"]
# y = df["disaster_preparedness"]
y = df["retirement_readiness"]

# Split the data into training and test sets.
# The seed is fixed (same value the later cells use) so that a
# Restart & Run All reproduces the same split and the same scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Author's rationale: the feature spreads are large, so the features are rescaled;
# standardisation was preferred over min-max normalisation because of outliers.
# The scaler is fitted on the training split only and then applied to the test
# split. StandardScaler ignores `y`, so it is no longer passed in.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# # SVR model.
# from sklearn import svm
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # param_grid = {
# #     "C": np.logspace(-2, 3, num=7),
# #     "gamma": np.logspace(-3, 2, num=6),
# #     'epsilon': np.logspace(-3, 2, num=6)
# # }
# # svr = svm.SVR()
# # grid = GridSearchCV(
# #     svr,
# #     param_grid=param_grid,
# #     scoring="neg_mean_absolute_error"
# # )
# # grid.fit(X_train, y_train)
# # print(grid.best_params_)  # {'C': 3.1622776601683795, 'epsilon': 1.0, 'gamma': 1.0}
# # y_pred = grid.predict(X_test)

# svr = svm.SVR(
#     kernel='rbf', 
#     gamma=1.0, 
#     C=3.1622776601683795, 
#     epsilon=1.0,
# )
# svr.fit(X_train, y_train)
# y_pred = svr.predict(X_test)

# # Calculate MAE
# mae = mean_absolute_error(y_test, y_pred)
# print(f"Mean Absolute Error (MAE): {mae}")

# # Calculate MSE
# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error (MSE): {mse}")

# # Calculate RMSE
# rmse = np.sqrt(mse)
# print(f"Root Mean Squared Error (RMSE): {rmse}")

# # Calculate R-squared
# r2 = r2_score(y_test, y_pred)
# print(f"R-squared (R²): {r2}")

In [None]:
# KNN model
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier

# Fit a 28-neighbour classifier on the scaled training features and
# predict the held-out rows.
knn = KNeighborsClassifier(n_neighbors=28).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# NOTE: despite its name, `accuracy` holds the micro-averaged F1 score here.
accuracy = f1_score(y_test, y_pred, average='micro')
print("F1:", accuracy)

In [None]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train with a raised iteration cap (300), then score plain accuracy
# on the held-out split.
clf = LogisticRegression(max_iter=300).fit(X_train, y_train)
predictions = clf.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy", accuracy)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

# Create the dataset
df = pd.read_csv("../data/mockdata.csv")

# Preprocess the data.
# .copy() gives X its own storage, so encoding the housing column below does
# not write into a slice of `df` (which triggers pandas' SettingWithCopyWarning).
X = df[['age', 'housingtype', 'yearly_income', 'cpf_balance', 'yearly_expenditure', 'savings']].copy()
y = df[['quality_of_life', 'disaster_preparedness', 'retirement_readiness']]

# Encode categorical features using an explicit order for the housing types.
housingtype_categories = ["1&2-Room Flat", "3-Room Flat", "4-Room Flat", "5-Room Flat", "Executive Flat", "Condominium", "Apartment", "Landed Property"]
encoder = OrdinalEncoder(categories=[housingtype_categories])
# Plain column assignment replaces the string column with the numeric codes;
# ravel() flattens the (n, 1) encoder output into a 1-D column.
X['housingtype'] = encoder.fit_transform(X[['housingtype']]).ravel()

# Split the data into training and testing sets (seeded for reproducibility,
# matching the seed used by the later cells).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train one forest that predicts all three targets at once.
# random_state pins the bootstrap/feature sampling so reruns give the same RMSE.
model = RandomForestRegressor(
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=100,
    random_state=42
)
model.fit(X_train, y_train)

# Make predictions: one column per target, in the column order of `y`.
y_pred = model.predict(X_test)

# Evaluate the model (a single RMSE figure across the three targets).
rmse = root_mean_squared_error(y_test, y_pred)
print(f'Root Mean Squared Error: {rmse}')

# Display the predictions vs the actual values
results = pd.DataFrame({'Actual Quality of Life': y_test['quality_of_life'], 'Predicted Quality of Life': y_pred[:, 0],
                        'Actual Disaster Preparedness': y_test['disaster_preparedness'], 'Predicted Disaster Preparedness': y_pred[:, 1],
                        'Actual Retirement Readiness': y_test['retirement_readiness'], 'Predicted Retirement Readiness': y_pred[:, 2]})
results

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load the dataset
df = pd.read_csv("../data/mockdata.csv")

# Preprocess the data.
# .copy() gives X its own storage, so encoding the housing column below does
# not write into a slice of `df` (which triggers pandas' SettingWithCopyWarning).
X = df[['age', 'housingtype', 'yearly_income', 'cpf_balance', 'yearly_expenditure', 'savings']].copy()
y = df[['quality_of_life', 'disaster_preparedness', 'retirement_readiness']]

# encode housingtype because it is a string
housingtype_categories = ["1&2-Room Flat", "3-Room Flat", "4-Room Flat", "5-Room Flat", "Executive Flat", "Condominium", "Apartment", "Landed Property"]
encoder = OrdinalEncoder(categories=[housingtype_categories])
# ravel() flattens the (n, 1) encoder output into a 1-D column.
X['housingtype'] = encoder.fit_transform(X[['housingtype']]).ravel()

# split the data (seeded so the reported scores are reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Author's rationale: feature spreads are large, so the features are rescaled;
# standardisation was preferred over min-max normalisation because of outliers.
# The scaler is fitted on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# GridSearchCV stuff for hyperparameter tuning.
# The full grid below was searched once; the single-point grid that follows
# pins the parameters recorded from that search so reruns stay fast.
# param_grid = {
#     'estimator__C': np.logspace(-2, 1, num=3),
#     'estimator__epsilon': np.logspace(-2, 1, num=3),
#     'estimator__kernel': ['linear', 'poly', 'rbf']
# }
param_grid = {'estimator__C': [0.31622776601683794], 'estimator__epsilon': [0.31622776601683794], 'estimator__kernel': ['rbf']}

# perform GridSearchCV; MultiOutputRegressor fits one SVR per target column
svr = MultiOutputRegressor(SVR())
grid_search = GridSearchCV(svr, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_  # {'estimator__C': 0.31622776601683794, 'estimator__epsilon': 0.31622776601683794, 'estimator__kernel': 'rbf'}
print("Best parameters found: ", best_params)

# GridSearchCV refits the winning configuration on the full training split
# (refit=True is the default), so best_estimator_ is already trained and the
# former extra best_model.fit(...) call was redundant work.
best_model = grid_search.best_estimator_

# get predictions and evaluate (both metrics are aggregated over the three targets)
y_pred = best_model.predict(X_test)
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2 Score: ", r2_score(y_test, y_pred))

# side-by-side view of actual vs predicted values per target
results = pd.DataFrame({
    'Actual Quality of Life': y_test['quality_of_life'],
    'Predicted Quality of Life': y_pred[:, 0],
    'Actual Disaster Preparedness': y_test['disaster_preparedness'],
    'Predicted Disaster Preparedness': y_pred[:, 1],
    'Actual Retirement Readiness': y_test['retirement_readiness'],
    'Predicted Retirement Readiness': y_pred[:, 2]
})
results

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

df = pd.read_csv("../data/mockdata.csv")

# .copy() gives X its own storage, so encoding the housing column below does
# not write into a slice of `df` (which triggers pandas' SettingWithCopyWarning).
X = df[['age', 'housingtype', 'yearly_income', 'cpf_balance', 'yearly_expenditure', 'savings']].copy()
y_quality_of_life = df['quality_of_life']
y_disaster_preparedness = df['disaster_preparedness']
y_retirement_readiness = df['retirement_readiness']

# encode housingtype because it is a string
housingtype_categories = ["1&2-Room Flat", "3-Room Flat", "4-Room Flat", "5-Room Flat", "Executive Flat", "Condominium", "Apartment", "Landed Property"]
encoder = OrdinalEncoder(categories=[housingtype_categories])
# ravel() flattens the (n, 1) encoder output into a 1-D column.
X['housingtype'] = encoder.fit_transform(X[['housingtype']]).ravel()

# split the data: a single call splits the features and all three targets with
# the same row partition (previously three separate calls sharing one seed).
(
    X_train, X_test,
    y_train_quality_of_life, y_test_quality_of_life,
    y_train_disaster_preparedness, y_test_disaster_preparedness,
    y_train_retirement_readiness, y_test_retirement_readiness,
) = train_test_split(
    X,
    y_quality_of_life,
    y_disaster_preparedness,
    y_retirement_readiness,
    test_size=0.2,
    random_state=42,
)

# Author's rationale: feature spreads are large, so the features are rescaled;
# standardisation was preferred over min-max normalisation because of outliers.
# The scaler is fitted on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# GridSearchCV stuff for hyperparameter tuning (3 x 3 x 3 candidate settings)
param_grid = {
    'C': np.logspace(-2, 1, num=3),
    'epsilon': np.logspace(-2, 1, num=3),
    'kernel': ['linear', 'poly', 'rbf']
}

# run the hyperparameter search and hand back the winning SVR
def train_svr(X_train, y_train):
    """Grid-search an SVR for one target and return the best estimator.

    The candidate settings come from the notebook-level ``param_grid``.
    Candidates are ranked by 3-fold cross-validated negative MSE, and the
    winning estimator is printed before being returned.

    Args:
        X_train: training features.
        y_train: training values of a single target.

    Returns:
        The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    search = GridSearchCV(
        SVR(),
        param_grid,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    print(winner)
    return winner

# One independent SVR per target: the three searches run first, in the same
# order as before, so the search logs appear in the same sequence.
best_model_quality_of_life = train_svr(X_train, y_train_quality_of_life)
best_model_disaster_preparedness = train_svr(X_train, y_train_disaster_preparedness)
best_model_retirement_readiness = train_svr(X_train, y_train_retirement_readiness)

# --- quality of life: predict, then score on the held-out split ---
y_pred_quality_of_life = best_model_quality_of_life.predict(X_test)
rmse_quality_of_life = np.sqrt(mean_squared_error(y_test_quality_of_life, y_pred_quality_of_life))
r2_quality_of_life = r2_score(y_test_quality_of_life, y_pred_quality_of_life)

# --- disaster preparedness ---
y_pred_disaster_preparedness = best_model_disaster_preparedness.predict(X_test)
rmse_disaster_preparedness = np.sqrt(mean_squared_error(y_test_disaster_preparedness, y_pred_disaster_preparedness))
r2_disaster_preparedness = r2_score(y_test_disaster_preparedness, y_pred_disaster_preparedness)

# --- retirement readiness ---
y_pred_retirement_readiness = best_model_retirement_readiness.predict(X_test)
rmse_retirement_readiness = np.sqrt(mean_squared_error(y_test_retirement_readiness, y_pred_retirement_readiness))
r2_retirement_readiness = r2_score(y_test_retirement_readiness, y_pred_retirement_readiness)

# report the metrics, one RMSE / R^2 pair per target
print(f'Root Mean Squared Error (Quality of Life): {rmse_quality_of_life}')
print(f'R^2 Score (Quality of Life): {r2_quality_of_life}\n')

print(f'Root Mean Squared Error (Disaster Preparedness): {rmse_disaster_preparedness}')
print(f'R^2 Score (Disaster Preparedness): {r2_disaster_preparedness}\n')

print(f'Root Mean Squared Error (Retirement Readiness): {rmse_retirement_readiness}')
print(f'R^2 Score (Retirement Readiness): {r2_retirement_readiness}\n')

# actual vs predicted values, side by side for each target
results = pd.DataFrame({
    'Actual Quality of Life': y_test_quality_of_life,
    'Predicted Quality of Life': y_pred_quality_of_life,
    'Actual Disaster Preparedness': y_test_disaster_preparedness,
    'Predicted Disaster Preparedness': y_pred_disaster_preparedness,
    'Actual Retirement Readiness': y_test_retirement_readiness,
    'Predicted Retirement Readiness': y_pred_retirement_readiness
})
results


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pickle

df = pd.read_csv("../data/mockdata.csv")

# .copy() gives X its own storage, so encoding the housing column below does
# not write into a slice of `df` (which triggers pandas' SettingWithCopyWarning).
X = df[['age', 'housingtype', 'yearly_income', 'cpf_balance', 'yearly_expenditure', 'savings']].copy()
y_quality_of_life = df['quality_of_life']
y_disaster_preparedness = df['disaster_preparedness']
y_retirement_readiness = df['retirement_readiness']

# Encode the housing type with an explicit category order.
housingtype_categories = ["1&2-Room Flat", "3-Room Flat", "4-Room Flat", "5-Room Flat", "Executive Flat", "Condominium", "Apartment", "Landed Property"]
encoder = OrdinalEncoder(categories=[housingtype_categories])
# ravel() flattens the (n, 1) encoder output into a 1-D column.
X['housingtype'] = encoder.fit_transform(X[['housingtype']]).ravel()

# Expand the six features with all degree-2 terms (squares and pairwise products).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# A single call splits the expanded features and all three targets with the same
# row partition (previously three separate calls sharing one seed).
(
    X_train, X_test,
    y_train_quality_of_life, y_test_quality_of_life,
    y_train_disaster_preparedness, y_test_disaster_preparedness,
    y_train_retirement_readiness, y_test_retirement_readiness,
) = train_test_split(
    X_poly,
    y_quality_of_life,
    y_disaster_preparedness,
    y_retirement_readiness,
    test_size=0.2,
    random_state=42,
)

# Standardise using statistics from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Search space for the random forest.
# max_features='auto' was removed for RandomForestRegressor in scikit-learn 1.3
# and now makes every fit that uses it fail; 1.0 (use all features) is the
# documented equivalent for regressors.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

def train_random_forest(X_train, y_train):
    """Grid-search a random forest for one target and return the best estimator.

    The candidate settings come from the notebook-level ``param_grid``.
    Candidates are ranked by 5-fold cross-validated negative MSE; the forest
    itself is seeded with ``random_state=42``.

    Args:
        X_train: training features.
        y_train: training values of a single target.

    Returns:
        The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    search = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

# Tune one random forest per target (same order as before, so the search
# logs appear in the same sequence).
best_model_quality_of_life = train_random_forest(X_train, y_train_quality_of_life)
best_model_disaster_preparedness = train_random_forest(X_train, y_train_disaster_preparedness)
best_model_retirement_readiness = train_random_forest(X_train, y_train_retirement_readiness)

# Persist the three models together with the fitted preprocessing objects.
# Files are written to the current working directory, in this order.
for pickle_name, artefact in (
    ('model_quality_of_life.pkl', best_model_quality_of_life),
    ('model_disaster_preparedness.pkl', best_model_disaster_preparedness),
    ('model_retirement_readiness.pkl', best_model_retirement_readiness),
    ('scaler.pkl', scaler),
    ('poly.pkl', poly),
    ('encoder.pkl', encoder),
):
    with open(pickle_name, 'wb') as file:
        pickle.dump(artefact, file)

# --- quality of life: predict, then score on the held-out split ---
y_pred_quality_of_life = best_model_quality_of_life.predict(X_test)
rmse_quality_of_life = np.sqrt(mean_squared_error(y_test_quality_of_life, y_pred_quality_of_life))
r2_quality_of_life = r2_score(y_test_quality_of_life, y_pred_quality_of_life)

# --- disaster preparedness ---
y_pred_disaster_preparedness = best_model_disaster_preparedness.predict(X_test)
rmse_disaster_preparedness = np.sqrt(mean_squared_error(y_test_disaster_preparedness, y_pred_disaster_preparedness))
r2_disaster_preparedness = r2_score(y_test_disaster_preparedness, y_pred_disaster_preparedness)

# --- retirement readiness ---
y_pred_retirement_readiness = best_model_retirement_readiness.predict(X_test)
rmse_retirement_readiness = np.sqrt(mean_squared_error(y_test_retirement_readiness, y_pred_retirement_readiness))
r2_retirement_readiness = r2_score(y_test_retirement_readiness, y_pred_retirement_readiness)

# report the metrics, one RMSE / R^2 pair per target
print(f'Root Mean Squared Error (Quality of Life): {rmse_quality_of_life}')
print(f'R^2 Score (Quality of Life): {r2_quality_of_life}')

print(f'Root Mean Squared Error (Disaster Preparedness): {rmse_disaster_preparedness}')
print(f'R^2 Score (Disaster Preparedness): {r2_disaster_preparedness}')

print(f'Root Mean Squared Error (Retirement Readiness): {rmse_retirement_readiness}')
print(f'R^2 Score (Retirement Readiness): {r2_retirement_readiness}')

# actual vs predicted values, side by side for each target
results = pd.DataFrame({
    'Actual Quality of Life': y_test_quality_of_life,
    'Predicted Quality of Life': y_pred_quality_of_life,
    'Actual Disaster Preparedness': y_test_disaster_preparedness,
    'Predicted Disaster Preparedness': y_pred_disaster_preparedness,
    'Actual Retirement Readiness': y_test_retirement_readiness,
    'Predicted Retirement Readiness': y_pred_retirement_readiness
})
results


In [None]:
import os
import pickle

# open(..., "wb") does not create missing folders, so make sure the output
# directory exists before writing anything into it.
os.makedirs("models", exist_ok=True)

# NOTE(review): `scaler`, `encoder` and `poly` are whatever the most recently
# executed cell left in the kernel. On a top-to-bottom run that is the
# polynomial random-forest cell, so they match the three per-target models,
# but NOT `best_model` (aio_model.pkl): that multi-output SVR was trained on
# features scaled by an earlier scaler fitted on the six raw columns.
# Confirm which preprocessing aio_model.pkl is meant to be served with.
artefacts = {
    "models/aio_model.pkl": best_model,
    "models/qol_model.pkl": best_model_quality_of_life,
    "models/d_model.pkl": best_model_disaster_preparedness,
    "models/r_model.pkl": best_model_retirement_readiness,
    "models/scaler.pkl": scaler,
    "models/encoder.pkl": encoder,
    # The per-target models were trained on polynomial features, so the fitted
    # expansion is exported alongside them.
    "models/poly.pkl": poly,
}

for artefact_path, artefact in artefacts.items():
    with open(artefact_path, "wb") as f:
        pickle.dump(artefact, f)