# 2D Design Project

<b>Problem Statement</b>: We wish to predict Singapore's GDP growth amidst COVID-19 based on various factors. By comparing the predicted growth rate with the actual growth rate, we can determine the effectiveness of Singapore's coping strategies against COVID-19.

Factors/Variables to consider (from most to least important):
- Time/date (time series data)
- Vaccination rate
- Daily active cases
- Hospitalised
- Recovered
- Government grants/funding
- Phases (circuit breaker, phase 1 etc.)

Predict : 
- Growth rate of GDP

## Data Pre Processing

In [3]:
import pandas as pd
import numpy as np 

df = pd.read_csv('./output/processed_data.csv')

In [4]:
df.head()

Unnamed: 0,Date,Still Hospitalised,7 days Moving Average,Percentage Vaccinated,STI Price,Phase_Phase 2 (Heightened Alert),Phase_Preparatory Stage,Phase_Stabilisation Phase
0,738004,572,111.0,0.758562,3176.42,1,0,0
1,738005,562,102.0,0.759976,3149.25,1,0,0
2,738006,525,95.0,0.761512,3154.6,1,0,0
3,738007,547,98.0,0.76282,3186.65,1,0,0
4,738008,516,97.0,0.764014,3175.0,1,0,0


In [13]:
feature_names = ['Date','Still Hospitalised','7 days Moving Average','Percentage Vaccinated','Phase_Phase 2 (Heightened Alert)','Phase_Preparatory Stage','Phase_Stabilisation Phase']
target_name = ["STI Price"]
X = df.loc[:,feature_names]
y = df.loc[:,target_name]

# Modeling with Linear Regression

In [16]:
def CostFunction(x,y,w,b):
    cost = np.sum((((x.dot(w) + b) - y) ** 2) / (2*len(y)))
    return cost

def GradientDescent(x, y, w, b, learning_rate, epochs):
    cost_list = [0] * epochs
   
    for epoch in range(epochs):
        z = x.dot(w) + b
        loss = z - y
        
        weight_gradient = x.T.dot(loss) / len(y)
        bias_gradient = np.sum(loss) / len(y)
        
        w = w - learning_rate*weight_gradient
        b = b - learning_rate*bias_gradient
  
        cost = CostFunction(x, y, w, b)
        cost_list[epoch] = cost
        
        if (epoch%(epochs/10)==0):
            print("Cost is:",cost)
        
    return w, b, cost_list

def predict(X, w, b):
    return X.dot(w) + b

def r2score(y_pred, y):
    rss = np.sum((y_pred - y) ** 2)
    tss = np.sum((y-y.mean()) ** 2)
    
    r2 = 1 - (rss / tss)
    return r2

def train_test_split(df_feature, df_target, random_state=None, test_size=0.5):
    np.random.seed(random_state)
    N = df_feature.shape[0]
    sample = int(test_size*N)
    train_idx = np.random.choice(N, sample,replace=False)
    
    df_feature_train = df_feature.iloc[train_idx]
    df_target_train = df_target.iloc[train_idx]

    test_idx = [idx for idx in range(N) if idx not in train_idx]
    
    df_feature_test = df_feature.iloc[test_idx]
    df_target_test = df_target.iloc[test_idx]

    return df_feature_train, df_feature_test, df_target_train, df_target_test

In [15]:
def standard_scaling(df):
    dfout = df.apply(lambda x: (x - np.mean(x)) / np.std(x))
    return dfout

def min_max_scaling(df):
    return df.apply(lambda x: (x - x.min()) / (x.max() - x.min()))

def normalization(df):
    return df.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))

In [45]:
def gradient_descent(X, y, theta, alpha, iterations):
  """
  Compute cost for linear regression.

  Input Parameters
  ----------------
  X : 2D array where each row represent the training example and each column represent the feature ndarray. Dimension(m x n)
      m= number of training examples
      n= number of features (including X_0 column of ones)
  y : 1D array of labels/target value for each traing example. dimension(m x 1)
  theta : 1D array of fitting parameters or weights. Dimension (1 x n)
  alpha : Learning rate. Scalar value
  iterations: No of iterations. Scalar value. 

  Output Parameters
  -----------------
  theta : Final Value. 1D array of fitting parameters or weights. Dimension (1 x n)
  cost_history: Conatins value of cost for each iteration. 1D array. Dimansion(m x 1)
  """
  cost_history = np.zeros(iterations)

  for i in range(iterations):
    predictions = X.dot(theta)
    #print('predictions= ', predictions[:5])
    errors = np.subtract(predictions, y)
    #print('errors= ', errors[:5])
    sum_delta = (alpha / m) * X.transpose().dot(errors);
    #print('sum_delta= ', sum_delta[:5])
    theta = theta - sum_delta;

    cost_history[i] = compute_cost(X, y, theta)  

  return theta, cost_history

In [44]:
def compute_cost(X, y, theta):
  """
  Compute the cost of a particular choice of theta for linear regression.

  Input Parameters
  ----------------
  X : 2D array where each row represent the training example and each column represent the feature ndarray. Dimension(m x n)
      m= number of training examples
      n= number of features (including X_0 column of ones)
  y : 1D array of labels/target value for each traing example. dimension(1 x m)

  theta : 1D array of fitting parameters or weights. Dimension (1 x n)

  Output Parameters
  -----------------
  J : Scalar value.
  """
  predictions = X.dot(theta)
  #print('predictions= ', predictions[:5])
  errors = np.subtract(predictions, y)
  #print('errors= ', errors[:5]) 
  sqrErrors = np.square(errors)
  #print('sqrErrors= ', sqrErrors[:5]) 
  #J = 1 / (2 * m) * np.sum(sqrErrors)
  # OR
  # We can merge 'square' and 'sum' into one by taking the transpose of matrix 'errors' and taking dot product with itself
  # If your confuse about this try to do this with few values for better understanding  
  J = 1/(2 * m) * errors.T.dot(errors)

  return J

In [None]:
import matplotlib.pyplot as plt
plt.plot(range(1, iterations +1), cost_history, color ='blue')
plt.rcParams["figure.figsize"] = (10,6)
plt.grid()
plt.xlabel("Number of iterations")
plt.ylabel("cost (J)")
plt.title("Convergence of gradient descent")

In [43]:
# from sklearn.model_selection import train_test_split
# # Dividing the data into training and testing data
# X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size = 0.25)

# # We need theta parameter for every input variable. since we have three input variable including X_0 (column of ones)
# theta = np.zeros(3)
# iterations = 400;
# alpha = 0.15;

# # call the gradient descent function to get the finalised weights and bias (model training)
# w, b, c= GradientDescent(X_train, y_train, np.zeros(X_train.shape[1]), 0, 0.002,100)
# plt.plot(c)

# y_pred = predict(X_test, w, b)

# r2score(y_pred, y_test)

TypeError: GradientDescent() takes 5 positional arguments but 6 were given

In [33]:
np.zeros((X_train.shape[0],1))

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

## Optimization


In [None]:
# Building the optimal model using Backward Elimination

import statsmodels.api as sm
X = np.append(arr = np.ones((59, 1)).astype(int), values = X, axis = 1)
# X = np.append(arr = np.ones((59, 1)).astype('float64'), values = X, axis = 1)

X_Optimal = X[:, [0,1,2,3,4,5]]
X_Optimal = np.array(X_Optimal, dtype=float)
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
regressor_OLS.summary()

X_Optimal = X[:, [0,1,2,4,5]]
X_Optimal = np.array(X_Optimal, dtype=float)
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
regressor_OLS.summary()

X_Optimal = X[:, [0,1,4,5]]
X_Optimal = np.array(X_Optimal, dtype=float)
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
regressor_OLS.summary()

X_Optimal = X[:, [0,1,4]]
X_Optimal = np.array(X_Optimal, dtype=float)
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
regressor_OLS.summary()

# Fitting the Multiple Linear Regression in the Optimal Training set

X_Optimal_Train, X_Optimal_Test = train_test_split(X_Optimal,test_size = 0.2, random_state = 0)
regressor.fit(X_Optimal_Train, Y_Train)

# Predicting the Optimal Test set results

Y_Optimal_Pred = regressor.predict(X_Optimal_Test)

In [None]:
# X = merged_df.iloc[:, :-1].values # selects all the columns excluding STI price

# Evaluating the model


In [None]:
# optimized with bw elimation
X_Optimal = X[:, [0,1,2,3,4,5]]
X_Optimal = np.array(X_Optimal, dtype=float)
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.529
Model:                            OLS   Adj. R-squared:                  0.485
Method:                 Least Squares   F-statistic:                     11.93
Date:                Tue, 26 Oct 2021   Prob (F-statistic):           9.36e-08
Time:                        12:56:52   Log-Likelihood:                -287.59
No. Observations:                  59   AIC:                             587.2
Df Residuals:                      53   BIC:                             599.7
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        2.38e+06   5.89e+05      4.044      0.0

In [None]:
Y_Pred

array([3087.27157853, 3063.1365116 , 3089.03177691, 3085.2729684 ,
       3109.35837855, 3166.32833225, 3066.88064559, 3104.94332752,
       3076.4737427 , 3086.97484616, 3151.43215778, 3116.52180025])

In [None]:
# importing r2_score module
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


# predicting the accuracy score
score=r2_score(Y_Test,Y_Pred)
print(f"r2 score is {score}")
print(f"mean_sqrd_error is == {mean_squared_error(Y_Test,Y_Pred)}")
print(f"root_mean_squared error of is == {np.sqrt(mean_squared_error(Y_Test,Y_Pred))}")


# After Optimization with BE
print("============================")
print("After Optimization with Backwards Elimination")
# predicting the accuracy score
score=r2_score(Y_Test,Y_Optimal_Pred)
print(f"r2 score is {score}")
print(f"mean_sqrd_error is == {mean_squared_error(Y_Test,Y_Pred)}")
print(f"root_mean_squared error of is == {np.sqrt(mean_squared_error(Y_Test,Y_Pred))}")

r2 score is 0.7798496442687198
mean_sqrd_error is == 375.25938334777646
root_mean_squared error of is == 19.371612822575628
After Optimization with Backwards Elimination
r2 score is 3.958279437643597e-05
mean_sqrd_error is == 375.25938334777646
root_mean_squared error of is == 19.371612822575628


# Data Visualization

In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

x_range = np.linspace(X.min(), X.max(), 100)
y_range = regressor.predict(x_range.reshape(-1, 1))

fig = px.scatter(df, x='total_bill', y='STI index', opacity=0.65)
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit'))
fig.show()

ValueError: X has 1 features, but LinearRegression is expecting 65 features as input.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8063f459-52be-4c78-9eaa-2f01d373f9b4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>