# Modelling Process

We will start by importing the libraries, reading the data, and exploring the columns.

In [None]:
#importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import kendalltau, pearsonr, spearmanr

In [None]:
# Load the prepared dataset (path is relative to the notebook's working directory)
df = pd.read_csv("Prepared_Data.csv")

In [None]:
# Report the (rows, columns) shape of the dataframe
print(df.shape)
# Preview the first rows; as the last expression of the cell it is rendered as a
# rich table instead of the plain-text dump that print() produces
df.head()

In [None]:
# List the column labels of the dataframe (shown as the cell output)
df.columns

In [None]:
from scipy.stats import kendalltau, pearsonr, spearmanr
def kendall_pval(x, y):
    """Return only the p-value of scipy's Kendall tau test for x against y."""
    _, p_value = kendalltau(x, y)
    return p_value

def pearsonr_pval(x, y):
    """Return only the p-value of scipy's Pearson correlation test for x against y."""
    _, p_value = pearsonr(x, y)
    return p_value

def spearmanr_pval(x, y):
    """Return only the p-value of scipy's Spearman rank correlation test for x against y."""
    _, p_value = spearmanr(x, y)
    return p_value

## Model 1 - Amount of Delay (Quarters)
### Finding Correlations between Target Variable and Predictor Variables

We will use these correlations to select predictor variables. Ideally, a higher absolute correlation with the target indicates higher predictive power. We will then apply linear regression using the columns that have the highest correlations.

In [None]:
# Pearson correlation of each remaining column with the delay target, ranked by
# absolute strength. 'Delayed', 'PROJECT_ID' and 'Times_Delayed' are dropped first.
# Only numeric/boolean columns are kept explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently inside DataFrame.corr and raises on them instead.
delay_target = 'Amount_of_Delay_(Quarters)'
delay_corr_input = (
    df.drop(['Delayed','PROJECT_ID','Times_Delayed'], axis=1)
      .select_dtypes(include=['number', 'bool'])
)

# Keep just the target's column of the correlation matrix, as a one-column frame
Corr_Matrix_1 = delay_corr_input.corr(method='pearson')[[delay_target]]
# Reorder rows from the strongest to the weakest absolute correlation
Corr_Matrix_1 = Corr_Matrix_1.reindex(
    Corr_Matrix_1[delay_target].abs().sort_values(ascending=False).index
)
print(Corr_Matrix_1)

In [None]:
# Columns to plot against the targets: every column except the ones listed here
not_plotted = [
    'Initial_completion_date',
    'final_completion_date',
    'Delayed',
    'PROJECT_ID',
    'Times_Delayed',
    'REGION',
    'MUNICIPALITY',
]
cols = df.drop(columns=not_plotted).columns.to_numpy()

In [None]:
import statsmodels.api as sm

In [None]:
# For each column in `cols`, scatter it against the delay target and overlay the
# fitted simple-regression line.
delay_col = 'Amount_of_Delay_(Quarters)'
delay_values = df[delay_col].to_numpy()

for col in cols:
    x_values = df[col].to_numpy()

    # add_constant prepends the column of ones, so params is [intercept, slope]
    results = sm.OLS(delay_values, sm.add_constant(x_values)).fit()
    intercept, slope = results.params

    fig, ax = plt.subplots()
    ax.scatter(x_values, delay_values)

    # Draw the line across the observed range of the predictor so it overlays the
    # scatter even when the column is not scaled to [0, 1]
    X_plot = np.linspace(x_values.min(), x_values.max(), 100)
    ax.plot(X_plot, intercept + slope * X_plot)

    ax.set_xlabel(col)
    ax.set_ylabel(delay_col)
    plt.show()
    plt.close(fig)  # release the figure; one is created per column
    
    

### Variables chosen for 1st iteration
We have chosen to run the following variables for the first iteration of our regression model. These have the highest correlation - 'CST_Mixed_Use','Type_Owner_priv','PUBLIC_FUNDING_IND','CST_Education','CT_Institutional'

Note - We excluded Age because of collinearity issues

In [None]:
# Predictor matrix for the first iteration of Model 1
X = df[[
    'CST_Mixed_Use',
    'Type_Owner_priv',
    'PUBLIC_FUNDING_IND',
    'CST_Education',
    'CT_Institutional',
]]
X.head()

In [None]:
## Without a constant
# statsmodels' OLS does not add an intercept on its own; X is passed as-is, so this
# fits a regression through the origin (the summary's R-squared is then uncentered).

# Response variable for Model 1
y = df["Amount_of_Delay_(Quarters)"]

# sm.OLS takes the response (endog) first and the predictors (exog) second
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)  # in-sample predictions for the rows used in the fit

# Regression summary table (shown as the cell output)
model.summary()

We notice that two of the variables have high p-values. We need to remove them: a high p-value means we cannot reject the null hypothesis that the variable's coefficient is zero, i.e. that its apparent correlation with the target arose by chance. Keeping such variables leaves terms in our model that are not statistically significant.

In [None]:
# Reduced predictor matrix for Model 1
X = df[[
    'CST_Mixed_Use',
    'Type_Owner_priv',
    'PUBLIC_FUNDING_IND',
]]
X.head()

In [None]:
# Refit Model 1 on the current X; y is reused from the earlier cell.
# sm.OLS takes the response (endog) first and the predictors (exog) second, and
# no intercept is added because X is passed as-is.
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)  # in-sample predictions for the rows used in the fit

# Regression summary table (shown as the cell output)
model.summary()

## Model 2 - Percentage of Budget Overrun
### Finding Correlations between Target Variable and Predictor Variables

We will use these correlations to select predictor variables. Ideally, a higher absolute correlation with the target indicates higher predictive power. We will then apply linear regression using the columns that have the highest correlations.

In [None]:
# Pearson correlation of each remaining column with the overrun target, ranked by
# absolute strength. 'Budget_Overrun', 'PROJECT_ID' and 'Final_Estimated_Budget'
# are dropped first.
# Only numeric/boolean columns are kept explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently inside DataFrame.corr and raises on them instead.
overrun_target = 'percentage_of_overrun'
overrun_corr_input = (
    df.drop(['Budget_Overrun','PROJECT_ID','Final_Estimated_Budget'], axis=1)
      .select_dtypes(include=['number', 'bool'])
)

# Keep just the target's column of the correlation matrix, as a one-column frame
Corr_Matrix_2 = overrun_corr_input.corr(method='pearson')[[overrun_target]]
# Reorder rows from the strongest to the weakest absolute correlation
Corr_Matrix_2 = Corr_Matrix_2.reindex(
    Corr_Matrix_2[overrun_target].abs().sort_values(ascending=False).index
)
print(Corr_Matrix_2)

In [None]:
# For each column in `cols`, scatter it against the overrun target and overlay the
# fitted simple-regression line.
overrun_col = 'percentage_of_overrun'
overrun_values = df[overrun_col].to_numpy()

for col in cols:
    x_values = df[col].to_numpy()

    # add_constant prepends the column of ones, so params is [intercept, slope]
    results = sm.OLS(overrun_values, sm.add_constant(x_values)).fit()
    intercept, slope = results.params

    fig, ax = plt.subplots()
    ax.scatter(x_values, overrun_values)

    # Draw the line across the observed range of the predictor so it overlays the
    # scatter even when the column is not scaled to [0, 1]
    X_plot = np.linspace(x_values.min(), x_values.max(), 100)
    ax.plot(X_plot, intercept + slope * X_plot)

    ax.set_xlabel(col)
    # Label the axis with the variable actually plotted (the overrun percentage)
    ax.set_ylabel(overrun_col)
    plt.show()
    plt.close(fig)  # release the figure; one is created per column

The correlation values are very low in all cases. To fit the model, we will use the five most correlated columns as predictor variables.

In [None]:
# Predictor matrix for the first iteration of Model 2
X2 = df[[
    'Region_others',
    'CST_Roads_Highways',
    'Amount_of_Delay_(Quarters)',
    'Type_Owner_prov',
    'Times_Delayed',
]]
X2.head()

In [None]:
# Response variable for Model 2
y2 = df["percentage_of_overrun"]

# sm.OLS takes the response (endog) first and the predictors (exog) second;
# no intercept is added because X2 is passed as-is
model2 = sm.OLS(endog=y2, exog=X2).fit()
predictions = model2.predict(X2)  # in-sample predictions for the rows used in the fit

# Regression summary table (shown as the cell output)
model2.summary()

We remove the variables that have high p-values, i.e. low statistical significance.

In [None]:
# Reduced predictor matrix for Model 2
X2 = df[[
    'Region_others',
    'CST_Roads_Highways',
]]
X2.head()

In [None]:
# Refit Model 2 on the current X2; y2 is reused from the earlier cell and no
# intercept is added because X2 is passed as-is
model2 = sm.OLS(endog=y2, exog=X2).fit()
predictions = model2.predict(X2)  # in-sample predictions for the rows used in the fit

# Regression summary table (shown as the cell output)
model2.summary()

Needless to say, the accuracy of this model is very poor.