<a href="https://colab.research.google.com/github/EoghanDoyle/MachineLearningAssaignment1/blob/main/CompletionRates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so that files stored there can be read by
# path; the dataset loaded later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Importing Libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from imblearn.over_sampling import SMOTE  # imblearn library can be installed using pip install imblearn
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
import plotly.graph_objs as go
import plotly.figure_factory as ff
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.decomposition import PCA

########################################################################################## Data Preparation ###################################################################################################

# Load the completion-rates CSV from the mounted drive and take a first look at it
dataset = pd.read_csv("/content/drive/My Drive/DBS/CompletionRates.csv")
pd.set_option('display.max_columns', None)  # show every column instead of eliding the middle ones
print(dataset.head())
print(dataset.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
dataset.info()
print(dataset.describe())

# Correlation heatmap: pairwise correlations between all dataset columns, with each
# cell annotated by its coefficient rounded to two decimals.
corrs = dataset.corr()
figure = ff.create_annotated_heatmap(
    z=corrs.to_numpy(),
    x=corrs.columns.tolist(),
    y=corrs.index.tolist(),
    annotation_text=corrs.round(2).to_numpy(),
    showscale=True,
)
figure.show()

# Separate the label column (HECR) from the features; the label and seven further
# columns are excluded from the feature matrix.
X = dataset.drop(columns=['HECR','TS_DPEN','ST_FI_LO','TS_MEN', 'ST_FI_H2','HL_ED_P_HS','ST_FI_M2','HL_ED_P_MS'])  # Features
Y = dataset['HECR']  # Labels
# One call, one value per line: container types first, then shapes
print(type(X), type(Y), X.shape, Y.shape, sep='\n')

# Standardise the features so that each one has mean 0 and variance 1
feature_scaler = StandardScaler().fit(X)
X_scaled = feature_scaler.transform(X)

# Tune the random forest parameter 'n_estimators' with a 5-fold cross-validated grid
# search scored by R^2.
# The split criterion is left at its default, which is mean squared error. Its explicit
# spelling 'mse' was deprecated in scikit-learn 1.0 and removed in 1.2, so passing it
# raises on current releases; omitting it selects the same loss on old and new versions.
rfr = RandomForestRegressor(max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [500,550,600,650,700,750,800]}

gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled, Y)

print("Cross-validation results:\n", pd.DataFrame.from_dict(gd_sr.cv_results_))

best_parameters = gd_sr.best_params_
print("Optimal parameters:\n", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("Best mean cross-validated score:\n", best_result)

# Use the forest the search actually selected instead of a hand-copied n_estimators:
# GridSearchCV refits the best configuration on the full data by default (refit=True),
# so best_estimator_ is already trained on (X_scaled, Y) and cannot drift out of sync
# with the grid above.
rfr = gd_sr.best_estimator_
# Rank the features by their importance in the fitted forest, most important first
featimp = pd.Series(rfr.feature_importances_, index=list(X)).sort_values(ascending=False)
print(featimp)

# Selecting features with higher significance and redefining the feature set
X_ = dataset[['HL_ED_P_PS','PER_PT','INST_EXP_PS','AVG_FAC_SAL_PM','ST_FI_M1','TS','ST_FI_H1','TS_MAR']]

# Note: feature_scaler is rebound here and is now fitted on the reduced feature set
feature_scaler = StandardScaler()
X_scaled_ = feature_scaler.fit_transform(X_)

# Tune the random forest parameter 'n_estimators' on the reduced feature set with a
# 5-fold cross-validated grid search scored by R^2.
# The split criterion is left at its default, which is mean squared error. Its explicit
# spelling 'mse' was deprecated in scikit-learn 1.0 and removed in 1.2, so passing it
# raises on current releases; omitting it selects the same loss on old and new versions.
rfr = RandomForestRegressor(max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [600,650,700,750,800]}

gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled_, Y)

print("Cross-validation results:\n", pd.DataFrame.from_dict(gd_sr.cv_results_))

best_parameters = gd_sr.best_params_
print("Optimal parameters:\n", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("Best mean cross-validated score:\n", best_result)


# ########################################################################### Linear Regression ###########################################################################
# Linear regression fitted by stochastic gradient descent.
# Tune the SGDRegressor parameters 'eta0' (learning rate) and 'max_iter' with a 5-fold
# cross-validated grid search scored by R^2.
# penalty=None is set on the searched estimator so that the search evaluates the same
# unregularised model that is reported below; with the default penalty the tuned values
# would belong to an L2-regularised model instead.
sgdr = SGDRegressor(penalty=None, random_state=1)
grid_param = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter':[10000, 20000, 30000, 40000]}

gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)

# This search runs on the full scaled feature matrix (X_scaled), whose columns are X.columns
gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

# Use the model the search selected instead of hand-copied eta0 / max_iter values:
# GridSearchCV refits the best configuration on the full data by default (refit=True),
# so best_estimator_ is already trained on (X_scaled, Y).
sgdr = gd_sr.best_estimator_
print('Intercept', sgdr.intercept_)
# Pair each feature name with its fitted coefficient, largest coefficient first
print(pd.DataFrame(zip(X.columns, sgdr.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))



   HIGH_DEG  INST_TYPE  INST_EXP_PS  AVG_FAC_SAL_PM     TS  TS_MEN  TS_WOM  \
0         4          1         3212            4587   3903  0.4766  0.5234   
1         4          1        10418            6398   9562  0.4195  0.5805   
2         4          1         5784            5859   4165  0.4898  0.5102   
3         4          1         5824            6109  14245  0.4795  0.5205   
4         4          1         4622            6102  18520  0.5212  0.4788   

     TS_MAR   TS_DPEN    TS_VET  PER_PT  ST_FI_LO  ST_FI_M1  ST_FI_M2  \
0  0.056698  0.821877  0.018063  0.0699  0.504265  0.172604  0.136979   
1  0.149349  0.590745  0.039623  0.3095  0.490163  0.155722  0.144638   
2  0.221490  0.551088  0.048780  0.3160  0.440343  0.156229  0.154252   
3  0.065476  0.815247  0.024038  0.1046  0.331731  0.133242  0.156364   
4  0.061376  0.825645  0.025997  0.0821  0.272478  0.126271  0.150899   

   ST_FI_H1  ST_FI_H2  HL_ED_P_MS  HL_ED_P_HS  HL_ED_P_PS    HECR  
0  0.110386  0.075765   

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(14300, 12)
(14300,)
Cross-validation results:
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0      17.413742      0.081241         0.406161        0.031104   
1      19.150326      0.054083         0.447899        0.033744   
2      20.848826      0.053787         0.487164        0.033401   
3      22.555847      0.062111         0.527881        0.038508   
4      24.297844      0.113990         0.568606        0.038859   
5      26.034405      0.109396         0.612777        0.044954   
6      27.779769      0.072940         0.650064        0.045997   

  param_n_estimators                 params  split0_test_score  \
0                500  {'n_estimators': 500}           0.608967   
1                550  {'n_estimators': 550}           0.609212   
2                600  {'n_estimators': 600}           0.609419   
3                650  {'n_estimators': 650}           0.609193   
4          