In [63]:
import pandas as pd
import saspy
import numpy as np

# Exploring the Data

Before we do any model building, we should explore the data to see whether we can rule out any models that we shouldn't be using. We start by checking the linear regression assumptions to see whether a general linear model would be a good fit.

In [69]:
%%SAS
/* Load TrainingSet.csv into WORK.ADMPRED, replacing any copy left behind by
   an earlier run of this cell. */

/* The modelling cells refer to columns through name literals such as
   'GRE Score'n, so the CSV headers must be kept verbatim (spaces included)
   instead of being rewritten with underscores. Setting VALIDVARNAME=ANY here
   makes that requirement explicit rather than relying on the session default.
   NOTE(review): OPTIONS is a global statement, so this is expected to stay in
   effect for later cells submitted to the same SAS session - confirm. */
options validvarname=any;

proc import datafile='TrainingSet.csv'
    out=admpred
    dbms=csv
    replace;
run;

In [70]:
%%SAS
/* Baseline general linear model on the untransformed response, with the
   regression diagnostics panel requested. */
ods graphics on;  /* ODS Graphics must be enabled for PLOTS= to render */

proc glm data=admpred plots=diagnostics;
    class Research;
    /* NOTE(review): only three Research interactions are fitted here, while
       the model fitted on transformedCOA later in this notebook also includes
       SOP, LOR and CGPA by Research - confirm the difference is intentional. */
    model 'Chance of Admit'n =
          'GRE Score'n
          'TOEFL Score'n
          SOP
          LOR
          CGPA
          'University Rating'n
          Research
          'GRE Score'n * Research
          'TOEFL Score'n * Research
          'University Rating'n * Research;
run;
/* PROC GLM is interactive: RUN only ends the current run group. QUIT ends the
   procedure so it does not stay open in the session after this cell. */
quit;

Class Level Information,Class Level Information,Class Level Information
Class,Levels,Values
Research,2,0 1

0,1
Number of Observations Read,268
Number of Observations Used,268

Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,10,4.26184092,0.42618409,105.9,<.0001
Error,257,1.03429453,0.00402449,,
Corrected Total,267,5.29613545,,,

R-Square,Coeff Var,Root MSE,Chance of Admit Mean
0.804708,8.699597,0.063439,0.729216

Source,DF,Type I SS,Mean Square,F Value,Pr > F
GRE Score,1,3.32351958,3.32351958,825.82,<.0001
TOEFL Score,1,0.31049116,0.31049116,77.15,<.0001
SOP,1,0.14432304,0.14432304,35.86,<.0001
LOR,1,0.11792959,0.11792959,29.3,<.0001
CGPA,1,0.26666368,0.26666368,66.26,<.0001
University Rating,1,0.01494431,0.01494431,3.71,0.0551
Research,1,0.06569728,0.06569728,16.32,<.0001
GRE Score*Research,1,0.00600879,0.00600879,1.49,0.2229
TOEFL Score*Research,1,0.00387416,0.00387416,0.96,0.3274
University *Research,1,0.00838934,0.00838934,2.08,0.1500

Source,DF,Type III SS,Mean Square,F Value,Pr > F
GRE Score,1,0.00608852,0.00608852,1.51,0.2198
TOEFL Score,1,0.02123291,0.02123291,5.28,0.0224
SOP,1,0.00137765,0.00137765,0.34,0.5590
LOR,1,0.03536995,0.03536995,8.79,0.0033
CGPA,1,0.23165673,0.23165673,57.56,<.0001
University Rating,1,0.00708058,0.00708058,1.76,0.1859
Research,1,2.091e-05,2.091e-05,0.01,0.9426
GRE Score*Research,1,0.00019561,0.00019561,0.05,0.8257
TOEFL Score*Research,1,0.00073374,0.00073374,0.18,0.6697
University *Research,1,0.00838934,0.00838934,2.08,0.1500


# Exploring the Diagnostics

It's not looking good. There are four linear regression assumptions that we look out for:

1) Constant variance (Homoskedasticity)
2) Expected value of residuals is 0
3) Residuals are independent
4) Residuals are normally distributed.

The first assumption, homoskedasticity, is evidently violated: the Residuals vs. Predicted Value plot shows a cone-like pattern, whereas under constant variance we would expect the residuals to be scattered randomly, with no visible structure. We expect the same random scatter, centered on zero, for the second assumption to hold. The latter two assumptions fare no better than the former two; the Residual vs. Quantile (Q-Q) plot informs us about the normality of the residuals: the closer the points hug the reference line, the closer the residuals are to being normally distributed.


# Variance Stabilizing Transformation

In [72]:
%%SAS
/* Box-Cox analysis of the response with PROC TRANSREG. */
title "Box Cox Transformation -- Admission Prediction Data";

proc transreg data=admpred plots=boxcox;
    /* Candidate lambdas run from -10 to 10 in steps of 0.05; every predictor
       enters through IDENTITY (untransformed) and Research is additionally
       expanded as an effects-coded CLASS variable. */
    model boxcox('Chance of Admit'n / lambda=-10 to 10 by 0.05) =
          identity('GRE Score'n)
          identity('TOEFL Score'n)
          identity(SOP)
          identity(LOR)
          identity('University Rating'n)
          identity('GRE Score'n * Research)
          identity('TOEFL Score'n * Research)
          identity(SOP * Research)
          identity(LOR * Research)
          identity(CGPA * Research)
          identity('University Rating'n * Research)
          class(Research / effects)
          identity(CGPA);
    output coefficients replace;
run;

/* Power-transform 'Chance of Admit' into transformedCOA and store the result
   in a new data set (ADMPREDMODIF), leaving ADMPRED itself unchanged. */

/* Exponent of the power transformation, named so it is defined in one place.
   NOTE(review): presumably the lambda suggested by the Box-Cox analysis -
   confirm against the PROC TRANSREG output before changing it. */
%let coa_lambda = 3.1;

data admpredmodif;
    set admpred;
    transformedCOA = 'Chance of Admit'n ** &coa_lambda;
run;

/* Refit the linear model on the power-transformed response, with Research
   interacting with every continuous predictor, and request the diagnostics
   panel for the refit. */
title "Multiple Linear Regression -- Admission Prediction Data";

proc glm data=admpredmodif plots=(diagnostics);
    class Research;
    model transformedCOA =
          'GRE Score'n
          'TOEFL Score'n
          SOP
          LOR
          CGPA
          'University Rating'n
          Research
          'GRE Score'n * Research
          'TOEFL Score'n * Research
          SOP * Research
          LOR * Research
          CGPA * Research
          'University Rating'n * Research;
run;
/* PROC GLM is interactive: RUN only ends the current run group. QUIT ends the
   procedure so it does not stay open in the session after this cell. */
quit;

/* TITLE is a global statement; clear it so this heading does not appear on
   the output of unrelated steps submitted afterwards. */
title;

Class Level Information,Class Level Information,Class Level Information
Class,Levels,Values
Research,2,0 1

0,1
Number of Observations Read,268
Number of Observations Used,268

Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,13,11.81419476,0.90878421,137.0,<.0001
Error,254,1.68485235,0.00663328,,
Corrected Total,267,13.49904711,,,

R-Square,Coeff Var,Root MSE,transformedCOA Mean
0.875187,19.38439,0.081445,0.420157

Source,DF,Type I SS,Mean Square,F Value,Pr > F
GRE Score,1,9.07096698,9.07096698,1367.49,<.0001
TOEFL Score,1,0.78091519,0.78091519,117.73,<.0001
SOP,1,0.40628062,0.40628062,61.25,<.0001
LOR,1,0.17785398,0.17785398,26.81,<.0001
CGPA,1,0.71776284,0.71776284,108.21,<.0001
University Rating,1,0.05814333,0.05814333,8.77,0.0034
Research,1,0.14243231,0.14243231,21.47,<.0001
GRE Score*Research,1,0.2320079,0.2320079,34.98,<.0001
TOEFL Score*Research,1,0.09426046,0.09426046,14.21,0.0002
SOP*Research,1,0.04305696,0.04305696,6.49,0.0114

Source,DF,Type III SS,Mean Square,F Value,Pr > F
GRE Score,1,0.01417913,0.01417913,2.14,0.1450
TOEFL Score,1,0.04836541,0.04836541,7.29,0.0074
SOP,1,7.866e-05,7.866e-05,0.01,0.9134
LOR,1,0.05507711,0.05507711,8.3,0.0043
CGPA,1,0.53063137,0.53063137,80.0,<.0001
University Rating,1,0.02481456,0.02481456,3.74,0.0542
Research,1,0.04387317,0.04387317,6.61,0.0107
GRE Score*Research,1,0.00267107,0.00267107,0.4,0.5263
TOEFL Score*Research,1,0.00516357,0.00516357,0.78,0.3785
SOP*Research,1,0.00303339,0.00303339,0.46,0.4995
