In [2]:
import pandas as pd
import saspy
import numpy as np

# Exploring the Data

Before we build any models, we should explore the data to rule out models that would be a poor fit. A natural first step is to check the linear regression assumptions to see whether a general linear model is appropriate.

In [49]:
%%SAS
/* Read Admission_Predict.csv into the data set admpred.
   REPLACE lets the cell be re-run when admpred already exists. */
proc import
    datafile='Admission_Predict.csv'
    out=admpred
    dbms=csv
    replace;
run;

In [52]:
%%SAS
/* Baseline general linear model for 'Chance of Admit'n, with diagnostic plots. */
ods graphics on;

proc glm data=admpred plots=diagnostics;
    /* Research is treated as a categorical (class) effect. */
    class Research;
    /* Main effects, followed by the Research interactions with GRE, TOEFL
       and University Rating (same terms, same order as before). */
    model 'Chance of Admit'n =
        'GRE Score'n
        'TOEFL Score'n
        SOP
        LOR
        CGPA
        'University Rating'n
        Research
        'GRE Score'n * Research
        'TOEFL Score'n * Research
        'University Rating'n * Research
    ;
run;
/* PROC GLM supports RUN-group processing and stays active after RUN;
   QUIT ends the procedure explicitly so it does not linger into later cells. */
quit;

Class Level Information,Class Level Information,Class Level Information
Class,Levels,Values
Research,2,0 1

0,1
Number of Observations Read,400
Number of Observations Used,400

Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,10,6.54305898,0.6543059,161.96,<.0001
Error,389,1.57157202,0.00404003,,
Corrected Total,399,8.114631,,,

R-Square,Coeff Var,Root MSE,Chance of Admit Mean
0.806329,8.774934,0.063561,0.72435

Source,DF,Type I SS,Mean Square,F Value,Pr > F
GRE Score,1,5.2273118,5.2273118,1293.88,<.0001
TOEFL Score,1,0.39211084,0.39211084,97.06,<.0001
SOP,1,0.23076981,0.23076981,57.12,<.0001
LOR,1,0.19957061,0.19957061,49.4,<.0001
CGPA,1,0.4249159,0.4249159,105.18,<.0001
University Rating,1,0.00657309,0.00657309,1.63,0.2029
Research,1,0.03862165,0.03862165,9.56,0.0021
GRE Score*Research,1,0.0061075,0.0061075,1.51,0.2196
TOEFL Score*Research,1,0.00095837,0.00095837,0.24,0.6265
University *Research,1,0.01611941,0.01611941,3.99,0.0465

Source,DF,Type III SS,Mean Square,F Value,Pr > F
GRE Score,1,0.02905386,0.02905386,7.19,0.0076
TOEFL Score,1,0.03010308,0.03010308,7.45,0.0066
SOP,1,0.00079044,0.00079044,0.2,0.6585
LOR,1,0.06270759,0.06270759,15.52,<.0001
CGPA,1,0.3750191,0.3750191,92.83,<.0001
University Rating,1,0.00384076,0.00384076,0.95,0.3302
Research,1,4.82e-06,4.82e-06,0.0,0.9725
GRE Score*Research,1,3.579e-05,3.579e-05,0.01,0.9251
TOEFL Score*Research,1,0.00017755,0.00017755,0.04,0.8341
University *Research,1,0.01611941,0.01611941,3.99,0.0465


# Exploring the Diagnostics

It's not looking good. There are four linear regression assumptions that we look out for:

1) Constant variance (Homoskedasticity)
2) Expected value of residuals is 0
3) Residuals are independent
4) Residuals are normally distributed.

The first assumption, homoskedasticity, is evidently violated: the Residuals vs. Predicted Value plot shows a cone-like pattern. For the constant-variance assumption to hold, we would expect the residuals to look like a random scatter with no visible structure. The same random scatter, centered on zero, is what we would expect under the second assumption. The latter two assumptions fare no better than the first two; the Residual vs. Quantile plot tells us about the normality of the residuals: the closer the points hug the line, the closer the residuals are to being normally distributed.


# Variance Stabilizing Transformation

In [60]:
%%SAS
/* Power applied to the response in the DATA step below.
   Presumably the lambda suggested by the Box-Cox analysis in this cell --
   TODO confirm against the Box-Cox plot before relying on it. */
%let coa_lambda = 2.8;

/* Box-Cox transformation (PROC TRANSREG): lambda grid from -10 to 10 in steps of 0.05. */
title "Box Cox Transformation -- Admission Prediction Data";
proc transreg data=admpred plots=boxcox;
    model boxcox('Chance of Admit'n / lambda=-10 to 10 by 0.05) =
        identity('GRE Score'n)
        identity('TOEFL Score'n)
        identity(SOP)
        identity(LOR)
        identity('University Rating'n)
        identity('GRE Score'n * Research)
        identity('TOEFL Score'n * Research)
        identity(SOP * Research)
        identity(LOR * Research)
        identity(CGPA * Research)
        identity('University Rating'n * Research)
        class(Research / effects)
        identity(CGPA);
    output coefficients replace;
run;

/* Transform Chance of Admit and keep the result in a new data set,
   leaving admpred untouched. */
data admpredmodif;
    set admpred;
    transformedCOA = ('Chance of Admit'n ** &coa_lambda);
run;

/* Refit the linear model on the transformed response, with Research
   interacting with every other predictor. */
title "Multiple Linear Regression -- Admission Prediction Data";
proc glm data=admpredmodif plots=(diagnostics);
    class Research;
    model transformedCOA =
        'GRE Score'n
        'TOEFL Score'n
        SOP
        LOR
        CGPA
        'University Rating'n
        Research
        'GRE Score'n * Research
        'TOEFL Score'n * Research
        SOP * Research
        LOR * Research
        CGPA * Research
        'University Rating'n * Research
    ;
run;
/* PROC GLM stays active after RUN; end it explicitly. */
quit;

/* TITLE is a global statement; clear it so it does not leak into later cells. */
title;

Class Level Information,Class Level Information,Class Level Information
Class,Levels,Values
Research,2,0 1

0,1
Number of Observations Read,400
Number of Observations Used,400

Source,DF,Sum of Squares,Mean Square,F Value,Pr > F
Model,13,16.5858085,1.27583142,182.68,<.0001
Error,386,2.6957526,0.00698382,,
Corrected Total,399,19.2815611,,,

R-Square,Coeff Var,Root MSE,transformedCOA Mean
0.86019,18.81827,0.083569,0.444086

Source,DF,Type I SS,Mean Square,F Value,Pr > F
GRE Score,1,13.08443649,13.08443649,1873.54,<.0001
TOEFL Score,1,1.00690162,1.00690162,144.18,<.0001
SOP,1,0.6051813,0.6051813,86.65,<.0001
LOR,1,0.36710435,0.36710435,52.57,<.0001
CGPA,1,0.95190991,0.95190991,136.3,<.0001
University Rating,1,0.06608298,0.06608298,9.46,0.0022
Research,1,0.11278525,0.11278525,16.15,<.0001
GRE Score*Research,1,0.22732826,0.22732826,32.55,<.0001
TOEFL Score*Research,1,0.0573691,0.0573691,8.21,0.0044
SOP*Research,1,0.03916674,0.03916674,5.61,0.0184

Source,DF,Type III SS,Mean Square,F Value,Pr > F
GRE Score,1,0.06329576,0.06329576,9.06,0.0028
TOEFL Score,1,0.06819396,0.06819396,9.76,0.0019
SOP,1,0.00023705,0.00023705,0.03,0.8539
LOR,1,0.1080068,0.1080068,15.47,<.0001
CGPA,1,0.73703589,0.73703589,105.53,<.0001
University Rating,1,0.03255569,0.03255569,4.66,0.0315
Research,1,0.0468749,0.0468749,6.71,0.0099
GRE Score*Research,1,0.00016129,0.00016129,0.02,0.8793
TOEFL Score*Research,1,0.00423594,0.00423594,0.61,0.4366
SOP*Research,1,0.00276119,0.00276119,0.4,0.5299
