
**Predicting Admission by Logistic Regression**
Imagine you are in charge of the admission process at UCLA. It gets hectic when there are many applicants and the whole admission decision depends on you. This data is extracted from UCLA's applicant database. The university has already added an attribute named chance of admission, which gives an idea of the admission probability.

Data contained:
1. GRE Scores ( out of 340 ) 
2. TOEFL Scores ( out of 120 ) 
3. University Rating ( out of 5 ) 
4. Statement of Purpose and
5. Letter of Recommendation Strength ( out of 5 ) 
6. Undergraduate GPA ( out of 10 ) 
7. Research Experience ( either 0 or 1 ) 
8. Chance of Admit ( ranging from 0 to 1 )

In [None]:
# install.packages("Rtools")
# install.packages("MASS")
# install.packages("tidyr")
# install.packages("dplyr")
# install.packages("corrplot")
# install.packages("ggplot2")
# install.packages("ROCR")      
# install.packages("tidyverse")
# load the libraries in one time
library(MASS)
library(tidyr)
library(dplyr)
library(corrplot)
library(ggplot2)
library(ROCR)
library(tidyverse)


In [None]:
# Utility function for the model analysis: fit a binomial (logistic) GLM for
# the given formula and print its summary.
#
# Args:
#   x.formula: model formula, e.g. get_admission ~ GRE.Score + CGPA.
#   data:      data frame to fit on. Defaults to the global `train`, which is
#              looked up lazily at call time, so existing one-argument calls
#              behave as before.
#
# Returns:
#   The fitted glm object (the summary is printed as a side effect).
analyze.model <- function(x.formula, data = train) {
  model.tmp <- glm(x.formula, data = data, family = "binomial")
  print(summary(model.tmp))
  model.tmp
}



**Load Source Data**

In [None]:
# Show which files are available in the input folder, then read the CSV
list.files("../input")
raw.df <- read.csv(file = "../input/Admission_Predict.csv")

# Keep raw.df untouched; all later manipulation happens on the copy dff
dff <- raw.df


## Analysis of the dataset

In [None]:
## Analysing the dataset
colnames(dff)
dim(dff) # 400x9
# Count duplicated applicants: total rows minus the number of distinct serial
# numbers. The row count must come from the data frame `dff`; `df` is a
# function from stats, so nrow(df) is NULL and the difference would be empty.
duplicate.rows <- nrow(dff) - length(unique(dff$Serial.No.))



In [None]:
# duplicate.rows already holds the count, so print it directly rather than
# its length.
print(paste(" duplicate rows =", duplicate.rows))


In [None]:
# Total number of NA cells across the whole data frame
sum(is.na(dff)) # 0


In [None]:

# Per-column summary statistics
summary(dff) # no outliers

In [None]:
# Checking missing values. This must inspect the data frame `dff`;
# `df` is a function from stats, so is.na(df) never looks at the data.
sum(is.na(dff))
# No missing values

In [None]:
# The serial number is meaningless for the requirements, so drop the column
dff$Serial.No. <- NULL

In [None]:
# Analysing variable 'GRE.Score'
summary(dff$GRE.Score)


In [None]:
# Spread of GRE.Score (the label names the actual column)
print(paste("standard deviation for GRE.Score = ", sd(dff$GRE.Score)))

In [None]:

# Percentiles in 1% steps, then the outlier fences (quartile -/+ 1.5 * IQR)
quantile(dff$GRE.Score, seq(0, 1, by = 0.01))
q1 <- quantile(dff$GRE.Score, 0.25)
q3 <- quantile(dff$GRE.Score, 0.75)
IQR <- q3 - q1
upper_range <- q3 + 1.5 * IQR
lower_range <- q1 - 1.5 * IQR


In [None]:
# Number of rows above the upper fence and below the lower fence
print(nrow(dff[dff$GRE.Score > upper_range, ]))
print(nrow(dff[dff$GRE.Score < lower_range, ])) # no outliers


In [None]:
# Analysing variable 'TOEFL.Score'
summary(dff$TOEFL.Score)


In [None]:
print(paste("standard deviation for TOEFL.Score = ", sd(dff$TOEFL.Score)))

In [None]:
# Percentiles in 1% steps, then the outlier fences for TOEFL.Score
quantile(dff$TOEFL.Score, seq(0, 1, by = 0.01))
q1 <- quantile(dff$TOEFL.Score, 0.25)
q3 <- quantile(dff$TOEFL.Score, 0.75)
IQR <- q3 - q1
# The upper fence is built from the third quartile. Adding the offset to the
# whole data frame instead turns the comparison below into a matrix test and
# yields a meaningless row count.
upper_range <- q3 + 1.5 * IQR
lower_range <- q1 - 1.5 * IQR


In [None]:
# Rows above the upper fence; a high TOEFL score is not a problem and would
# be ignored anyway
nrow(dff[dff$TOEFL.Score > upper_range, ])


In [None]:
# Rows below the lower fence
nrow(dff[dff$TOEFL.Score < lower_range, ])

Graphs of TOEFL

In [None]:
# Histogram of TOEFL.Score; `bins` is a bin count and must be numeric,
# not the string "30"
ggplot(dff, aes(TOEFL.Score)) +
  geom_histogram(fill = "green", bins = 30) +
  labs(title = "Distribution of TOEFL.Score")



In [None]:
# Boxplot of TOEFL.Score, outliers highlighted in red
ggplot(dff, aes(x = 1, y = TOEFL.Score)) +
  geom_boxplot(fill = "blue", outlier.color = "red", outlier.shape = 4)


In [None]:
# Analysing variable 'CGPA'
summary(dff$CGPA)


In [None]:
print(paste("standard deviation of CGPA = ", sd(dff$CGPA)))

In [None]:

# Percentiles in 1% steps, then the outlier fences for CGPA
quantile(dff$CGPA, seq(0, 1, by = 0.01))
q1 <- quantile(dff$CGPA, probs = 0.25)
q3 <- quantile(dff$CGPA, probs = 0.75)
IQR <- q3 - q1
upper_range <- q3 + IQR * 1.5
lower_range <- q1 - IQR * 1.5


In [None]:
# Rows above the upper fence
nrow(dff[dff$CGPA > upper_range, ]) # no outliers


In [None]:
# Rows below the lower fence
nrow(dff[dff$CGPA < lower_range, ]) # 1 outlier [1] 1


In [None]:
# Treat the low outliers by raising them to the lower fence value
dff$CGPA <- replace(dff$CGPA, which(dff$CGPA < lower_range), lower_range)


## Various plots

In [None]:
# Scatter plots of Chance.of.Admit (x) against each score or rating (y).
# The x-axis label names the plotted column: "Chance of Admit".
plot(dff$Chance.of.Admit, dff$CGPA, xlab = "Chance of Admit", ylab = "CGPA")



In [None]:
plot(dff$Chance.of.Admit, dff$TOEFL.Score, xlab = "Chance of Admit", ylab = "TOEFL.Score")


In [None]:
plot(dff$Chance.of.Admit, dff$University.Rating, xlab = "Chance of Admit", ylab = "University.Rating")


In [None]:
plot(dff$Chance.of.Admit, dff$GRE.Score, xlab = "Chance of Admit", ylab = "GRE.Score")

In [None]:
# Analysing variable 'University.Rating'
# Only a few distinct values repeat, so the column is read as a factor
# (homogeneous groups of values)
summary(factor(dff$University.Rating))


In [None]:
print(paste(
  "standard deviation of University.Rating = ",
  sd(as.numeric(dff$University.Rating))
))


In [None]:
dff$University.Rating <- as.factor(dff$University.Rating)


In [None]:
# Analysing variable 'SOP'
# Only a few distinct values repeat, so the column is read as a factor
# (homogeneous groups of values)
summary(factor(dff$SOP))


In [None]:
print(paste(
  "standard deviation of SOP = ",
  sd(as.numeric(dff$SOP))
))


In [None]:
dff$SOP <- as.factor(dff$SOP)


In [None]:
# Analysing variable 'LOR'
# Only a few distinct values repeat, so the column is read as a factor
# (homogeneous groups of values)
summary(factor(dff$LOR))
print(paste(
  "standard deviation of LOR = ",
  sd(as.numeric(dff$LOR))
))
dff$LOR <- as.factor(dff$LOR)


In [None]:
# Analysing variable 'Research'
# Only a few distinct values repeat, so the column is read as a factor
# (homogeneous groups of values)
summary(factor(dff$Research))
print(paste(
  "standard deviation of Research = ",
  sd(as.numeric(dff$Research))
))
dff$Research <- as.factor(dff$Research)


In [None]:
# Derive the binary target get_admission from Chance.of.Admit.
# A 0.5 cut-off gives a very unbalanced split, so 0.72 is used instead.
table(dff$Chance.of.Admit > 0.5) # False = 35, True = 365
# One logical vector, reused for the target column and for the tally
chance.of.admit.superior.to.thresold <- dff$Chance.of.Admit > 0.72
dff$get_admission <- factor(as.integer(chance.of.admit.superior.to.thresold))
table(chance.of.admit.superior.to.thresold) # False = 196, True = 204


## **Exploratory Data Analysis**

In [None]:
# Build a new data frame holding all and only the numeric columns of dff
df_Numeric_Variable <- dff[vapply(dff, is.numeric, logical(1))]
colnames(df_Numeric_Variable)


In [None]:
# Pairwise correlation of the numeric variables, chance of admit included
corr <- cor(df_Numeric_Variable)
# The matrix is symmetric about the main diagonal, so the lower triangle
# is enough
corrplot(corr, type = "lower", method = "number")
# Exam Scores are highly correlated.

In [None]:
# Chance of admit per University.Rating level, outliers as red crosses
ggplot(dff, aes(x = University.Rating, y = Chance.of.Admit)) +
  geom_boxplot(outlier.shape = 4, outlier.colour = "red")


From the boxplots it can be clearly observed that the chance of admission is high when an applicant comes from a highly rated university.
Although some students come from average-rated universities, they still have a chance of being admitted.

In [None]:
# Chance of admit per Statement of Purpose (SOP) level
ggplot(dff, aes(x = SOP, y = Chance.of.Admit)) +
  geom_boxplot(outlier.shape = 3, outlier.colour = "purple")


Here we can see that a strong Statement of Purpose means a high probability of getting admission. There are some rare cases where the chance of admission is lower.

In [None]:
# Chance of admit per Letter of Recommendation (LOR) level
ggplot(dff, aes(x = LOR, y = Chance.of.Admit)) +
  geom_boxplot(outlier.shape = 2, outlier.colour = "orange")


The Letter of Recommendation has a great influence on getting admission to the university.

In [None]:
# Chance of admit with and without research experience
ggplot(dff, aes(x = Research, y = Chance.of.Admit)) +
  geom_boxplot(outlier.shape = 1, outlier.colour = "black")



It is clear that applicants who have done some research work have a good chance of getting admission to the university.

### I use the logistic regression using the data as-is


In [None]:
dim(dff)
# Work on a copy without Chance.of.Admit: get_admission is derived from that
# column, so leaving it among the predictors of `get_admission ~ .` leaks the
# target into the model.
dff_ml <- dff
dff_ml$Chance.of.Admit <- NULL
# Deterministic split: every 5th row goes to test, the rest to train
indx <- which(seq_len(nrow(dff_ml)) %% 5 == 0)
train <- dff_ml[-indx, ]
dim(train) # 80% of the data
test <- dff_ml[indx, ]
dim(test) # 20% of the data
# Logistic regression on every remaining column (factors kept as-is);
# read AIC and null deviance from the summary printed by analyze.model
model.all.var <- analyze.model(get_admission ~ .)

# Fitted probabilities on the training rows and the confusion table at 0.5
predictTrain.all.var <- predict(model.all.var, type = "response")
summary(predictTrain.all.var)
table(train$get_admission, predictTrain.all.var > 0.5)
pred.all.var <- prediction(predictTrain.all.var, train$get_admission)
# calculate ROC
# measure True Positive Rate
# x.measure False Positive Rate
roc.perf.all.var <- performance(pred.all.var, measure = "tpr", x.measure = "fpr")

# Test the Model
predictTest.all.var <- predict(model.all.var, type = "response", newdata = test)
table(test$get_admission, predictTest.all.var >= 0.5)

# Build ROC curve for test Set
pred2.all.var <- prediction(predictTest.all.var, test$get_admission)
roc.perf2.all.var <- performance(pred2.all.var, measure = "tpr", x.measure = "fpr")


### Logistic model over every variable
I use the binomial family because I expect a {0, 1}-like output, not a continuous range.
Please read the details in https://stats.stackexchange.com/a/303592


**Dummy variables: transforming the numeric data into categorical {0, 1} columns**

In [None]:
# University.Rating
length(levels(dff$University.Rating))
# model.matrix builds the design matrix for the factor
dummy_University.Rating <- data.frame(model.matrix(~University.Rating, data = dff))
head(dummy_University.Rating)
# Drop the first (intercept) column and swap the factor for its dummies
dummy_University.Rating <- dummy_University.Rating[, -1]
length(dummy_University.Rating)
df_1 <- cbind(select(dff, -"University.Rating"), dummy_University.Rating)
ncol(df_1)

# SOP: same procedure, intercept column dropped right away
dummy_SOP <- data.frame(model.matrix(~SOP, data = dff))[, -1]
length(dummy_SOP)
df_2 <- cbind(select(df_1, -"SOP"), dummy_SOP)
ncol(df_2)

# LOR: same procedure, intercept column dropped right away
dummy_LOR <- data.frame(model.matrix(~LOR, data = dff))[, -1]
length(dummy_LOR)

df_3 <- cbind(select(df_2, -"LOR"), dummy_LOR)
ncol(df_3)
# get_admission is the target now, so the source column is removed
df_3$Chance.of.Admit <- NULL
df_3


In [None]:
# Random 70/30 split of df_3 into train and test, with a fixed seed
set.seed(1000)
indx <- sample(seq_len(nrow(df_3)), size = 0.7 * nrow(df_3))
train <- df_3[indx, ]
test <- df_3[-indx, ]


### I start the logistic models with the ad-hoc categorical columns


In [None]:
# Baseline: every column of the training set as predictor
model.all.variables <- analyze.model(get_admission ~ .)
#AIC = 186.62, Null deviance =  387.65

In [None]:
# Dummy columns of SOP, LOR and University.Rating only
model.quantile.SOP.LOR.University.Rating <- analyze.model(
  get_admission ~ SOP1.5 + SOP2 + SOP3.5 + SOP4.5 +
    LOR2 + LOR2.5 + LOR4 + LOR4.5 +
    University.Rating2 + University.Rating3 + University.Rating5
)
#AIC = 260.65, Null deviance =  387.65


In [None]:
# Same dummies plus TOEFL.Score, CGPA and Research
TOEFL.Score.CGPA.Research.SOP.LOR.University.Rating <- analyze.model(
  get_admission ~ TOEFL.Score + CGPA + Research +
    SOP1.5 + SOP2 + SOP3.5 + SOP4.5 +
    LOR2 + LOR2.5 + LOR4 + LOR4.5 +
    University.Rating2 + University.Rating3 + University.Rating5
)
#AIC = 178.7, Null deviance =  387.65


In [None]:
# GRE.Score in place of TOEFL.Score
GRE.Score.CGPA.Research.SOP.LOR.University.Rating <- analyze.model(
  get_admission ~ GRE.Score + CGPA + Research +
    SOP1.5 + SOP2 + SOP3.5 + SOP4.5 +
    LOR2 + LOR2.5 + LOR4 + LOR4.5 +
    University.Rating2 + University.Rating3 + University.Rating5
)
#AIC = 177.01, Null deviance =  387.65


In [None]:
GRE.Score.CGPA.SOP.LOR.University <- analyze.model(
  get_admission ~ GRE.Score + CGPA + Research +
    SOP2 + SOP3.5 + SOP4.5 +
    LOR2 + LOR2.5 + LOR4 +
    University.Rating2 + University.Rating3
)
#AIC = 173.83, Null deviance =  387.65


In [None]:
GRE.Score.CGPA.Research.SOP.LOR.University.Rating <- analyze.model(
  get_admission ~ GRE.Score + CGPA + Research +
    SOP3.5 + SOP4 + LOR4 + LOR4.5 + University.Rating2
)
#AIC = 170.11, Null deviance =  387.65

In [None]:
GRE.Score.CGPA.Research.SOP.LOR.University.Rating <- analyze.model(
  get_admission ~ GRE.Score + CGPA + Research +
    SOP3 + SOP4 + LOR3 + LOR4.5 + University.Rating2
)
#AIC = 168.65, Null deviance =  387.65


In [None]:
GRE.Score.CGPA.Research.SOP.LOR.University.Rating <- analyze.model(
  get_admission ~ GRE.Score + CGPA + Research +
    SOP2 + SOP3 + SOP4 + LOR3 + LOR4.5 + University.Rating2
)
#AIC =  170.51, Null deviance =  387.65

In [None]:
GRE.Score.CGPA.Research.LOR5.LOR4.5.University.Rating2.SOP2.SOP4.LOR4 <- analyze.model(
  get_admission ~ GRE.Score + CGPA + Research +
    LOR5 + LOR4.5 + University.Rating2 + SOP2 + SOP4 + LOR4
)
#AIC =  168.59, Null deviance =  387.65

In [None]:
#best model
GRE.Score.CGPA.Research.LOR.SOP.University.Rating.SOP.LOR <- analyze.model(
  get_admission ~ GRE.Score + CGPA + Research +
    LOR5 + SOP4.5 + University.Rating2 + SOP2 + SOP4 + LOR4
)
#AIC =  168.34, Null deviance =  387.65


In [None]:
GRE.Score.CGPA.Research.LOR.SOP.University.Rating.LOR <- analyze.model(
  get_admission ~ GRE.Score + CGPA + Research +
    LOR1.5 + SOP4.5 + University.Rating2 + LOR3.5 + LOR2.5 + LOR4
)
#AIC = 171.74, Null deviance =  387.65

In [None]:
GRE.Score.Research.CGPA.University.Rating.SOP.LOR <- analyze.model(
  get_admission ~ GRE.Score + Research + CGPA +
    University.Rating2 + SOP2 + SOP4 + LOR4
)
#AIC =170.86, Null deviance =  387.65


### GRE.Score.CGPA.Research.LOR.SOP.University.Rating.SOP.LOR has a lower AIC than all the other models.


In [None]:
# Fitted probabilities on the training rows for
# GRE.Score.CGPA.Research.LOR.SOP.University.Rating.SOP.LOR
predictTrain <- predict(
  GRE.Score.CGPA.Research.LOR.SOP.University.Rating.SOP.LOR,
  type = "response"
)
summary(predictTrain)


In [None]:
# Confusion matrix at a 0.5 cut-off
table(train$get_admission, predictTrain > 0.5)


In [None]:
# Confusion matrix at a 0.4 cut-off
table(train$get_admission, predictTrain > 0.4)

In [None]:
# ROCR prediction object for the train set
pred1 <- prediction(predictTrain, train$get_admission)

In [None]:
# ROC curve: true positive rate (measure) against
# false positive rate (x.measure)
roc.perf <- performance(pred1, measure = "tpr", x.measure = "fpr")


In [None]:
# Test the Model
# Score the held-out rows with the selected lowest-AIC model, the same one
# used for the training predictions, so train and test results are comparable.
predictTest <- predict(
  GRE.Score.CGPA.Research.LOR.SOP.University.Rating.SOP.LOR,
  type = "response",
  newdata = test
)
table(test$get_admission, predictTest >= 0.5)


In [None]:
# ROCR prediction object and ROC curve (TPR against FPR) for the test set
pred2 <- prediction(predictTest, test$get_admission)
roc.perf2 <- performance(pred2, x.measure = "fpr", measure = "tpr")


# **Compare the graphs of the models**

In [None]:
# set the plotting area into a 2*2 array
par(mfrow = c(2, 2))


In [None]:
# Draw the four ROC curves on one 2x2 page. The layout is set here as well
# because a notebook front end may open a fresh graphics device per cell, in
# which case a par() call made in an earlier cell does not carry over.
old.par <- par(mfrow = c(2, 2))
plot(roc.perf.all.var, colorize = TRUE, main = "Model TRAIN ALL")
plot(roc.perf2.all.var, colorize = TRUE, main = "Model TEST ALL ")
plot(roc.perf, colorize = TRUE, main = "Model TRAIN")
plot(roc.perf2, colorize = TRUE, main = "Model TEST")
# Put the previous layout back once the page is complete
par(old.par)


Using the ad-hoc categorical columns, I get a more precise model.