# Logistic Regression With College Admission data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load Data

In [2]:
import os
import urllib.request

data_location = "admission-data.csv"
data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/college-admissions/admission-data.csv'

if not os.path.exists (data_location):
    data_location = os.path.basename(data_location)
    if not os.path.exists(data_location):
        print("Downloading : ", data_url)
        urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)

Downloading :  https://raw.githubusercontent.com/elephantscale/datasets/master/college-admissions/admission-data.csv
data_location: admission-data.csv


In [3]:
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format

data = pd.read_csv(data_location)
data.sample(10)

Unnamed: 0,admit,gre,gpa,rank
27,1,520,3.74,4
35,0,400,3.05,2
80,0,700,2.9,4
34,0,360,3.14,1
97,0,480,3.57,2
60,1,620,3.18,2
20,0,500,3.17,3
88,1,700,3.28,1
9,1,700,3.92,2
61,0,560,3.32,4


## Exploratory Data Analysis (EDA)

In [4]:
data.describe()

Unnamed: 0,admit,gre,gpa,rank
count,100.0,100.0,100.0,100.0
mean,0.43,600.0,3.39,2.52
std,0.5,124.46,0.4,1.02
min,0.0,300.0,2.42,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,600.0,3.35,2.0
75%,1.0,700.0,3.67,3.0
max,1.0,800.0,4.0,4.0


In [5]:
data['admit'].value_counts()

0    57
1    43
Name: admit, dtype: int64

In [6]:
data['admit'].value_counts(normalize=True)

0   0.57
1   0.43
Name: admit, dtype: float64

## Shape Data

In [7]:
x = data[['gre', 'gpa', 'rank']]
y = data['admit']

print (x)
print (y)

    gre  gpa  rank
0   380 3.61     3
1   660 3.67     3
2   800 4.00     1
3   640 3.19     4
4   520 2.93     4
..  ...  ...   ...
95  660 3.33     2
96  640 3.52     4
97  480 3.57     2
98  700 2.88     2
99  400 3.31     3

[100 rows x 3 columns]
0     0
1     1
2     1
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Name: admit, Length: 100, dtype: int64


In [8]:
print ('x : ', x.shape)
print ('y : ', y.shape)

x :  (100, 3)
y :  (100,)


## Split train/test

In [9]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train, y_test = train_test_split(x,y,  test_size=0.2, random_state=123)


print ("x_train :" , x_train.shape )
print ("x_test :", x_test.shape)
print ("y_train :", y_train.shape)
print ("y_test :", y_test.shape)

x_train : (80, 3)
x_test : (20, 3)
y_train : (80,)
y_test : (20,)


## Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
model = lr.fit(x_train, y_train)
print('coef : ', model.coef_)
print('intercept' , model.intercept_)


coef :  [[ 0.01330791  1.5104773  -0.13075747]]
intercept [-13.27218281]


## Model Evaluation

In [11]:
y_pred = model.predict (x_test)
y_pred

array([0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1])

In [12]:
train_accuracy = model.score(x_train,y_train)
print ("Train accuracy: ", train_accuracy)

Train accuracy:  0.8125


In [13]:
test_accuracy = model.score(x_test, y_test)

print ("Test accuracy: ", test_accuracy)

Test accuracy:  0.7


In [15]:
from sklearn.metrics import confusion_matrix
import numpy as np

cm_labels = np.unique(y)
cm_array = confusion_matrix(y_test, y_pred)
# print (cm_array)

cm_df = pd.DataFrame(cm_array, index=cm_labels, columns=cm_labels)
cm_df

Unnamed: 0,0,1
0,8,3
1,3,6


In [16]:
mport matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize = (8,5))

# colormaps : cmap="YlGnBu" , cmap="Greens", cmap="Blues",  cmap="Reds"
sns.heatmap(cm_df, annot=True, cmap="Reds", fmt='d').plot()

SyntaxError: invalid syntax (2147145067.py, line 1)