# DS-SF-30 | Unit Project 3: Machine Learning Modeling

In this project, you will perform a logistic regression on the admissions data we've been working with in Unit Projects 1 and 2.

In [1]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
pd.set_option('display.notebook_repr_html', True)

import statsmodels.formula.api as smf

from sklearn import linear_model

In [2]:
df = pd.read_csv(os.path.join('..', '..', 'dataset', 'dataset-ucla-admissions.csv'))
df.dropna(inplace = True)

df

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3.0
1,1,660.0,3.67,3.0
2,1,800.0,4.00,1.0
3,1,640.0,3.19,4.0
4,0,520.0,2.93,4.0
...,...,...,...,...
395,0,620.0,4.00,2.0
396,0,560.0,3.04,3.0
397,0,460.0,2.63,2.0
398,0,700.0,3.65,2.0


## Part A.  Frequency Table

> ### Question 1.  Create a frequency table for `prestige` and whether an applicant was admitted.

In [3]:
# TODO
pd.crosstab(df.prestige,df.admit)

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,28,33
2.0,95,53
3.0,93,28
4.0,55,12


In [4]:
# TODO - normalizes over all possibilities - % of total students
pd.crosstab(df.prestige,df.admit,normalize=True)

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.070529,0.083123
2.0,0.239295,0.133501
3.0,0.234257,0.070529
4.0,0.138539,0.030227


In [5]:
# TODO - normalizes over admittance - sums probabilities
pd.crosstab(df.prestige,df.admit,normalize=True).sum()

admit
0    0.68262
1    0.31738
dtype: float64

In [6]:
# TODO - normalizes within rows - proportional probabilities within indices
pd.crosstab(df.prestige,df.admit,normalize='index')

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.459016,0.540984
2.0,0.641892,0.358108
3.0,0.768595,0.231405
4.0,0.820896,0.179104


## Part B.  Variable Transformations

> ### Question 2.  Create a one-hot encoding for `prestige`.

In [7]:
# TODO
tmp = pd.get_dummies(df.prestige)
tmp
tmp.loc[:,4.0]

0      0
1      0
2      0
3      1
4      1
      ..
395    0
396    0
397    0
398    0
399    0
Name: 4.0, dtype: uint8

In [8]:
# recasting prestige as int
df.prestige = df.prestige.astype(int)
df

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.00,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4
...,...,...,...,...
395,0,620.0,4.00,2
396,0,560.0,3.04,3
397,0,460.0,2.63,2
398,0,700.0,3.65,2


In [9]:
one_hot = pd.get_dummies(df.prestige, prefix = 'prestige')
one_hot

Unnamed: 0,prestige_1,prestige_2,prestige_3,prestige_4
0,0,0,1,0
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
395,0,1,0,0
396,0,0,1,0
397,0,1,0,0
398,0,1,0,0


In [10]:
df = df.join(one_hot)
df

Unnamed: 0,admit,gre,gpa,prestige,prestige_1,prestige_2,prestige_3,prestige_4
0,0,380.0,3.61,3,0,0,1,0
1,1,660.0,3.67,3,0,0,1,0
2,1,800.0,4.00,1,1,0,0,0
3,1,640.0,3.19,4,0,0,0,1
4,0,520.0,2.93,4,0,0,0,1
...,...,...,...,...,...,...,...,...
395,0,620.0,4.00,2,0,1,0,0
396,0,560.0,3.04,3,0,0,1,0
397,0,460.0,2.63,2,0,1,0,0
398,0,700.0,3.65,2,0,1,0,0


> ### Question 3.  How many of these "one-hot"/dummy binary variables do we need for modeling?

Answer: Three (n-1)

> ### Question 4.  Why are we doing this?

Answer:

> ### Question 5.  Add all these binary variables in the dataset and remove the now redundant `prestige` feature.

In [11]:
# TODO
df.drop('prestige', inplace=True, axis = 1)
df

Unnamed: 0,admit,gre,gpa,prestige_1,prestige_2,prestige_3,prestige_4
0,0,380.0,3.61,0,0,1,0
1,1,660.0,3.67,0,0,1,0
2,1,800.0,4.00,1,0,0,0
3,1,640.0,3.19,0,0,0,1
4,0,520.0,2.93,0,0,0,1
...,...,...,...,...,...,...,...
395,0,620.0,4.00,0,1,0,0
396,0,560.0,3.04,0,0,1,0
397,0,460.0,2.63,0,1,0,0
398,0,700.0,3.65,0,1,0,0


## Part C.  Hand calculating odds ratios

Let's develop our intuition about expected outcomes by hand calculating odds ratios.

> ### Question 6.  Create a frequency table for `prestige = 1` and whether an applicant was admitted.

In [12]:
# TODO
df2 = df[['admit','prestige_1']]
df2
pd.crosstab(df2.prestige_1, df2.admit,normalize='index')

admit,0,1
prestige_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.723214,0.276786
1,0.459016,0.540984


In [13]:
df3 = df[df.prestige_1==1]
df3######

Unnamed: 0,admit,gre,gpa,prestige_1,prestige_2,prestige_3,prestige_4
2,1,800.0,4.00,1,0,0,0
6,1,560.0,2.98,1,0,0,0
11,0,440.0,3.22,1,0,0,0
12,1,760.0,4.00,1,0,0,0
14,1,700.0,4.00,1,0,0,0
...,...,...,...,...,...,...,...
368,0,580.0,4.00,1,0,0,0
372,1,680.0,2.42,1,0,0,0
373,1,620.0,3.37,1,0,0,0
383,0,660.0,4.00,1,0,0,0


> ### Question 7.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the most prestigious undergraduate schools.

In [14]:
# TODO
tst = pd.crosstab(df2.prestige_1, df2.admit,normalize='index')
p = tst.loc[1,1]
odds_1 = p / (1-p)
odds_1

1.1785714285714288

> ### Question 8.  Now calculate the odds of admission for undergraduates who did not attend a #1 ranked college.

In [15]:
df2 = df[['admit','prestige_1']]
df2
pd.crosstab(df2.prestige_1, df2.admit,normalize='index')
p = tst.loc[0,1]
p
odds_0 = p / (1-p)
odds_0

0.38271604938271608

> ### Question 9.  Finally, what's the odds ratio?

In [16]:
# TODO 
odds_1/odds_0

3.0794930875576041

> ### Question 10.  Write this finding in a sentence.

Answer: There is a 3x chance of getting into the university if you are in a top school, as compared to not being in a top school.

> ### Question 11.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the least prestigious undergraduate schools.  Then calculate their odds ratio of being admitted to UCLA.  Finally, write this finding in a sentence.

In [17]:
# TODO
dfbad = df[['admit','prestige_4']]
tst = pd.crosstab(dfbad.prestige_4, dfbad.admit,normalize='index')
p = tst.loc[1,1]
odds_1 = p / (1-p)
odds_1

0.21818181818181817

Answer: Students who came from the least prestigious schools have a 22% chance of getting in UCLA.

## Part C. Analysis using `statsmodels`

> ### Question 12.  Fit a logistic regression model predicting admission into UCLA using `gre`, `gpa`, and the `prestige` of the undergraduate schools.  Use the highest prestige undergraduate schools as your reference point.

In [None]:
# TODO

> ### Question 13.  Print the model's summary results.

In [None]:
# TODO

> ### Question 14.  What are the odds ratios of the different features and their 95% confidence intervals?

In [None]:
# TODO

> ### Question 15.  Interpret the odds ratio for `prestige = 2`.

Answer:

> ### Question 16.  Interpret the odds ratio of `gpa`.

Answer:

> ### Question 17.  Assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [None]:
# TODO

Answer:

## Part D. Moving the model from `statsmodels` to `sklearn`

> ### Question 18.  Let's assume we are satisfied with our model.  Remodel it (same features) using `sklearn`.  When creating the logistic regression model with `LogisticRegression(C = 10 ** 2)`.

In [59]:
# TODO
from sklearn.linear_model import LogisticRegression
lm = LogisticRegression(C=10**2)
lm

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [77]:
x= df[['gre','gpa','prestige_2','prestige_3','prestige_4']]
y= df['admit']
lm.fit(x,y)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

> ### Question 19.  What are the odds ratios for the different variables and how do they compare with the odds ratios calculated with `statsmodels`?

In [86]:
dat = pd.DataFrame(data = lm.coef_)
dat.columns = x.columns
dat = dat.T
dat.columns = ['coef']
dat['odds_ratio'] = dat.coef.apply(np.exp)
dat.head()
#np.exp(lm.coef_)

Unnamed: 0,coef,odds_ratio
gre,0.002158,1.002161
gpa,0.673155,1.960413
prestige_2,-0.628822,0.533219
prestige_3,-1.252227,0.285867
prestige_4,-1.568792,0.208297


In [None]:
# TODO

Answer:

> ### Question 20.  Again, assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [90]:
# TODO
sample = pd.DataFrame.from_dict({'gre':800,'gpa':4.0,'prestige_2':1,'prestige_3':0,'prestige_4':0},orient = 'index')
sample = sample.T
predictions = lm.predict_proba(sample)
print(predictions)
print(lm.predict(sample))

[[ 0.43153702  0.56846298]]
[1]


In [91]:
sample = pd.DataFrame.from_dict({'gre':800,'gpa':4.0,'prestige_2':0,'prestige_3':1,'prestige_4':0},orient = 'index')
sample = sample.T
predictions = lm.predict_proba(sample)
print(predictions)
print(lm.predict(sample))

[[ 0.58608936  0.41391064]]
[0]


In [92]:
sample = pd.DataFrame.from_dict({'gre':800,'gpa':4.0,'prestige_2':0,'prestige_3':0,'prestige_4':1},orient = 'index')
sample = sample.T
predictions = lm.predict_proba(sample)
print(predictions)
print(lm.predict(sample))

[[ 0.66024514  0.33975486]]
[0]


Answer: