In [None]:
# Summary/Review/One More Thing about regression

We have been doing what are known as REGRESSION models

A Regression task is when you try to predict a *continuous* variable using other variables.

# Linear Regression - A Review

In [37]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score, mean_squared_error

In [30]:
# Review: Linear Regression Models
# Source of the graduate-admissions CSV, kept in one named place.
ADMISSIONS_URL = "https://stats.idre.ucla.edu/stat/data/binary.csv"
admissions = pd.read_csv(ADMISSIONS_URL)
# Print the column names, non-null counts and dtypes.
admissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   admit   400 non-null    int64  
 1   gre     400 non-null    int64  
 2   gpa     400 non-null    float64
 3   rank    400 non-null    int64  
dtypes: float64(1), int64(3)
memory usage: 12.6 KB


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
admissions.describe()

Unnamed: 0,admit,gre,gpa,rank
count,400.0,400.0,400.0,400.0
mean,0.3175,587.7,3.3899,2.485
std,0.466087,115.516536,0.380567,0.94446
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.395,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


## Question: Which columns are categorical and which are continuous?

## Regression

A Regression task is when you try to predict a *continuous* variable using other variables, which can be of any type.

In [33]:

from sklearn.model_selection import train_test_split
#version 3
def statsmodels_train_test_split(df, stratify=None, **kwargs):
    """Split a DataFrame into train and test DataFrames for formula-style models.

    The frame is separated into a target column and the remaining columns,
    handed to ``train_test_split``, and then glued back together so each
    returned frame holds the predictors followed by the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset.
    stratify : pandas.Series, optional
        When given, this Series is used both as the target and as the
        stratification variable, and the column named ``stratify.name`` is
        dropped from the predictors. When omitted, the first column of
        ``df`` is used as the target and no stratification is requested.
    **kwargs
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, each with the target as its last column.
    """
    if stratify is not None:
        target = stratify
        features = df.drop(columns=stratify.name)
        pieces = train_test_split(features, target, stratify=target, **kwargs)
    else:
        first_column = df.columns[0]
        target = df.iloc[:, 0]
        features = df.drop(columns=first_column)
        pieces = train_test_split(features, target, **kwargs)

    X_train, X_test, y_train, y_test = pieces

    # Re-attach the target so the result can be passed as `data=` to a formula.
    train_frame = pd.concat([X_train, y_train], axis=1)
    test_frame = pd.concat([X_test, y_test], axis=1)
    return train_frame, test_frame


In [34]:
# Fix the seed so the split, and every number computed from it, is reproducible
# on Restart & Run All. random_state is forwarded to train_test_split via **kwargs.
# NOTE: the outputs saved in this notebook came from an unseeded split, so a
# re-run will produce somewhat different numbers from the ones recorded below.
train, test = statsmodels_train_test_split(admissions, random_state=42)

# Let's try to model gre score versus the other variables, e.g.,
# 'gre ~ C(admit) + gpa'

# There is a deep connection between linear regression and the normal distribution. I'm going to mention it now, but not do anything with it yet.
# Here it is:

## When we do a linear regression of the form we wrote above, we are making the assumption that *gre* is distributed as a normal random variable!!!

In [50]:
# What are the value counts of gre in this dataset?
# i.e. how many rows share each distinct gre score.

admissions['gre'].value_counts()

620    30
580    29
540    27
800    25
520    24
560    24
660    24
600    23
700    22
640    21
500    21
680    20
480    16
460    14
720    11
400    11
740    11
440    10
380     8
420     7
780     5
760     5
340     4
360     4
300     3
220     1
Name: gre, dtype: int64

### Let's treat gre as continuous for now, though this is debatable!
### Also notice that all gre scores are greater than zero. Since they are generally not near zero, we'll allow this.

In [52]:
# Model gre as a linear function of admit and gpa; C(admit) marks admit as a
# categorical predictor in the formula.
formula = 'gre ~ C(admit) + gpa' #questionable assumption, but let's try it anyway
# Fit ordinary least squares on the training split only.
lm = smf.ols(formula=formula, data=train).fit()
lm.summary()

0,1,2,3
Dep. Variable:,gre,R-squared:,0.169
Model:,OLS,Adj. R-squared:,0.163
Method:,Least Squares,F-statistic:,30.11
Date:,"Tue, 08 Sep 2020",Prob (F-statistic):,1.24e-12
Time:,20:38:28,Log-Likelihood:,-1817.8
No. Observations:,300,AIC:,3642.0
Df Residuals:,297,BIC:,3653.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,216.2298,53.231,4.062,0.000,111.472,320.988
C(admit)[T.1],37.8320,13.248,2.856,0.005,11.760,63.904
gpa,105.2610,15.806,6.659,0.000,74.155,136.367

0,1,2,3
Omnibus:,0.155,Durbin-Watson:,2.025
Prob(Omnibus):,0.925,Jarque-Bera (JB):,0.264
Skew:,-0.04,Prob(JB):,0.876
Kurtosis:,2.878,Cond. No.,32.8


In [53]:
# Predict gre for the held-out test rows with the first model.
y_test_pred = lm.predict(test)

In [54]:
# R-squared on the test split (the summary above reports it for the training split).
r2_score(test['gre'], y_test_pred)

0.12239433217408768

In [55]:
# Mean squared error on the test split, in squared gre points.
mean_squared_error(test['gre'], y_test_pred)

12548.88344424272

If I make a different model, my r-squared and mean_squared_error will be different
In general, if I add a variable I would expect my r-squared to go up.

**Adjusted R-Squared** is generally considered SUPERIOR to R-Squared, because it adjusts for the number of variables you've added: it only goes up when you add a new variable AND the fit gets better by more than you'd expect from simply adding another variable!

In [56]:
formula = 'gre ~ C(admit) + gpa + C(rank)' # adds rank as a categorical variable
# Refit on the same training split so the two summaries are directly comparable.
lm2 = smf.ols(formula=formula, data=train).fit()
lm2.summary()

0,1,2,3
Dep. Variable:,gre,R-squared:,0.186
Model:,OLS,Adj. R-squared:,0.172
Method:,Least Squares,F-statistic:,13.4
Date:,"Tue, 08 Sep 2020",Prob (F-statistic):,8.82e-12
Time:,20:39:06,Log-Likelihood:,-1814.7
No. Observations:,300,AIC:,3641.0
Df Residuals:,294,BIC:,3664.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,209.8145,56.264,3.729,0.000,99.084,320.545
C(admit)[T.1],33.1256,13.708,2.417,0.016,6.148,60.104
C(rank)[T.2],13.3080,17.965,0.741,0.459,-22.048,48.664
C(rank)[T.3],-23.0318,18.914,-1.218,0.224,-60.255,14.192
C(rank)[T.4],-4.9769,22.249,-0.224,0.823,-48.764,38.810
gpa,108.3309,15.847,6.836,0.000,77.144,139.518

0,1,2,3
Omnibus:,0.763,Durbin-Watson:,2.01
Prob(Omnibus):,0.683,Jarque-Bera (JB):,0.85
Skew:,-0.041,Prob(JB):,0.654
Kurtosis:,2.752,Cond. No.,35.6


In [None]:
# we see that both R-Squared and Adjusted R-squared went up on the TRAINING data

In [57]:
# Let's check the test data for the second model.
# Use a separate name so the first model's predictions (y_test_pred) are not
# overwritten; otherwise re-running the earlier scoring cells would silently
# score lm2 instead of lm.
y_test_pred_lm2 = lm2.predict(test)
r2_score(test['gre'], y_test_pred_lm2)


0.10453063162492138

In [None]:
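# So, on the TEST data r2_score actually went DOWN in the recorded run (0.122 -> 0.105), even though on the TRAINING data R-squared went up (0.169 -> 0.186). Adjusted R-squared went up on the training data as well, but not as much (0.163 -> 0.172).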

A researcher is interested in how predictor variables, such as 
1) GRE (Graduate Record Exam scores), 
2) GPA (grade point average) and 
3) rank/prestige of the undergraduate institution
affect admission into graduate school.

#The response variable "admission to grad school"  is a binary variable.


The only two choices are admit/don’t admit.
Values are 0 = no admit, 1 = admit


## THIS TASK IS CALLED CLASSIFICATION. 
CLASSIFICATION => The target output is one of a limited number of categories.
In this problem we only have two possible targets: no admit and admit



CHECK FOR UNDERSTANDING: Why is Classification different from Regression, a/k/a Linear Regression?

Note: The fact that you solve CLASSIFICATION problems with a technique called LOGISTIC REGRESSION is unfortunate, but a fact of life.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   admit   400 non-null    int64  
 1   gre     400 non-null    int64  
 2   gpa     400 non-null    float64
 3   rank    400 non-null    int64  
dtypes: float64(1), int64(3)
memory usage: 12.6 KB


In [5]:
# `df` is not defined in any of the cells above, so this section would fail on a
# fresh kernel. Bind it to a copy of the admissions data loaded earlier; a copy
# is used because later cells add columns to `df`.
df = admissions.copy()
df.describe()

Unnamed: 0,admit,gre,gpa,rank
count,400.0,400.0,400.0,400.0
mean,0.3175,587.7,3.3899,2.485
std,0.466087,115.516536,0.380567,0.94446
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.395,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


### admit: is the Categorical target we want to model. 
### 0 = no admit, 1 = admit

#gre is numerical/continuous

#gpa is numerical/continuous

#rank is categorical: 1,2,3, or 4


## You should *always* explicitly separate out Categorical factors.
## Notice that this model has more coefficients. (why?)

In [7]:
# Logistic regression of admit on gre, gpa and rank (rank as categorical).
# NOTE: this is fitted on all of df, with no train/test split.
fitted_model = smf.logit(formula='admit ~ gre + gpa + C(rank)', data=df).fit()
fitted_model.summary()

Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6


0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,Logit,Df Residuals:,394.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 08 Sep 2020",Pseudo R-squ.:,0.08292
Time:,17:49:01,Log-Likelihood:,-229.26
converged:,True,LL-Null:,-249.99
Covariance Type:,nonrobust,LLR p-value:,7.578e-08

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.9900,1.140,-3.500,0.000,-6.224,-1.756
C(rank)[T.2],-0.6754,0.316,-2.134,0.033,-1.296,-0.055
C(rank)[T.3],-1.3402,0.345,-3.881,0.000,-2.017,-0.663
C(rank)[T.4],-1.5515,0.418,-3.713,0.000,-2.370,-0.733
gre,0.0023,0.001,2.070,0.038,0.000,0.004
gpa,0.8040,0.332,2.423,0.015,0.154,1.454


In [19]:
#CONFUSION MATRIX. HOW WELL DID YOUR MODEL PREDICT THE REALITY OF YOUR DATA?
# pred_table[i,j] refers to the number of times “i” was observed and the model predicted “j”.
# Rows are the observed class, columns the predicted class; the default threshold is 0.5.
fitted_model.pred_table()

array([[254.,  19.],
       [ 97.,  30.]])

### The return of true positive, false positive, true negative, false negative! Work out which number is which!

true negative =

true positive = 

false positive =

false negative =

### Note: pred_table[i,j] refers to the number of times “i” was observed and the model predicted “j”. 

In [6]:
#Will our model perform better if I make the data more "uniform"?
#what if I try to make the GRE and GPA columns more like a Z score or standard normal? 
#Will that make it easier for the numerical solver in statsmodels to find a better answer?

In [10]:
# Here are the means and standard deviations of the two columns
# (these match the mean/std rows shown by df.describe()).
gre_mean = df['gre'].mean()
gre_std = df['gre'].std()

gpa_mean = df['gpa'].mean()
gpa_std = df['gpa'].std()

# Show each column's mean followed by its standard deviation.
print(gre_mean, gre_std)
print(gpa_mean, gpa_std)

587.7 115.51653637223805
3.3899 0.3805667716303841


In [11]:
#let's add two new columns to our dataset to reflect the gpa and gre score on a standardized basis

In [12]:
# NOTE(review): this repeats the computation from the earlier mean/std cell;
# df has not changed in between, so the values are the same.
gre_mean = df['gre'].mean()
gre_std = df['gre'].std()

gpa_mean = df['gpa'].mean()
gpa_std = df['gpa'].std()

In [13]:
# Standardize each column: subtract its mean and divide by its standard deviation.
# Vectorized Series arithmetic replaces the per-element Python lambda via .apply;
# it produces the same values and is simpler and faster.
df['gre_zscore'] = (df['gre'] - gre_mean) / gre_std
df['gpa_zscore'] = (df['gpa'] - gpa_mean) / gpa_std


In [14]:
df.describe() #now we've transformed the numerical data columns to mean zero and variance 1.

Unnamed: 0,admit,gre,gpa,rank,gre_zscore,gpa_zscore
count,400.0,400.0,400.0,400.0,400.0,400.0
mean,0.3175,587.7,3.3899,2.485,-3.907985e-16,2.198242e-16
std,0.466087,115.516536,0.380567,0.94446,1.0,1.0
min,0.0,220.0,2.26,1.0,-3.183094,-2.968993
25%,0.0,520.0,3.13,2.0,-0.5860633,-0.6829288
50%,0.0,580.0,3.395,2.0,-0.06665712,0.01340106
75%,1.0,660.0,3.67,3.0,0.6258844,0.7360075
max,1.0,800.0,4.0,4.0,1.837832,1.603135


In [None]:
# Answer: No. The model will not perform better. BUT, the coefficients are different.

In [15]:
# Same logistic model, but with the standardized gre/gpa columns as predictors.
fitted_model2 = smf.logit(formula='admit ~ gre_zscore + gpa_zscore + C(rank)', data=df).fit()
fitted_model2.summary()

Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6


0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,Logit,Df Residuals:,394.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 08 Sep 2020",Pseudo R-squ.:,0.08292
Time:,17:51:29,Log-Likelihood:,-229.26
converged:,True,LL-Null:,-249.99
Covariance Type:,nonrobust,LLR p-value:,7.578e-08

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0664,0.266,0.250,0.802,-0.454,0.587
C(rank)[T.2],-0.6754,0.316,-2.134,0.033,-1.296,-0.055
C(rank)[T.3],-1.3402,0.345,-3.881,0.000,-2.017,-0.663
C(rank)[T.4],-1.5515,0.418,-3.713,0.000,-2.370,-0.733
gre_zscore,0.2616,0.126,2.070,0.038,0.014,0.509
gpa_zscore,0.3060,0.126,2.423,0.015,0.058,0.553


In [16]:
# Shown again for side-by-side comparison: the fit statistics and the rank
# coefficients are the same; only the Intercept, gre and gpa coefficients differ.
fitted_model.summary() # The original model

0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,Logit,Df Residuals:,394.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 08 Sep 2020",Pseudo R-squ.:,0.08292
Time:,17:51:49,Log-Likelihood:,-229.26
converged:,True,LL-Null:,-249.99
Covariance Type:,nonrobust,LLR p-value:,7.578e-08

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.9900,1.140,-3.500,0.000,-6.224,-1.756
C(rank)[T.2],-0.6754,0.316,-2.134,0.033,-1.296,-0.055
C(rank)[T.3],-1.3402,0.345,-3.881,0.000,-2.017,-0.663
C(rank)[T.4],-1.5515,0.418,-3.713,0.000,-2.370,-0.733
gre,0.0023,0.001,2.070,0.038,0.000,0.004
gpa,0.8040,0.332,2.423,0.015,0.154,1.454


In [17]:
# Confusion matrix of the standardized model; it is identical to the original model's table.
fitted_model2.pred_table()

array([[254.,  19.],
       [ 97.,  30.]])

In [18]:
fitted_model2.pred_table??

[0;31mSignature:[0m [0mfitted_model2[0m[0;34m.[0m[0mpred_table[0m[0;34m([0m[0mthreshold[0m[0;34m=[0m[0;36m0.5[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mpred_table[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mthreshold[0m[0;34m=[0m[0;36m.5[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""[0m
[0;34m        Prediction table[0m
[0;34m[0m
[0;34m        Parameters[0m
[0;34m        ----------[0m
[0;34m        threshold : scalar[0m
[0;34m            Number between 0 and 1. Threshold above which a prediction is[0m
[0;34m            considered 1 and below which a prediction is considered 0.[0m
[0;34m[0m
[0;34m        Notes[0m
[0;34m        -----[0m
[0;34m        pred_table[i,j] refers to the number of times "i" was observed and[0m
[0;34m        the model predicted "j". Correct predictions are along the diagonal.[0m
[0;34m        """[0m[0;34m[0m
[0;34m[0m        [0mmodel[0m [0;34m=