In [2]:
import os
import pandas as pd
import patsy
import numpy as np

In [3]:
if not os.getcwd().endswith('assignment3'):
    os.chdir(os.path.join('assignment3'))
# read the file
parole = pd.read_csv('parole.csv')

# rename problematic coulmns
parole = parole.rename(columns={'time.served': 'timeServed', 
                                'max.sentence': 'maxSentence', 
                                'multiple.offenses': 'multipleOffenses'})

### Problem 1.1 - number of observations

In [4]:
len(parole.index)

675

### Problem 1.1 - number of voilations

In [5]:
(parole['violator'] == 1).sum()

78

### Problem 2.1 - convert to categorical

I couldn't find any python parallel to R's .asfactor
Instead, we'll use the patsy.dmatrices

### Problem 3.1 - Splitting into a Training and Testing Set

In [7]:
# scikit-learn used the numpy random number generator (?) 
np.random.seed(7)
from sklearn.model_selection import train_test_split
parole_train, parole_test = train_test_split(parole, test_size=0.3)

In [8]:
parole_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 472 entries, 82 to 175
Data columns (total 9 columns):
male                472 non-null int64
race                472 non-null int64
age                 472 non-null float64
state               472 non-null int64
timeServed          472 non-null float64
maxSentence         472 non-null int64
multipleOffenses    472 non-null int64
crime               472 non-null int64
violator            472 non-null int64
dtypes: float64(2), int64(7)
memory usage: 36.9 KB


### Problem 4.1 - Building a Logistic Regression Model

In [10]:
from statsmodels.discrete.discrete_model import Logit

In [189]:
# we will use pats to generate the matrices,
# the C() operator means - treat as categorical 

y, X = patsy.dmatrices('violator ~ male + race + age + C(state) + '
                       'timeServed + maxSentence + multipleOffenses + '
                       'C(crime)', parole_train, return_type='dataframe')
mod1 = Logit(y, X).fit()
mod1.summary2()

Optimization terminated successfully.
         Current function value: 0.246405
         Iterations 8


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.315
Dependent Variable:,violator,AIC:,258.6063
Date:,2017-03-26 00:40,BIC:,312.647
No. Observations:,472,Log-Likelihood:,-116.3
Df Model:,12,LL-Null:,-169.89
Df Residuals:,459,LLR p-value:,2.1568e-17
Converged:,1.0000,Scale:,1.0
No. Iterations:,8.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-5.1002,1.3713,-3.7191,0.0002,-7.7879,-2.4124
C(state)[T.2],0.2289,0.5125,0.4465,0.6552,-0.7757,1.2334
C(state)[T.3],1.2172,0.5506,2.2106,0.0271,0.1380,2.2965
C(state)[T.4],-3.2544,0.6330,-5.1414,0.0000,-4.4950,-2.0138
C(crime)[T.2],0.4879,0.5332,0.9150,0.3602,-0.5572,1.5330
C(crime)[T.3],-0.1386,0.4321,-0.3207,0.7485,-0.9855,0.7084
C(crime)[T.4],-0.6568,0.6431,-1.0212,0.3072,-1.9173,0.6037
male,0.7200,0.4980,1.4457,0.1483,-0.2561,1.6961
race,0.4665,0.3949,1.1813,0.2375,-0.3075,1.2405


The important variables (probability less then 0.05) are "race", "state4" and "multipleOffenses"

### problem 4.2
What can we say based on the coefficient of the multiple.offenses variable? answer: Our model predicts that a parolee who committed multiple offenses has X times higher odds of being a violator than a parolee who did not commit multiple offenses but is otherwise identical

About the answer:
$$yes/no = exp(\beta_0+\beta_1x)$$ is the odds ratio of a parolee to violate the parole (yes) vs not violating the parole (no). If the coefficient for multipleOffenses is $C_{mOf}$ than he has chances greater by $exp(C_{mOf})$ of violating the parole.

In [190]:
mOf = mod1.params['multipleOffenses']
np.exp(mOf)

7.284746722326692

### Problem 4.3 - Building a Logistic Regression Model
Consider a parolee who is male, of white race, aged 50 years at prison release, from the state of Maryland, served 3 months, had a maximum sentence of 12 months, did not commit multiple offenses, and committed a larceny.

Answer: this is an easy question. As a reminder, the probability of violating the parole is: $$ P(Y=1) = \frac{1}{1+exp(-1*( ... ))}$$

### Problem 5.1 - Evaluating the Model on the Testing Set
Predict probabilities for parolees in the testing set. What is the maximal?

In [191]:
y_test, X_test = patsy.dmatrices('violator ~ male + race + age + C(state) + '
                       'timeServed + maxSentence + multipleOffenses + '
                       'C(crime)', parole_test, return_type='dataframe')
precitions_mod1 = mod1.predict(exog=X_test)
precitions_mod1.max()

0.87611561665337234

### Problem 5.2 - Evaluating the Model on the Testing Set
evaluate the model's predictions on the test set using a threshold of 0.5. What is the model's sensitivity? specificity? accuracy?

Let's generate the confusion table:

In [192]:
from sklearn.metrics import confusion_matrix

thresh = 0.5
con_matx = confusion_matrix(parole_test['violator'].values, precitions_mod1 > thresh)
con_matx

array([[169,  11],
       [ 15,   8]])

In [193]:
TP = con_matx[0, 1]
TN = con_matx[0, 0]
FP = con_matx[1, 1]
FN = con_matx[1, 0]

In [194]:
# sensitivity
(TP) / (TP + FN)

0.42307692307692307

In [195]:
# specificity
TN / (TN + FP)

0.95480225988700562

In [196]:
# accuracy
(TP + TN) / len(precitions_mod1.index)

0.88669950738916259

### Problem 5.3 - What is the accuracy of a simple model that predicts that every parolee is a non-violator?

In [197]:
# In that case:
# BTW we choose > 1 because it is impossible for probability to be greater than 1 
con_matx = confusion_matrix(parole_test['violator'].values, precitions_mod1 > 1)

TP = con_matx[0, 1]
TN = con_matx[0, 0]
FP = con_matx[1, 1]
FN = con_matx[1, 0]

# the accuracy:
(TP + TN) / len(precitions_mod1.index)

0.88669950738916259

Comment: this probably could have been done more efficiently

### Problem 5.6 - what is the AUC value for the model?

In [198]:
from sklearn.metrics import roc_auc_score as auc
auc(y_true=parole_test['violator'].values, y_score=precitions_mod1)

0.64335748792270531