In [9]:
import numpy as np
import pandas as pd

In [10]:
train = pd.read_csv('hw1data/propublicaTrain.csv')
test = pd.read_csv('hw1data/propublicaTest.csv')

In [11]:
train.head()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
0,0,1,64,0,0,0,0,13,0,1
1,0,1,28,0,0,0,0,1,1,0
2,0,1,32,0,0,0,0,1,1,0
3,1,1,20,0,0,1,1,2,1,0
4,0,1,43,1,0,0,0,8,1,0


In [12]:
train.describe()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
count,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0
mean,0.455964,0.813775,34.409167,0.344612,0.055916,0.089513,0.110631,3.278138,0.645308,0.354692
std,0.498117,0.389335,11.570853,0.475299,0.379495,0.486893,0.464278,4.770756,0.478477,0.478477
min,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,31.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1.0,1.0,42.0,1.0,0.0,0.0,0.0,4.0,1.0,1.0
max,1.0,1.0,83.0,1.0,8.0,13.0,7.0,38.0,1.0,1.0


In [13]:
test.describe()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.4545,0.8015,34.7875,0.332,0.066,0.095,0.111,3.1835,0.6385,0.3615
std,0.49805,0.39897,12.042253,0.471049,0.60235,0.520683,0.484558,4.692423,0.480555,0.480555
min,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,31.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1.0,1.0,42.0,1.0,0.0,0.0,0.0,4.0,1.0,1.0
max,1.0,1.0,96.0,1.0,20.0,12.0,9.0,36.0,1.0,1.0


NOTE:
- two_year_recid (Y), sex, race, c_charge_degree_F are categorical
- c_charge_degree_M is ignored because c_charge_degree_F includes all information about the column

# Training

### Mutlivariate Guassian Distibution fitting

In [14]:
# Preprocessing
def preprocess(df):
    '''Remove c_charge_degree_M column due to redunduncy'''
    return df.drop(labels=['c_charge_degree_M'], axis=1)

train_df = preprocess(train)
test_df = preprocess(test)
train_df.head()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F
0,0,1,64,0,0,0,0,13,0
1,0,1,28,0,0,0,0,1,1
2,0,1,32,0,0,0,0,1,1
3,1,1,20,0,0,1,1,2,1
4,0,1,43,1,0,0,0,8,1


In [21]:
# Split the data into categorical and quantitative
def split_categorical_and_quant(df):
    categorical_cols = ['two_year_recid', 'sex', 'race', 'c_charge_degree_F']
    # Need to include 'two_year_recid' in both because it is the target y
    quantitative_cols = ['two_year_recid', 'age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count']
    return df[categorical_cols], df[quantitative_cols]

categ_train_df, quant_train_df = split_categorical_and_quant(train_df)
categ_test_df, quant_test_df = split_categorical_and_quant(test_df)

In [29]:
# Split the data into y = 0 and y = 1
def split_by_y(df):
    return df.loc[df['two_year_recid'] == 0, :], df.loc[df['two_year_recid'] == 1, :]

categ_0_train_df, categ_1_train_df = split_by_y(categ_train_df)
quant_0_train_df, quant_1_train_df = split_by_y(quant_train_df)
categ_0_test_df, categ_1_test_df = split_by_y(categ_test_df)
quant_0_test_df, quant_1_test_df = split_by_y(quant_test_df)

In [77]:
quant_0_train_df.columns

Index(['two_year_recid', 'age', 'juv_fel_count', 'juv_misd_count',
       'juv_other_count', 'priors_count'],
      dtype='object')

In [48]:
# Find mean vector (mu) and covariance matrix (sigma)
def estimate_mu(quant_df):
    '''Estimates the mean vector (mu) using discrete datapoints'''
    return np.array([quant_df[col].mean() for col in quant_df.columns])

mu0 = estimate_mu(quant_0_train_df)
mu1 = estimate_mu(quant_1_train_df)

In [50]:
mu0, mu1

(array([0.00000000e+00, 3.63498015e+01, 2.47022497e-02, 4.19056021e-02,
        6.21967358e-02, 2.01676224e+00]),
 array([ 1.        , 32.09368421,  0.09315789,  0.14631579,  0.16842105,
         4.78315789]))

In [88]:
def estimate_sigma(quant_df, mu):
    '''
    Estimates the covariance matrix (sigma) using discrete datapoints
    See https://github.com/scikit-learn/scikit-learn/blob/7389dba/sklearn/covariance/empirical_covariance_.py#L50
    '''
    X = np.asarray(quant_df)
    sigma = np.dot(X.T, X) / X.shape[0]
    return sigma

sigma0 = estimate_sigma(quant_0_train_df, mu0)
sigma1 = estimate_sigma(quant_1_train_df, mu1)

In [91]:
# Fit a multivariate Gaussian Distribution to each of Y = 0 case and Y = 1 case
def fit_gaussian(quant_df):
    '''
    return mu, and sigma
    '''
    mu = estimate_mu(quant_df)
    sigma = estimate_sigma(quant_df, mu)
    return mu, sigma

mu0, simga0 = fit_gaussian(quant_0_train_df)
mu1, simga1 = fit_gaussian(quant_1_train_df)

In [93]:
sigma0

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.46934230e+03, 7.79885311e-01, 1.21261579e+00,
        1.56153507e+00, 7.93674460e+01],
       [0.00000000e+00, 7.79885311e-01, 4.58756065e-02, 1.45566828e-02,
        3.52889281e-03, 2.16144685e-01],
       [0.00000000e+00, 1.21261579e+00, 1.45566828e-02, 8.77812086e-02,
        8.82223202e-03, 3.16718130e-01],
       [0.00000000e+00, 1.56153507e+00, 3.52889281e-03, 8.82223202e-03,
        1.32774592e-01, 1.65857962e-01],
       [0.00000000e+00, 7.93674460e+01, 2.16144685e-01, 3.16718130e-01,
        1.65857962e-01, 1.58323776e+01]])

In [None]:
from scipy.stats import multivariate_normal
P_0 = multivariate_normal(mu0, sigma0)
P_1 = multivariate_normal(mu1, sigma1)

### Non-recidivism estimation

### Recidivism estimation

### Categorical Feature as joint conditional probabilities

# Prediction: Take argmax of the two estimations