In [4]:
import numpy as np
import pandas as pd

In [5]:
train = pd.read_csv('hw1data/propublicaTrain.csv')
test = pd.read_csv('hw1data/propublicaTest.csv')

In [6]:
train.head()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
0,0,1,64,0,0,0,0,13,0,1
1,0,1,28,0,0,0,0,1,1,0
2,0,1,32,0,0,0,0,1,1,0
3,1,1,20,0,0,1,1,2,1,0
4,0,1,43,1,0,0,0,8,1,0


In [7]:
train.describe()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
count,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0
mean,0.455964,0.813775,34.409167,0.344612,0.055916,0.089513,0.110631,3.278138,0.645308,0.354692
std,0.498117,0.389335,11.570853,0.475299,0.379495,0.486893,0.464278,4.770756,0.478477,0.478477
min,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,31.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1.0,1.0,42.0,1.0,0.0,0.0,0.0,4.0,1.0,1.0
max,1.0,1.0,83.0,1.0,8.0,13.0,7.0,38.0,1.0,1.0


In [8]:
test.describe()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.4545,0.8015,34.7875,0.332,0.066,0.095,0.111,3.1835,0.6385,0.3615
std,0.49805,0.39897,12.042253,0.471049,0.60235,0.520683,0.484558,4.692423,0.480555,0.480555
min,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,31.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1.0,1.0,42.0,1.0,0.0,0.0,0.0,4.0,1.0,1.0
max,1.0,1.0,96.0,1.0,20.0,12.0,9.0,36.0,1.0,1.0


NOTE:
- two_year_recid (Y), sex, race, c_charge_degree_F are categorical
- c_charge_degree_M is ignored because c_charge_degree_F includes all information about the column

# Training

### Mutlivariate Guassian Distibution fitting

In [9]:
# Preprocessing
def preprocess(df):
    '''Remove c_charge_degree_M column due to redunduncy'''
    return df.drop(labels=['c_charge_degree_M'], axis=1)

train_df = preprocess(train)
test_df = preprocess(test)
train_df.head()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F
0,0,1,64,0,0,0,0,13,0
1,0,1,28,0,0,0,0,1,1
2,0,1,32,0,0,0,0,1,1
3,1,1,20,0,0,1,1,2,1
4,0,1,43,1,0,0,0,8,1


In [10]:
# Split the data into categorical and quantitative
def split_categorical_and_quant(df):
    categorical_cols = ['two_year_recid', 'sex', 'race', 'c_charge_degree_F']
    # Need to include 'two_year_recid' in both because it is the target y
    quantitative_cols = ['two_year_recid', 'age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count']
    return df[categorical_cols], df[quantitative_cols]

categ_train_df, quant_train_df = split_categorical_and_quant(train_df)
categ_test_df, quant_test_df = split_categorical_and_quant(test_df)

In [11]:
# Split the data into y = 0 and y = 1
def split_by_y(df):
    return df.loc[df['two_year_recid'] == 0, :], df.loc[df['two_year_recid'] == 1, :]

categ_0_train_df, categ_1_train_df = split_by_y(categ_train_df)
quant_0_train_df, quant_1_train_df = split_by_y(quant_train_df)
categ_0_test_df, categ_1_test_df = split_by_y(categ_test_df)
quant_0_test_df, quant_1_test_df = split_by_y(quant_test_df)

In [55]:
categ_0_train_df.columns

Index(['two_year_recid', 'sex', 'race', 'c_charge_degree_F'], dtype='object')

In [56]:
# Split the label from features
def split_y(df):
    X = df.loc[:, df.columns != 'two_year_recid']
    y = df['two_year_recid']
    return X, y
quant_0_train_X, quant_0_train_y = split_y(quant_0_train_df)
quant_1_train_X, quant_1_train_y = split_y(quant_1_train_df)

In [61]:
quant_1_train_y.head()

3     1
5     1
9     1
10    1
18    1
Name: two_year_recid, dtype: int64

In [62]:
# Find mean vector (mu) and covariance matrix (sigma)
def estimate_mu(quant_df):
    '''Estimates the mean vector (mu) using discrete datapoints'''
    return np.array([quant_df[col].mean() for col in quant_df.columns])

mu0 = estimate_mu(quant_0_train_X)
mu1 = estimate_mu(quant_1_train_X)

In [63]:
mu0, mu1

(array([3.63498015e+01, 2.47022497e-02, 4.19056021e-02, 6.21967358e-02,
        2.01676224e+00]),
 array([32.09368421,  0.09315789,  0.14631579,  0.16842105,  4.78315789]))

In [64]:
def estimate_sigma(quant_X, mu):
    '''
    Estimates the covariance matrix (sigma) using discrete datapoints
    See https://github.com/scikit-learn/scikit-learn/blob/7389dba/sklearn/covariance/empirical_covariance_.py#L50
    '''
    X = np.asarray(quant_X)
    print(mu)
    print(X.T)
    print(X)
    X = np.array([row - mu for row in X])
    # May need to do row by row substraction from X and then perform matrix multiplication
    sigma = X.T @ X / X.shape[0]
    return sigma

sigma0 = estimate_sigma(quant_0_train_X, mu0)
sigma1 = estimate_sigma(quant_1_train_X, mu1)

[3.63498015e+01 2.47022497e-02 4.19056021e-02 6.21967358e-02
 2.01676224e+00]
[[64 28 32 ... 26 24 45]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [13  1  1 ...  0  1  0]]
[[64  0  0  0 13]
 [28  0  0  0  1]
 [32  0  0  0  1]
 ...
 [26  0  0  0  0]
 [24  0  0  0  1]
 [45  0  0  0  0]]
[32.09368421  0.09315789  0.14631579  0.16842105  4.78315789]
[[20 22 23 ... 35 34 27]
 [ 0  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 2  5  3 ...  2  2  4]]
[[20  0  1  1  2]
 [22  0  0  0  5]
 [23  0  0  0  3]
 ...
 [35  0  0  0  2]
 [34  0  0  0  2]
 [27  0  0  0  4]]


In [65]:
sigma1

array([[ 1.07076486e+02, -2.77148476e-01, -7.43707479e-01,
        -9.75252078e-01,  1.54682094e+01],
       [-2.77148476e-01,  2.59216343e-01,  3.21590028e-02,
         1.95734072e-02,  5.09674238e-01],
       [-7.43707479e-01,  3.21590028e-02,  4.11223269e-01,
         1.03778393e-01,  9.24359003e-01],
       [-9.75252078e-01,  1.95734072e-02,  1.03778393e-01,
         3.12686981e-01,  3.45468144e-01],
       [ 1.54682094e+01,  5.09674238e-01,  9.24359003e-01,
         3.45468144e-01,  3.17035058e+01]])

In [67]:
# Fit a multivariate Gaussian Distribution to each of Y = 0 case and Y = 1 case
def fit_gaussian(quant_X):
    '''
    return mu, and sigma
    '''
    mu = estimate_mu(quant_X)
    sigma = estimate_sigma(quant_X, mu)
    return mu, sigma

mu0, simga0 = fit_gaussian(quant_0_train_X)
mu1, simga1 = fit_gaussian(quant_1_train_X)

[3.63498015e+01 2.47022497e-02 4.19056021e-02 6.21967358e-02
 2.01676224e+00]
[[64 28 32 ... 26 24 45]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [13  1  1 ...  0  1  0]]
[[64  0  0  0 13]
 [28  0  0  0  1]
 [32  0  0  0  1]
 ...
 [26  0  0  0  0]
 [24  0  0  0  1]
 [45  0  0  0  0]]
[32.09368421  0.09315789  0.14631579  0.16842105  4.78315789]
[[20 22 23 ... 35 34 27]
 [ 0  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 2  5  3 ...  2  2  4]]
[[20  0  1  1  2]
 [22  0  0  0  5]
 [23  0  0  0  3]
 ...
 [35  0  0  0  2]
 [34  0  0  0  2]
 [27  0  0  0  4]]


In [68]:
sigma1

array([[ 1.07076486e+02, -2.77148476e-01, -7.43707479e-01,
        -9.75252078e-01,  1.54682094e+01],
       [-2.77148476e-01,  2.59216343e-01,  3.21590028e-02,
         1.95734072e-02,  5.09674238e-01],
       [-7.43707479e-01,  3.21590028e-02,  4.11223269e-01,
         1.03778393e-01,  9.24359003e-01],
       [-9.75252078e-01,  1.95734072e-02,  1.03778393e-01,
         3.12686981e-01,  3.45468144e-01],
       [ 1.54682094e+01,  5.09674238e-01,  9.24359003e-01,
         3.45468144e-01,  3.17035058e+01]])

In [69]:
from scipy.stats import multivariate_normal
# Need to fix singularity of sigma0 (multivariate_normal requires invertible covariance matrix) -> fix the first row 
P_0 = multivariate_normal(mu0, sigma0)
P_1 = multivariate_normal(mu1, sigma1)

### Categorical Feature as joint conditional probabilities

In [None]:
'''
Count all points with 0,0,0 or 0,0,1, etc. for categorical ones
and multiply with that particular point's P_0 or P_1
'''
def calculate_cat_pr(df):
    '''
    Take df and return dictionary that maps key (indexes) and
    values (probabilities).
    '''

def get_cat_pr(index):
    '''
    Take index of a row and return correct P_3 or P_4 based on the row's
    categorical columns' values.
    '''