In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the train and test splits; the paths are relative to the notebook's working directory.
train = pd.read_csv('hw1data/propublicaTrain.csv')
test = pd.read_csv('hw1data/propublicaTest.csv')

In [3]:
train.head()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
0,0,1,64,0,0,0,0,13,0,1
1,0,1,28,0,0,0,0,1,1,0
2,0,1,32,0,0,0,0,1,1,0
3,1,1,20,0,0,1,1,2,1,0
4,0,1,43,1,0,0,0,8,1,0


In [4]:
train.describe()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
count,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0
mean,0.455964,0.813775,34.409167,0.344612,0.055916,0.089513,0.110631,3.278138,0.645308,0.354692
std,0.498117,0.389335,11.570853,0.475299,0.379495,0.486893,0.464278,4.770756,0.478477,0.478477
min,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,31.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1.0,1.0,42.0,1.0,0.0,0.0,0.0,4.0,1.0,1.0
max,1.0,1.0,83.0,1.0,8.0,13.0,7.0,38.0,1.0,1.0


In [5]:
test.describe()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.4545,0.8015,34.7875,0.332,0.066,0.095,0.111,3.1835,0.6385,0.3615
std,0.49805,0.39897,12.042253,0.471049,0.60235,0.520683,0.484558,4.692423,0.480555,0.480555
min,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,31.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1.0,1.0,42.0,1.0,0.0,0.0,0.0,4.0,1.0,1.0
max,1.0,1.0,96.0,1.0,20.0,12.0,9.0,36.0,1.0,1.0


NOTE:
- two_year_recid (Y), sex, race, c_charge_degree_F are categorical
- c_charge_degree_M is dropped because it is the exact complement of c_charge_degree_F (together they one-hot encode a single binary variable), so it carries no additional information

# Training

### Multivariate Gaussian distribution fitting

In [6]:
# Preprocessing
def preprocess(df):
    '''
    Return a new frame without the redundant ``c_charge_degree_M`` column.

    The input frame is not modified. A KeyError is raised by pandas when the
    column is absent.
    '''
    redundant_columns = ['c_charge_degree_M']
    return df.drop(columns=redundant_columns)

# Apply the same preprocessing to both splits so they keep an identical set of columns.
train_df = preprocess(train)
test_df = preprocess(test)
train_df.head()

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F
0,0,1,64,0,0,0,0,13,0
1,0,1,28,0,0,0,0,1,1
2,0,1,32,0,0,0,0,1,1
3,1,1,20,0,0,1,1,2,1
4,0,1,43,1,0,0,0,8,1


In [7]:
# Split the data into categorical and quantitative
def split_categorical_and_quant(df):
    '''
    Split ``df`` into its categorical part and its quantitative part.

    ``df`` is indexed with a list of labels, so it may be a DataFrame
    (columns are selected) or a single row given as a Series (entries are
    selected). The target ``two_year_recid`` is kept as the first entry of
    BOTH parts so that each part can later be split by class on its own.

    Returns a tuple ``(categorical_part, quantitative_part)``.
    '''
    target = 'two_year_recid'
    categorical = [target, 'sex', 'race', 'c_charge_degree_F']
    quantitative = [target, 'age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count']
    categ_part = df[categorical]
    quant_part = df[quantitative]
    return categ_part, quant_part

# Both halves of each split still contain the 'two_year_recid' label column.
categ_train_df, quant_train_df = split_categorical_and_quant(train_df)
categ_test_df, quant_test_df = split_categorical_and_quant(test_df)

In [8]:
# Split the data into y = 0 and y = 1
def split_by_y(df):
    '''
    Partition the rows of ``df`` by the value of ``two_year_recid``.

    Returns ``(rows_with_label_0, rows_with_label_1)``. Rows whose label is
    neither 0 nor 1 end up in neither part.
    '''
    label = df['two_year_recid']
    negatives = df.loc[label == 0, :]
    positives = df.loc[label == 1, :]
    return negatives, positives

# Naming convention: the *_0_* frames hold rows with two_year_recid == 0, the *_1_* frames rows with two_year_recid == 1.
categ_0_train_df, categ_1_train_df = split_by_y(categ_train_df)
quant_0_train_df, quant_1_train_df = split_by_y(quant_train_df)
categ_0_test_df, categ_1_test_df = split_by_y(categ_test_df)
quant_0_test_df, quant_1_test_df = split_by_y(quant_test_df)

In [9]:
categ_1_train_df.head()

Unnamed: 0,two_year_recid,sex,race,c_charge_degree_F
3,1,1,0,1
5,1,1,0,1
9,1,1,0,1
10,1,1,0,1
18,1,1,0,1


In [10]:
# Split the label from features
def split_y(df):
    '''
    Separate the ``two_year_recid`` label from the remaining columns.

    Returns ``(X, y)`` where ``X`` holds every column except the label and
    ``y`` is the label column as a Series.
    '''
    target = 'two_year_recid'
    is_feature = df.columns != target
    return df.loc[:, is_feature], df[target]
# Per-class quantitative features (X) and labels (y) of the training split.
quant_0_train_X, quant_0_train_y = split_y(quant_0_train_df)
quant_1_train_X, quant_1_train_y = split_y(quant_1_train_df)

In [11]:
quant_1_train_y.head()

3     1
5     1
9     1
10    1
18    1
Name: two_year_recid, dtype: int64

In [12]:
# Find mean vector (mu) and covariance matrix (sigma)
def estimate_mu(quant_df):
    '''
    Estimate the mean vector (mu) of the sample in ``quant_df``.

    Returns a 1-D numpy array holding the mean of each column, in column
    order.
    '''
    column_means = []
    for name in quant_df.columns:
        column_means.append(quant_df[name].mean())
    return np.array(column_means)

# Per-class mean vectors of the quantitative training features.
mu0 = estimate_mu(quant_0_train_X)
mu1 = estimate_mu(quant_1_train_X)

In [13]:
mu0, mu1

(array([3.63498015e+01, 2.47022497e-02, 4.19056021e-02, 6.21967358e-02,
        2.01676224e+00]),
 array([32.09368421,  0.09315789,  0.14631579,  0.16842105,  4.78315789]))

In [14]:
def estimate_sigma(quant_X, mu):
    '''
    Estimate the covariance matrix (sigma) of a sample around a given mean.

    This is the biased (divide-by-n) estimate
    ``(X - mu).T @ (X - mu) / n_samples``.
    See https://github.com/scikit-learn/scikit-learn/blob/7389dba/sklearn/covariance/empirical_covariance_.py#L50

    Parameters
    ----------
    quant_X : array-like, expected shape (n_samples, n_features)
        One observation per row.
    mu : array-like, expected shape (n_features,)
        Mean vector subtracted from every row.

    Returns
    -------
    numpy.ndarray of shape (n_features, n_features)

    Raises
    ------
    ValueError
        If ``quant_X`` has no rows; a covariance cannot be estimated from
        zero samples.
    '''
    X = np.asarray(quant_X)
    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError('cannot estimate a covariance matrix from zero samples')
    # Broadcasting subtracts mu from every row at once, without a Python-level loop.
    centered = X - np.asarray(mu)
    return centered.T @ centered / n_samples

# Per-class covariance matrices of the quantitative training features.
sigma0 = estimate_sigma(quant_0_train_X, mu0)
sigma1 = estimate_sigma(quant_1_train_X, mu1)

In [15]:
# Fit a multivariate Gaussian Distribution to each of Y = 0 case and Y = 1 case
def fit_gaussian(quant_X):
    '''
    Fit a multivariate Gaussian to the sample ``quant_X``.

    Returns the tuple ``(mu, sigma)``: the estimated mean vector and the
    covariance matrix estimated around that mean.
    '''
    mean_vector = estimate_mu(quant_X)
    return mean_vector, estimate_sigma(quant_X, mean_vector)

# Fit one Gaussian per class. The covariance results must be bound to
# sigma0 / sigma1, which the cells that build the distributions read.
mu0, sigma0 = fit_gaussian(quant_0_train_X)
mu1, sigma1 = fit_gaussian(quant_1_train_X)

In [16]:
mu0.shape

(5,)

In [17]:
from scipy.stats import multivariate_normal
# Frozen per-class Gaussians over the quantitative features, built from the estimated mean and covariance.
dist_0 = multivariate_normal(mu0, sigma0)
dist_1 = multivariate_normal(mu1, sigma1)

In [18]:
dist_0

<scipy.stats._multivariate.multivariate_normal_frozen at 0x2b5d09b6fd0>

### Categorical features as joint conditional probabilities

In [19]:
categ_0_train_df.head()

Unnamed: 0,two_year_recid,sex,race,c_charge_degree_F
0,0,1,0,0
1,0,1,0,1
2,0,1,0,1
4,0,1,1,1
6,0,1,0,1


In [20]:
'''
Count all points with 0,0,0 or 0,0,1, etc. for categorical ones
and multiply with that particular point's P_0 or P_1
'''
def build_repr(row):
    '''
    Encode the categorical features of ``row`` as a 3-character key.

    The characters correspond, in order, to ``sex``, ``race`` and
    ``c_charge_degree_F``; each is '1' when the value is truthy and '0'
    otherwise (e.g. '101').
    '''
    fields = ('sex', 'race', 'c_charge_degree_F')
    return ''.join('1' if row[name] else '0' for name in fields)
def calculate_cat_pr(df):
    '''
    Estimate the joint probability of every categorical feature combination.

    Each row of ``df`` is encoded with ``build_repr`` into a key such as
    '101' (sex, race, c_charge_degree_F). The returned probability of a key
    is the fraction of rows of ``df`` that have that key.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single class; must contain the columns read by
        ``build_repr``. Other columns are ignored.

    Returns
    -------
    dict
        Maps each of the 8 possible keys ('000' ... '111') to its relative
        frequency. Combinations that never occur map to 0.0.

    Raises
    ------
    ValueError
        If ``df`` has no rows, since relative frequencies are undefined.
    '''
    n = df.shape[0]
    if n == 0:
        raise ValueError('cannot estimate categorical probabilities from an empty DataFrame')
    keys = ['000', '001', '010', '011', '100', '101', '110', '111']
    # Start every combination at zero so unseen combinations are still present.
    counts = dict.fromkeys(keys, 0)
    for _, row in df.iterrows():
        counts[build_repr(row)] += 1
    return {key: count / n for key, count in counts.items()}

# Per-class lookup tables: categorical key (e.g. '101') -> relative frequency within that class.
cat_0_pr_dict = calculate_cat_pr(categ_0_train_df)
cat_1_pr_dict = calculate_cat_pr(categ_1_train_df)

In [21]:
cat_0_pr_dict

{'000': 0.0564622849580944,
 '001': 0.06969563299514778,
 '010': 0.049404499338332596,
 '011': 0.04102337891486546,
 '100': 0.17644464049404499,
 '101': 0.3114247904719894,
 '110': 0.11689457432730481,
 '111': 0.17865019850022057}

In [22]:
def get_cat_pr(x):
    '''
    Look up the categorical probability of the row ``x`` within its own class.

    ``x`` is a row that contains the categorical columns and the
    ``two_year_recid`` label. The label selects the table (``cat_0_pr_dict``
    for label 0, ``cat_1_pr_dict`` otherwise) and the row's categorical key
    selects the entry.
    '''
    key = build_repr(x)
    table = cat_0_pr_dict if x['two_year_recid'] == 0 else cat_1_pr_dict
    return table[key]
get_cat_pr(categ_0_train_df.loc[1])

0.3114247904719894

In [23]:
# Class priors: the fraction of training rows with y = 0 and with y = 1
Pr_y_eq_0 = len(categ_0_train_df) / len(train_df)
Pr_y_eq_1 = len(categ_1_train_df) / len(train_df)

In [24]:
Pr_y_eq_0, Pr_y_eq_1

(0.5440364770818334, 0.45596352291816655)

In [25]:
def predict(row, dist_0, dist_1, cat_0_pr_dict, cat_1_pr_dict, Pr_y_eq_0, Pr_y_eq_1):
    '''
    Predict the class label (0 or 1) for a single row.

    For each class the score is
    ``log p(quantitative features | class) + log P(categorical key | class)
    + log P(class)``, and the class with the larger score is returned.
    Ties go to class 0.

    Scores are compared in log space because the product of raw density
    values underflows to 0.0 for rows far from both class means, which would
    make the two classes tie regardless of which one is actually closer.

    Parameters
    ----------
    row : pandas.Series
        One row holding every column selected by
        ``split_categorical_and_quant``, including ``two_year_recid``. The
        label is discarded before scoring.
    dist_0, dist_1 : frozen distributions exposing ``logpdf``
        Densities of the quantitative features for class 0 and class 1.
    cat_0_pr_dict, cat_1_pr_dict : dict
        Map a categorical key (see ``build_repr``) to its probability within
        class 0 and class 1.
    Pr_y_eq_0, Pr_y_eq_1 : float
        Prior probabilities of class 0 and class 1.

    Returns
    -------
    int
        0 or 1.
    '''
    label = 'two_year_recid'
    categ_row, quant_row = split_categorical_and_quant(row)
    # The split keeps the label in both parts; remove it by name rather than
    # relying on it being the first entry.
    quant_row = quant_row.drop(label)
    categ_row = categ_row.drop(label)
    key = build_repr(categ_row)

    # log(0) == -inf is the intended score for a combination never seen in a
    # class, so the divide-by-zero warning from np.log is silenced.
    with np.errstate(divide='ignore'):
        score_0 = dist_0.logpdf(quant_row) + np.log(cat_0_pr_dict[key]) + np.log(Pr_y_eq_0)
        score_1 = dist_1.logpdf(quant_row) + np.log(cat_1_pr_dict[key]) + np.log(Pr_y_eq_1)

    return 0 if score_0 >= score_1 else 1

In [26]:
# Smoke test: the loop breaks after its first iteration, so only the first test row is predicted and printed.
for index, row in test_df.iterrows():
    print(f'Prediction for {index} is ', predict(row, dist_0, dist_1, cat_0_pr_dict, cat_1_pr_dict, Pr_y_eq_0, Pr_y_eq_1))
    break

Prediction for 0 is  1


In [27]:
def evaluate(test_df, dist_0, dist_1, cat_0_pr_dict, cat_1_pr_dict, Pr_y_eq_0, Pr_y_eq_1):
    '''
    Compute the classification accuracy of ``predict`` on ``test_df``.

    Every row of ``test_df`` is classified and compared with its
    ``two_year_recid`` value. Returns the fraction of rows predicted
    correctly.
    '''
    labels = test_df['two_year_recid']
    hits = sum(
        1
        for idx, row in test_df.iterrows()
        if predict(row, dist_0, dist_1, cat_0_pr_dict, cat_1_pr_dict, Pr_y_eq_0, Pr_y_eq_1) == labels[idx]
    )
    return hits / test_df.shape[0]
evaluate(test_df, dist_0, dist_1, cat_0_pr_dict, cat_1_pr_dict, Pr_y_eq_0, Pr_y_eq_1)

0.633