# Presentation

#### Goal: Enhance emergency housing allocation.


#### Caracteristics of the model:
Accuracy:
- if used as a clearing tool removing overburden upfront, and thus only to get rid of obvious cases, the accuracy of such a tool could be its most important caracteristic.

Interpretability:
- could help families understand the decision (although not as important as in diseases predictions).
- can also highlight and thus control biases (racial, sex, age).
- since the tool would probably be used in combination with human selection, it could help save time by highlighting the main factors for each decision

#### Conclusion:
- a model easily interpretable could be prefered (tree).
- or a highly accurate model (less interpretable) could also be used upfront (NN).

# Classes, Functions & imports

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
from sklearn import tree

#os.chdir('/Users/Pro/Desktop/Git_Contests/Predictions/Emergency_housing/')
from cobratools import Analysis

In [2]:
# Define the test scorer
def competition_scorer(y_true, y_pred):
    return log_loss(y_true, y_pred, sample_weight=10**y_true)

In [3]:
# General constants
USE_PRE_PROCESSED_DATA = False
PRINT_ON = False
PLOT_ON = False

## Load data

In [4]:
if not USE_PRE_PROCESSED_DATA:

    requests_train = pd.read_csv(filepath_or_buffer='data/requests_train.csv',
                                sep=',',
                                low_memory=False,
                                error_bad_lines=False)

    requests_test = pd.read_csv(filepath_or_buffer='data/requests_test.csv',
                                sep=',',
                                low_memory=False,
                                error_bad_lines=False)

    individuals_train = pd.read_csv(filepath_or_buffer='data/individuals_train.csv',
                                    sep=',',
                                    low_memory=False,
                                    error_bad_lines=False)

    individuals_test = pd.read_csv(filepath_or_buffer='data/individuals_test.csv',
                                sep=',',
                                low_memory=False,
                                error_bad_lines=False)

else:
    df_train = pd.read_csv(filepath_or_buffer='data/data_train_preprocessed.csv',
                                sep=',',
                                low_memory=False)

## Join datasets

In [5]:
if not USE_PRE_PROCESSED_DATA:
    df_train = pd.merge(requests_train, individuals_train, on='request_id', how='outer')
    df_test = pd.merge(requests_test, individuals_test, on='request_id', how='outer')
    del requests_train, requests_test
    del individuals_train, individuals_test

    # Set index col as individual id
    df_train.set_index('individual_id', inplace=True)
    df_test.set_index('individual_id', inplace=True)

# Pre-process data
(while analyzing only train data)

In [6]:
# Instanciate analysis object
analyze_train = Analysis(df_train)
analyze_test = Analysis(df_test)
del df_train
del df_test

# Define properties
target = 'granted_number_of_nights' 
analyze_test.target = target
analyze_train.target = target
n_samples = analyze_train.df.shape[0]

## Transform categorical features
- booleans: replace by (1, 0)
- 2 < cats < 11: one-hot encoding
- No transform on features with more than 11 categories

In [7]:
if not USE_PRE_PROCESSED_DATA:
    # Pre-process columns:
    # booleans: 't', 't' => True, False
    # Categorical with few classes => one-hot encoding
    mapping_true_false_train, failed_train = analyze_train.transform_categories()
    mapping_true_false_test, failed_test = analyze_test.transform_categories()

    # Preprocess specific cat columns
    analyze_train.convert_to_bool(col='group_type',
                                true_val='group',
                                false_val='individual')

    # Export data
    #analyze_train.export_data('data/data_train_preprocessed.csv')
    #analyze_test.export_data('data/data_test_preprocessed.csv')

Transform Boolean at col animal_presence: True/False=['f' 't']
Transform Boolean at col child_to_come: True/False=['f' 't']

ERROR - Transform Boolean at col group_type not recognized: ['individual' 'group']

Transform Boolean at col long_term_housing_request: True/False=['t' 'f']
Transform Boolean at col victim_of_violence: True/False=['f' 't']
Transform Boolean at col childcare_center_supervision: True/False=['t' 'f']
Transform Boolean at col disabled_worker_certification: True/False=['f' 't']
Transform Boolean at col gender: True/False=['male' 'female']
Transform Boolean at col pregnancy: True/False=['f' 't']
Transform Boolean at col animal_presence: True/False=['f' 't']
Transform Boolean at col child_to_come: True/False=['f' 't']

ERROR - Transform Boolean at col group_type not recognized: ['group' 'individual']

Transform Boolean at col long_term_housing_request: True/False=['f' 't']
Transform Boolean at col victim_of_violence: True/False=['f' 't']
Transform Boolean at col childca

## Impute NaNs: for categorical variables

In [None]:
# Get Na counts: by feature, by sample
na_ft, na_sp = analyze_train.get_na_counts()
if PRINT_ON:
    print('NaNs count by feature\n\n', na_ft[na_ft!=0])

# Analyze Na distribution by sample
# ie. when a sample has NaNs, it has 5 NaNs most frequently
if PLOT_ON:
    na_sp.hist()
    _ = plt.plot()

### Gender NaNs

#### Why missing?
- hyp1: only 1 entry concerned, thus, it might have been on purpose
- hyp2: person not recognizing itself in one of these 2 genders

#### Then?
- Only 1 nan in the entire df_train, too few to spend time
thinking clever imputation

#### Conclusion
=> Replace by whichever gender

In [None]:
if not USE_PRE_PROCESSED_DATA:
    # Get samples with Gender Nan
    gender_na = analyze_train.df[analyze_train.df['gender'].isna()]

    # > Replace by whichever gender, say 0
    analyze_train.df.loc[gender_na.index[0], 'gender'] = 0

### Pregnancy NaNs

#### Why missing?
- hyp1: only 14 entries concerned, thus, it might have been mostly on purpose
- hyp2: woman has doubts but hasn't verified

#### Then?
- if hyp2 mostly true, the request could have been treated as if pregnant

#### Conclusion
=> if true, replace by pregnant: True

#### Explore impact on target

Results: there seems not to have a significative direct correlation with target

In [21]:
# Summary of target counts:
# ± half individuals granted >=1 night(s)
# which means < 50% of requests
# Rq: still nicely balanced dataset
target_counts = analyze_train.df[target].value_counts()

if PRINT_ON:
    print('Absolute\n', target_counts)
    print('\n\nPercentage\n', target_counts / n_samples * 100)

# Filter pregnants
mask_pregnant = analyze_train.df['pregnancy'] == True

# Count pregnants: 11k+ pregnant
n_pregnants = sum(mask_pregnant)

# Percentage ± 3%
pct_pregnants = n_pregnants/ n_samples * 100


# Correlation target & pregnancy: ± -.1% (significant? not)
analyze_train.df[[target, 'pregnancy']].corr()

# Summary
target_pregnancy_counts = analyze_train.df[target].groupby(analyze_train.df['pregnancy']).value_counts()

if PRINT_ON:
    # Pct pregnants by target
    print('\n\nN of nights granted for pregnants', target_pregnancy_counts[1] / n_pregnants)
    # Pct not pregnants by target
    print('\nN of nights granted for not pregnants', target_pregnancy_counts[0] / (n_samples - n_pregnants))

# => Repartitions are similar

#### Pattern for missing pregnancy?

In [26]:
analyze_train.df['gender'].groupby(analyze_train.df['pregnancy']).value_counts()

pregnancy  gender
0.0        male      233108
           female    139821
           0              1
1.0        female     11159
           male          30
Name: gender, dtype: int64

In [None]:
# Pattern with child_situation
analyze_train.df[target].groupby(analyze_train.df['pregnancy']).value_counts()

In [None]:
# Requests granted (nb_nights > 0)
granted = analyze_train.df[target] > 0


# Outliers

## Gender

Only females are possibly pregnant, thus 30 males have made a mistake

In [27]:
analyze_train.df['gender'].groupby(analyze_train.df['pregnancy']).value_counts()

pregnancy  gender
0.0        male      233108
           female    139821
           0              1
1.0        female     11159
           male          30
Name: gender, dtype: int64

In [None]:
# Retrieve the individual ids, and correct for male -> pregnancy = 0

# Analyze data

## Analysis (df_train)
### General:
- Number of requests: 238191
- Number of individuals: 384133
- Number of features: 39

Requests are made for 1.6 pers on average.


### Correlations with granted_number_of_nights
- housing_situation_id: -0.458581. Strong negative impact. A high value must represent the good quality of the current housing situation.
- housing_situation_2_id: 0.283840. Strong positive impact. A higher value must conversely represent a degraded quality.

### Principal components
- housing_situation_2_label: with value "emergency accomodation". High probability to get 1 or two nights.

In [None]:
Analyze_df_train = Analysis(df_train)
#Analyze_df_train.describe(investigation_level=3)
#Analyze.visualize()

In [None]:
target = 'granted_number_of_nights'
feature = 'animal_presence'
mask = df_train[feature] == 't'
df_train[mask][[feature, target]]

In [None]:
# Check for NA, inf
columns_selected = ['animal_presence']
df_train[columns_selected].isnull().sum()
df_train.isnull().sum()

## Analysis of features

feature: housing_situation_2_label
- A majority of requests with the label "emergency accomodation" obtains 1 or 2 nights.

In [None]:
# Impact of feature on target
join_key = 'request_id'
target = 'granted_number_of_nights'
feature = 'housing_situation_2_label'
mask = df_train[feature] == 'emergency accomodation'

# Hist: drop duplicate requests (due to indiv data merged)
df_train[mask][[join_key, target]].drop_duplicates().hist()
plt.show()

# Predict

## Build models

### Benchmarks

In [None]:
# Random uniform train/test
random_preds_train = np.random.uniform(size=(requests_train.shape[0], 4))
random_preds_test = np.random.uniform(size=(requests_test.shape[0], 4))

# Dumb (always pred 3)
dumb_preds_train = np.zeros((requests_train.shape[0], 4))
dumb_preds_test = np.zeros((requests_test.shape[0], 4))
# Set 10% pred everywhere (if not, log penalyzes hardly)
dumb_preds_train[:,:] = .01
dumb_preds_test[:,:] = .01
# Set 20% pred on class 3
dumb_preds_train[:,2] = .02
dumb_preds_test[:,2] = .02

### Univariate predictions

We observed a significant (negative) correlation of housing_situation_id with granted_number_of_nights, let's train a univariate model


In [None]:
# Set model' parameters
clf = tree.DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=2,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    max_leaf_nodes=None,
    class_weight=None)

# Build train/test datasets with housing_situation_id only
X_train = np.array(requests_train['housing_situation_id']).reshape(-1, 1)
X_test = np.array(requests_test['housing_situation_id'].values).reshape(-1, 1)
Y_train = requests_train.granted_number_of_nights.values
Y_test = requests_test.granted_number_of_nights.values

# Transform categorical target into a one-hot vector
Y_train_onehot = to_onehot(Y_train)
Y_test_onehot = to_onehot(Y_test)

# Train the model
clf = clf.fit(X_train, Y_train_onehot)

# Yield train/test predictions
preds_train_tree_univar = clf.predict(X_train)
preds_test_tree_univar = clf.predict(X_test)

# Fill predictions to .2 elsewhere
preds_train_tree_univar[preds_train_tree_univar == 0] = .2
preds_test_tree_univar[preds_test_tree_univar == 0] = .2

# Evaluate train/test
score_train = competition_scorer(Y_train, preds_train_tree_univar)
score_test = competition_scorer(Y_test, preds_test_tree_univar)

# Display results
print(f'train score: {score_train:.2f}')
print(f'test score: {score_test:.2f}')

In [None]:
probas = clf.predict_proba(X_train)
v0 = probas[0].max(1)
v1 = probas[1].max(1)
v2 = probas[2].max(1)
v3 = probas[3].max(1)

## Evaluate models

In [None]:
y_true_test = requests_test.granted_number_of_nights.values

# Evaluate benchmarks
random_score_test = competition_scorer(y_true_test, random_preds_test)
dumb_score_test = competition_scorer(y_true_test, dumb_preds_test)

# Display results
print(f'test score random: {random_score_test:.2f}')
print(f'test score dumb: {dumb_score_test:.2f}')

### Train set

In [None]:
y_true_train = requests_train.granted_number_of_nights.values

# Evaluate benchmarks
random_score_train = competition_scorer(y_true_train, random_preds_train)
dumb_score_train = competition_scorer(y_true_train, dumb_preds_train)

# Display results
print(f'train score random: {random_score_train:.2f}')
print(f'train score dumb: {dumb_score_train:.2f}')

### Test set

## Interpret models

In [None]:
# Tree
fn = ['housing_situation_id']
cn = ['0', '1', '2', '3']

tree.plot_tree(clf,
               feature_names = fn, 
               class_names=cn,
               filled = True)

requests_train['housing_situation_id'].hist()