# Exploratory Data Analysis and Feature Importance

In this notebook, we will:
- evaluate the datatset conditions
- look for relationships in within our data
- apply techniques to discover with features are the most important oens

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns

sns.set_style('whitegrid')

# base folders
CLEAN_DATA_DIR = Path('../clean_data/')

In [11]:
df = pd.read_csv(CLEAN_DATA_DIR / 'cleaned_data.csv', index_col=0)
df.sample(3)

Unnamed: 0_level_0,Housing Units,Median Gross Rent ($),Median Owner Cost ($),Median Home Value - Census ($),New Policies,Renewed Policies,Nonrenewed Policies (by Owner),Nonrenewed Policies (by Company),Nonrenewed Policies,Expiring Policies,...,All Disasters 5y,All Disasters 10y,Fire Disasters 1y,Fire Disasters 3y,Fire Disasters 5y,Fire Disasters 10y,Avg % White-only Pop,% Change White-only Pop,Avg Median Income,% Change Median Income
ZIP Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
95220,2938.0,1187.0,694.0,642500.0,882.0,6216.0,634.0,203.0,837.0,7053.0,...,3.0,4.0,0.0,2.0,2.0,3.0,49.414286,-18.8,91238.428571,0.771018
94123,14908.0,2868.0,1438.0,2000001.0,1211.0,11637.0,1017.0,223.0,1240.0,12877.0,...,2.0,3.0,0.0,2.0,2.0,3.0,44.428571,-6.7,179010.714286,0.651885
90065,17135.0,1588.0,642.0,833400.0,3799.0,34204.0,2813.0,780.0,3593.0,37797.0,...,2.0,3.0,0.0,2.0,2.0,3.0,45.857143,-16.4,75247.0,0.590669


## Pearson's Correlation

In [21]:
targets = ['% FAIR Plan Units', 'Total Exposure ($)']
features = df.columns.to_list() 

for target in targets:
    features.remove(target)

In [22]:
from scipy.stats import pearsonr

corrs = []

for col  in features:
    corr_coeff, p_value = pearsonr(df[targets[0]], df[col])
    corr_coeff2, p_value2 = pearsonr(df[targets[1]], df[col])    
    corr_coeff3, p_value3 = pearsonr(np.log(df[targets[1]]), df[col])
    corrs.append([col, corr_coeff, p_value, corr_coeff2, p_value2, corr_coeff3, p_value3])

In [23]:
results = pd.DataFrame(corrs, columns=['Feature', 'Corr FAIR Plan', 'p-value FAIR Plan',
                            'Corr Exposure', 'p-value Exposure',
                            'Corr Log-Exposure', 'p-value Log-Exposure']).sort_values(by='Corr FAIR Plan')

In [24]:
results

Unnamed: 0,Feature,Corr FAIR Plan,p-value FAIR Plan,Corr Exposure,p-value Exposure,Corr Log-Exposure,p-value Log-Exposure
28,% Market Units,-1.0,0.0,-0.620753,4.6682690000000005e-125,-0.530221,1.923907e-85
25,Voluntary Market Units,-0.329182,7.648882000000001e-31,-0.04587,0.1176355,0.053652,0.06715896
5,Renewed Policies,-0.303307,3.249402e-26,-0.003147,0.9145529,0.076909,0.008635794
9,Expiring Policies,-0.290855,3.794717e-24,0.013854,0.636646,0.08736,0.002842455
0,Housing Units,-0.285613,2.6251830000000003e-23,-0.017873,0.5422358,0.058498,0.04590791
14,% Change - Renewed Policies,-0.264831,3.7780619999999997e-20,-0.167165,9.467118e-09,-0.232943,8.055819e-16
27,Total Res Units,-0.259666,2.0909409999999998e-19,0.042727,0.1449852,0.111221,0.0001424041
6,Nonrenewed Policies (by Owner),-0.259152,2.4740759999999996e-19,0.028918,0.3240377,0.068227,0.01986239
1,Median Gross Rent ($),-0.20537,1.465006e-12,0.010792,0.7128908,0.039197,0.1812369
4,New Policies,-0.201175,4.20496e-12,0.104463,0.0003549676,0.130237,8.214907e-06


In [25]:
results.to_excel(CLEAN_DATA_DIR / 'correlations.xlsx', index=False)

## Feature Importance

In [26]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(
    df,
    test_size=.2,
    random_state=13348,
    shuffle=True
)

In [27]:
from sklearn.ensemble import RandomForestRegressor

# Train two random forests to predict the FAIR Plan Units and Total Exposure
forest1 = RandomForestRegressor(n_estimators=500, max_depth=5)
forest1.fit(df_train[features], df_train[targets[0]])

forest2 = RandomForestRegressor(n_estimators=500, max_depth=5)
forest2.fit(df_train[features], df_train[targets[0]])

In [30]:
importance = pd.DataFrame({
    'Feature': features,
    'Imp. (% FAIR)': forest1.feature_importances_,
    'Imp. (Exposure)': forest2.feature_importances_
}).sort_values(by='Imp. (% FAIR)', ascending=False)

importance

Unnamed: 0,Feature,Imp. (% FAIR),Imp. (Exposure)
28,% Market Units,0.99334,0.993873
16,% Change - Nonrenewed Policies (by Company),0.001436,0.001333
17,% Change - Nonrenewed Policies,0.00078,0.000696
18,% Change - Expiring Policies,0.000721,0.000451
15,% Change - Nonrenewed Policies (by Owner),0.000498,0.000401
45,Avg Median Income,0.000273,0.000135
23,% Change - Claims (Fire and Smoke),0.000201,0.000122
46,% Change Median Income,0.000188,0.000285
12,% Nonrenewed Policies (by Company),0.000143,0.000163
30,Zillow Home Value 2019 ($),0.000121,8.2e-05


In [31]:
importance.to_csv (CLEAN_DATA_DIR / 'feature_importance.csv', index=False)