# Feature Selection for Drug Overdose Data

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import patsy 
import math

# Show up to 100 columns when a DataFrame is displayed, so wide tables are not truncated.
pd.set_option('display.max_columns', 100)
%matplotlib inline

## Load CSV Files into Pandas DataFrames
The CSV files are named as follows for each state:
* `{state}0.csv` - Introduction
* `{state}1.csv` - Outcomes & Factors Rankings
* `{state}2.csv` - Outcomes & Factors SubRankings
* `{state}3.csv` - Ranked Measure Data
* `{state}4.csv` - Additional Measure Data
* `{state}5.csv` - Ranked Measure Sources & Years
* `{state}6.csv` - Addtl Measure Sources & Years

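The Additional Measure Data looks to contain the valuable information for this use case, so it is the only file loaded for each state below. (Note: the loading cell reads this file as `data/csv/{state}.csv.4`.)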

In [2]:
# State slugs used to build the per-state CSV paths (50 states plus the
# District of Columbia). The order is kept exactly as originally written,
# because it determines the row order of the concatenated frame.
states = [
    'alabama',
    'florida',
    'louisiana',
    'nebraska',
    'oklahoma',
    'vermont',
    'alaska',
    'georgia',
    'maine',
    'nevada',
    'oregon',
    'virginia',
    'arizona',
    'hawaii',
    'maryland',
    'new-hampshire',
    'pennsylvania',
    'washington',
    'arkansas',
    'idaho',
    'massachusetts',
    'new-jersey',
    'rhode-island',
    'west-virginia',
    'california',
    'illinois',
    'michigan',
    'new-mexico',
    'south-carolina',
    'wisconsin',
    'colorado',
    'indiana',
    'minnesota',
    'new-york',
    'south-dakota',
    'wyoming',
    'connecticut',
    'iowa',
    'mississippi',
    'north-carolina',
    'tennessee',
    'delaware',
    'kansas',
    'missouri',
    'north-dakota',
    'texas',
    'district-of-columbia',
    'kentucky',
    'montana',
    'ohio',
    'utah',
]

In [3]:
# Load each state's "Additional Measure Data" CSV and stack them into one frame.
# The per-state frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies every previously loaded row on each
# iteration and starts from an empty DataFrame, which recent pandas versions
# warn about.
# Each file keeps its own row index (no ignore_index), so index labels repeat
# across states, exactly as they did before.
state_frames = []
for state in states:
    data_path = 'data/csv/{}.csv.4'.format(state)
    state_frames.append(pd.read_csv(data_path))

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
measure_df = pd.concat(state_frames) if state_frames else pd.DataFrame()

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the count row shows how many rows have a value for each measure.
measure_df.describe()

Unnamed: 0,FIPS,# Deaths,Age-Adjusted Mortality,95% CI - Low,95% CI - High,Age-Adjusted Mortality (Black),Age-Adjusted Mortality (Hispanic),Age-Adjusted Mortality (White),# Deaths.1,Child Mortality Rate,95% CI - Low.1,95% CI - High.1,Child Mortality Rate (Black),Child Mortality Rate (Hispanic),Child Mortality Rate (White),# Deaths.2,Infant Mortality Rate,95% CI - Low.2,95% CI - High.2,Infant Mortality Rate (Black),Infant Mortality Rate (Hispanic),Infant Mortality Rate (White),% Frequent Physical Distress,95% CI - Low.3,95% CI - High.3,% Frequent Mental Distress,95% CI - Low.4,95% CI - High.4,% Diabetic,95% CI - Low.5,95% CI - High.5,# HIV Cases,HIV Prevalence Rate,# Food Insecure,% Food Insecure,# Limited Access,% Limited Access,# Drug Overdose Deaths,Drug Overdose Mortality Rate,Range Drug Overdose Mortality Rate,# Motor Vehicle Deaths,MV Mortality Rate,95% CI - Low.6,95% CI - High.6,% Insufficient Sleep,95% CI - Low.7,95% CI - High.7,# Uninsured,% Uninsured,95% CI - Low.8,95% CI - High.8,# Uninsured.1,% Uninsured.1,95% CI - Low.9,95% CI - High.9,Costs,Other PCP Rate,% Disconnected Youth,Household Income,95% CI - Low.10,95% CI - High.10,Household income (Black),Household income (Hispanic),Household income (White),% Free or Reduced Lunch,Segregation index,Segregation Index,Homicide Rate,95% CI - Low.11,95% CI - High.11,# Firearm Fatalities,Firearm Fatalities Rate,95% CI - Low.12,95% CI - High.12,Population,% < 18,% 65 and over,# African American,% African American,# American Indian/Alaskan Native,% American Indian/Alaskan Native,# Asian,% Asian,# Native Hawaiian/Other Pacific Islander,% Native Hawaiian/Other Pacific Islander,# Hispanic,% Hispanic,# Non-Hispanic White,% Non-Hispanic White,# Not Proficient in English,% Not Proficient in English,95% CI - Low.13,95% CI - High.13,% Female,# Rural,% Rural
count,3193.0,3128.0,3128.0,3128.0,3128.0,1335.0,798.0,1557.0,1982.0,1982.0,1982.0,1982.0,499.0,360.0,605.0,1312.0,1312.0,1312.0,1312.0,350.0,229.0,405.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3143.0,3143.0,2475.0,2475.0,3193.0,3193.0,3174.0,3174.0,1706.0,1706.0,3185.0,2742.0,2742.0,2742.0,2742.0,3193.0,3193.0,3193.0,3192.0,3192.0,3192.0,3192.0,3192.0,3192.0,3192.0,3192.0,3186.0,3158.0,2119.0,3192.0,3192.0,3192.0,1939.0,2458.0,2759.0,3173.0,2102.0,2837.0,1296.0,1296.0,1296.0,2236.0,2236.0,2236.0,2236.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3193.0,3017.0,3017.0,3193.0,3186.0,3186.0
mean,30360.922643,2329.359974,400.472666,353.251982,451.146228,524.963446,254.895113,408.987861,147.260848,61.589908,41.042735,90.242028,96.495936,47.92456,49.389702,241.349848,6.874979,5.136738,9.071646,12.118114,5.168202,5.078608,11.983965,11.604198,12.373695,12.208911,11.867371,12.558964,11.350767,8.885746,14.254757,767.685657,190.820727,27373.86,14.131475,11480.44,8.615288,188.213951,19.835819,8.050557,177.711889,18.655892,13.04132,26.159117,33.057112,32.046344,34.082142,16103.35,14.25574,12.406953,16.104527,2410.670426,6.534262,4.747028,8.321496,9624.307241,71.718584,18.526083,49660.226504,45138.036507,54182.4165,35625.556988,42240.774614,52496.852483,54.440246,45.782296,31.218844,6.078997,4.195833,8.694444,155.190966,14.645196,9.886047,21.217844,202397.4,22.347379,18.342257,25198.39,8.97725,2539.711243,2.303904,11474.18,1.531183,483.228938,0.139056,35997.67,9.328109,124002.3,76.457899,8393.362,1.773349,1.116786,2.682814,49.909942,37335.6,58.058842
std,15171.912105,13026.009115,109.295143,101.275065,126.681152,140.099899,99.290883,102.295564,684.378347,24.168537,15.675854,41.905883,29.528216,15.416086,19.44931,933.345714,2.274151,1.651324,3.446827,3.481924,1.419646,1.453792,2.320661,2.266205,2.380465,1.893919,1.850709,1.945604,2.516553,2.072692,3.191413,5332.722589,212.212314,170113.6,4.213833,66921.65,8.311503,778.360931,10.385364,3.884775,954.405257,9.367326,6.241031,14.69382,4.149738,4.138487,4.167087,114178.6,6.229976,5.639342,6.840049,17980.739681,3.497928,2.643342,4.372485,1494.585527,53.270214,8.56917,12887.902013,12594.114135,13335.069196,17570.558191,15662.291153,13698.578286,17.849035,16.78712,13.298935,4.533063,3.584339,6.294501,741.605158,6.640522,4.489768,10.856385,1241020.0,3.459019,4.556529,162332.3,14.261164,18299.888099,7.668229,122754.9,2.940771,5120.505762,0.993618,380452.2,13.615769,656619.5,20.053413,89118.73,2.918071,2.357362,3.91186,2.259518,183940.9,31.552734
min,1000.0,20.0,133.0,94.9,155.8,173.0,90.6,125.1,10.0,17.095316,8.6,24.5,38.920602,19.941023,14.758727,20.0,2.727836,1.7,3.3,4.737415,2.534416,1.993701,6.936987,6.755524,7.129305,8.034557,7.227566,8.269715,3.0,2.3,4.0,5.0,10.4,20.0,3.4,0.0,0.0,10.0,2.925634,1.0,10.0,2.923155,1.9,3.2,23.028348,22.126452,23.94679,9.0,2.615943,2.258497,2.97339,6.0,0.826673,0.588375,1.064971,3895.85,0.0,0.0,22045.0,19704.765957,24282.617021,6667.0,6902.0,22159.0,0.0,0.258414,0.03662,0.689198,0.3,1.0,10.0,1.725875,1.4,2.0,88.0,0.0,4.632851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.501749,22.0,2.811823,0.0,0.0,0.0,0.232356,27.801194,0.0,0.0
25%,18173.0,185.0,322.275,282.25,361.375,426.95,185.425,338.2,16.0,45.660701,30.025,62.3,76.654452,38.027623,37.279816,29.0,5.296077,3.9,6.7,9.624406,4.142012,4.078559,10.204906,9.849098,10.551742,10.793477,10.493412,11.118567,9.6,7.4,12.1,15.0,75.35,1570.0,11.4,614.5207,3.64579,17.0,12.720624,5.0,23.0,11.976904,8.6,15.5,30.039239,29.059901,31.04919,811.5,9.09409,7.783175,10.463854,157.0,4.060709,2.914115,5.207719,8623.29,40.782652,12.725,41128.0,36661.462765,45314.819149,24948.0,32679.0,43417.0,42.362795,34.257863,22.378189,3.095247,2.0,4.4,17.0,10.17335,6.8,13.8,11191.0,20.274134,15.442751,113.0,0.671955,69.0,0.362445,53.0,0.439625,5.0,0.028748,353.0,2.216515,8172.0,64.724099,42.0,0.290574,0.0,0.88767,49.470662,7144.0,32.560275
50%,29175.0,414.5,387.3,342.55,437.85,529.1,232.1,405.2,27.0,56.982689,38.7,81.75,94.997693,45.658312,46.272926,50.0,6.487905,4.9,8.4,11.893473,5.030378,4.961794,11.714028,11.341893,12.0885,12.090073,11.756154,12.426118,11.2,8.7,14.2,39.0,123.7,3860.0,13.6,1681.089,6.507295,34.0,17.613771,7.0,44.0,16.97184,11.9,23.4,32.987269,31.964297,34.058084,2038.5,13.544914,11.730658,15.377571,344.0,5.586927,3.965344,7.192374,9599.61,62.375695,17.19,47731.5,43118.457446,52335.925532,30952.0,40141.0,50079.0,53.035994,46.514512,30.732886,4.924057,3.2,7.0,30.0,13.852293,9.4,19.4,26407.0,22.330821,18.008987,834.0,2.224463,196.0,0.613861,173.0,0.705495,15.0,0.057886,1094.0,4.150043,20819.0,83.753385,197.0,0.756731,0.19633,1.494815,50.336546,14707.5,58.84108
75%,45075.0,1033.25,469.375,419.35,528.95,612.4,301.6,477.7,67.0,71.536074,48.6,106.375,112.092347,54.712524,56.638505,123.5,8.008715,6.025,10.7,14.258515,5.964821,5.944559,13.526917,13.113286,13.928644,13.463466,13.082846,13.857865,13.0,10.2,16.5,144.0,228.45,10140.0,16.3,5213.224,10.611164,91.75,24.237568,10.0,88.0,23.408087,16.2,33.4,36.132389,35.062205,37.114323,5519.5,18.21769,15.98612,20.533737,905.25,8.185353,5.921402,10.432079,10500.1925,89.764035,22.61,55447.5,50563.627659,60391.090426,41725.5,48999.5,57989.0,64.73191,57.898326,39.654278,7.674497,5.3,11.1,69.0,18.122711,12.225,26.4,72351.0,24.05979,20.788675,5930.0,10.249284,697.0,1.309717,913.0,1.403066,71.0,0.11326,5376.0,9.534225,57233.0,92.624597,975.0,1.935788,1.167465,2.848546,51.024233,26233.75,86.187079
max,56045.0,334851.0,1142.6,933.7,1487.8,1691.0,986.5,943.5,14769.0,248.812486,147.2,445.2,251.635632,118.595825,196.439533,15936.0,29.562982,18.7,44.4,29.957204,11.477762,10.289634,24.618454,23.80099,25.45332,22.20633,21.499727,22.926385,21.1,17.2,25.9,128681.0,2590.2,4885220.0,37.9,2180417.0,71.844209,13834.0,87.202309,52.0211,24427.0,76.781327,59.4,141.2,46.707783,45.660209,47.827391,3862049.0,43.395037,41.488654,45.30142,747567.0,26.153846,19.824027,32.587889,19802.56,1335.65938,83.52,134609.0,129032.82979,140185.17021,243531.0,175893.0,144896.0,100.0,90.365566,90.73763,40.94782,38.5,43.8,15315.0,78.718021,44.5,132.1,39250020.0,40.849795,56.309881,3298870.0,85.151548,648055.0,93.067462,5817509.0,44.265777,196944.0,50.0,15280770.0,96.254016,14802980.0,97.977244,3718345.0,32.689951,24.808927,82.998352,56.546085,3847522.0,100.0


## Create Urban Feature


In [5]:
# Flag a row as urban when its '% Rural' value is at most 50.
# NOTE(review): rows where '% Rural' is missing compare as False and are
# therefore marked non-urban; the describe() output above reports fewer
# non-null '% Rural' values than rows, so confirm this handling is intended.
measure_df['URBAN'] = measure_df['% Rural'].le(50)