# Logistic Regression

### Predicting the status of a food desert based on distance from grocery store to bus stop, along with other demographic variables

#### Selected Variables:


LILATracts_halfAnd10: Flag for food desert when considering low accessibilty at 1/2 and 10 miles


distance: avg location from a SNAP store to a bus stop

POP2010: Population, tract total

OHU2010: Housing units, total

PovertyRate: Tract Poverty Rate

MedianFamilyIncome: Tract Median Family Income*

TractLOWI: Tract low-income population, number

TractKids: Tract children age 0-17, number

TractSeniors: Tract seniors age 65+, number

TractWhite: Tract White population, number

TractBlack: Track Black or African American population, number

TractAsian: Tract Asian population, number

TractNHOPI: Tract Native Hawaiian and Other Pacific Islander population, number

TractAIAN: Tract American Indian and Alaska Native population, number

TractOMultir: Tract Other/Multiple race population, number

TractHispanic: Tract Hispanic or Latino population, number

TractHUNV: Tract housing unites without a vehicle, number

TractSNAP: Tract housing units reveiving SNAP benefits, number







# Code

In [395]:
#dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sn
import matplotlib.pyplot as plt
import pickle

In [285]:
data = pd.read_excel(open('DataDownload2015.xlsx', 'rb'), sheet_name = 'Food Access Research Atlas', converters={'CensusTract':str})


In [286]:
data.head()

Unnamed: 0,CensusTract,State,County,Urban,POP2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
0,1001020100,Alabama,Autauga,1,1912,693,0,0,0.0,0,...,221,1622,217,14,0,14,45,44,26,112
1,1001020200,Alabama,Autauga,1,2170,743,0,181,0.08341,0,...,214,888,1217,5,0,5,55,75,87,202
2,1001020300,Alabama,Autauga,1,3373,1256,0,0,0.0,0,...,439,2576,647,17,5,11,117,87,108,120
3,1001020400,Alabama,Autauga,1,4386,1722,0,0,0.0,0,...,904,4086,193,18,4,11,74,85,19,82
4,1001020500,Alabama,Autauga,1,10766,4082,0,181,0.016812,0,...,1126,8666,1437,296,9,48,310,355,198,488


In [104]:
filtered_data = data[data['County'] == 'Maricopa']

In [105]:
filtered_data.head()

Unnamed: 0,CensusTract,State,County,Urban,POP2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
1459,4013010101,Arizona,Maricopa,0,5073,2246,0,0,0.0,0,...,1645,4869,44,61,7,12,80,161,42,0
1460,4013010102,Arizona,Maricopa,1,4640,2041,0,0,0.0,0,...,1240,4367,21,108,5,6,133,169,73,14
1461,4013030401,Arizona,Maricopa,1,4283,2008,0,47,0.010974,0,...,1431,4100,30,65,0,11,77,111,58,21
1462,4013030402,Arizona,Maricopa,1,3895,1706,0,0,0.0,0,...,792,3627,27,27,3,18,193,320,0,19
1463,4013040502,Arizona,Maricopa,1,4861,2188,0,169,0.034766,1,...,1340,4343,12,24,1,85,396,759,195,258


In [106]:
for col in filtered_data.columns:
    print (col)

CensusTract
State
County
Urban
POP2010
OHU2010
GroupQuartersFlag
NUMGQTRS
PCTGQTRS
LILATracts_1And10
LILATracts_halfAnd10
LILATracts_1And20
LILATracts_Vehicle
HUNVFlag
LowIncomeTracts
PovertyRate
MedianFamilyIncome
LA1and10
LAhalfand10
LA1and20
LATracts_half
LATracts1
LATracts10
LATracts20
LATractsVehicle_20
LAPOP1_10
LAPOP05_10
LAPOP1_20
LALOWI1_10
LALOWI05_10
LALOWI1_20
lapophalf
lapophalfshare
lalowihalf
lalowihalfshare
lakidshalf
lakidshalfshare
laseniorshalf
laseniorshalfshare
lawhitehalf
lawhitehalfshare
lablackhalf
lablackhalfshare
laasianhalf
laasianhalfshare
lanhopihalf
lanhopihalfshare
laaianhalf
laaianhalfshare
laomultirhalf
laomultirhalfshare
lahisphalf
lahisphalfshare
lahunvhalf
lahunvhalfshare
lasnaphalf
lasnaphalfshare
lapop1
lapop1share
lalowi1
lalowi1share
lakids1
lakids1share
laseniors1
laseniors1share
lawhite1
lawhite1share
lablack1
lablack1share
laasian1
laasian1share
lanhopi1
lanhopi1share
laaian1
laaian1share
laomultir1
laomultir1share
lahisp1
lahisp1share
lahunv1

In [288]:
selected_columns = ['CensusTract', 'State', 'County', 'POP2010', 'OHU2010',
                   'NUMGQTRS', 'LILATracts_halfAnd10', 'PovertyRate', 
                   'MedianFamilyIncome', 'TractLOWI', 'TractKids',
                   'TractSeniors', 'TractWhite', 'TractBlack', 'TractAsian',
                   'TractNHOPI', 'TractAIAN', 'TractOMultir', 'TractHispanic',
                   'TractHUNV', 'TractSNAP']

In [289]:
filtered_data = filtered_data[selected_columns]

filtered_data.head()

Unnamed: 0,CensusTract,State,County,POP2010,OHU2010,NUMGQTRS,LILATracts_halfAnd10,PovertyRate,MedianFamilyIncome,TractLOWI,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
1459,4013010101,Arizona,Maricopa,5073,2246,0,0,4.5,116833,429,...,1645,4869,44,61,7,12,80,161,42,0
1460,4013010102,Arizona,Maricopa,4640,2041,0,0,7.9,143490,598,...,1240,4367,21,108,5,6,133,169,73,14
1461,4013030401,Arizona,Maricopa,4283,2008,47,0,6.2,107693,628,...,1431,4100,30,65,0,11,77,111,58,21
1462,4013030402,Arizona,Maricopa,3895,1706,0,0,7.6,108188,599,...,792,3627,27,27,3,18,193,320,0,19
1463,4013040502,Arizona,Maricopa,4861,2188,169,1,20.7,50377,2513,...,1340,4343,12,24,1,85,396,759,195,258


In [377]:
tract_stats = pd.read_csv('tract_stats.csv', dtype={'Census Tract':str})

In [378]:
tract_stats.head()

len(tract_stats)

916

In [379]:
#len(avg_distances[(avg_distances['distance'] == 99) | (avg_distances['distance'] == 88)])

In [380]:
#filtered_distances = avg_distances[(avg_distances['distance'] != 99) & (avg_distances['distance'] != 88)]

In [381]:
#combined_df = filtered_data.merge(filtered_distances, left_on='CensusTract', right_on='Census Tract')
combined_df = filtered_data.merge(tract_stats, left_on='CensusTract', right_on='Census Tract')

combined_df.head()

Unnamed: 0,CensusTract,State,County,POP2010,OHU2010,NUMGQTRS,LILATracts_halfAnd10,PovertyRate,MedianFamilyIncome,TractLOWI,...,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP,Census Tract,distance,bus_stops,store_count
0,4013010101,Arizona,Maricopa,5073,2246,0,0,4.5,116833,429,...,7,12,80,161,42,0,4013010101,3.989205,0.0,0.0
1,4013010102,Arizona,Maricopa,4640,2041,0,0,7.9,143490,598,...,5,6,133,169,73,14,4013010102,3.989205,0.0,0.0
2,4013030401,Arizona,Maricopa,4283,2008,47,0,6.2,107693,628,...,0,11,77,111,58,21,4013030401,3.989205,0.0,1.0
3,4013030402,Arizona,Maricopa,3895,1706,0,0,7.6,108188,599,...,3,18,193,320,0,19,4013030402,3.989205,0.0,0.0
4,4013040502,Arizona,Maricopa,4861,2188,169,1,20.7,50377,2513,...,1,85,396,759,195,258,4013040502,3.989205,0.0,3.0


In [382]:
combined_df.head()

len(combined_df[combined_df['bus_stops'] == 0])

211

In [383]:
combined_df.to_csv('combined_data.csv')

# Explore the Data

In [384]:
combined_df.groupby('LILATracts_halfAnd10').mean()

Unnamed: 0_level_0,POP2010,OHU2010,NUMGQTRS,PovertyRate,MedianFamilyIncome,TractLOWI,TractKids,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP,distance,bus_stops,store_count
LILATracts_halfAnd10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,4130.689655,1579.183908,27.834154,10.198194,79790.779967,1071.510673,1043.738916,534.676519,3289.325123,166.799672,169.045977,7.623974,65.98358,431.91133,783.676519,70.014778,115.824302,2.69802,6.696223,0.592775
1,4239.501629,1465.34202,118.0,31.991857,39129.599349,2517.990228,1212.456026,446.32899,2552.384365,289.700326,95.361564,10.250814,124.250814,1167.553746,2122.091205,177.322476,329.13355,2.366975,11.664495,0.677524


In [396]:
combined_df.mean()

CensusTract                      inf
POP2010                 4.167158e+03
OHU2010                 1.541029e+03
NUMGQTRS                5.805349e+01
LILATracts_halfAnd10    3.351528e-01
PovertyRate             1.750240e+01
MedianFamilyIncome      6.616307e+04
TractLOWI               1.556302e+03
TractKids               1.100285e+03
TractSeniors            5.050666e+02
TractWhite              3.042337e+03
TractBlack              2.079902e+02
TractAsian              1.443504e+02
TractNHOPI              8.504367e+00
TractAIAN               8.551201e+01
TractOMultir            6.784640e+02
TractHispanic           1.232250e+03
TractHUNV               1.059793e+02
TractSNAP               1.873155e+02
Census Tract                     inf
distance                2.587070e+00
bus_stops               8.361354e+00
store_count             6.211790e-01
dtype: float64

## Model

In [385]:
predictors1 = ['OHU2010','NUMGQTRS', 'PovertyRate',
              'MedianFamilyIncome', 'TractLOWI', 'TractKids',
              'TractSeniors', 'TractWhite', 'TractBlack', 'TractAsian',
              'TractNHOPI', 'TractAIAN', 'TractOMultir', 'TractHispanic',
              'TractHUNV', 'TractSNAP']
predictors2 = ['OHU2010','NUMGQTRS',
              'MedianFamilyIncome', 'TractKids',
              'TractSeniors', 'TractWhite', 'TractBlack', 'TractAsian',
              'TractNHOPI', 'TractAIAN', 'TractOMultir', 'TractHispanic',
              'TractHUNV', 'TractSNAP', 'distance', 'bus_stops', 'store_count']

predictors3 = ['POP2010','OHU2010', 'PovertyRate',
              'MedianFamilyIncome']

predictors4 = ['POP2010','OHU2010', 'PovertyRate',
              'MedianFamilyIncome', 'distance']

predictors5 = ['distance', 'bus_stops', 'store_count']

predictors6 = ['PovertyRate', 'TractNHOPI']

predictors7 = ['PovertyRate', 'distance', 'TractHUNV']

predictors8 = ['PovertyRate']

#X = combined_df['distance'].values.reshape(-1,1)
X = combined_df[predictors2]
y = combined_df['LILATracts_halfAnd10']

In [386]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X)
X = X_scaler.transform(X)

In [387]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

In [388]:
logistic_regression= LogisticRegression(max_iter=10000)
logistic_regression.fit(X_train,y_train)
y_pred=logistic_regression.predict(X_test)

In [389]:
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
#sn.heatmap(confusion_matrix, annot=True)

from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,y_pred)


array([[140,   6],
       [ 23,  60]], dtype=int64)

In [390]:
y_pred

array([0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0], dtype=int64)

In [391]:
print('Accuracy: ',metrics.accuracy_score(y_test, y_pred))
plt.show()

Accuracy:  0.8733624454148472


In [392]:
logistic_regression.coef_

array([[-0.30700505,  0.90958638, -4.75073858, -1.11620411,  0.82274167,
        -1.34428485,  0.05480481, -1.2214366 ,  0.74045791, -0.07309539,
         2.14019402,  2.12893727,  1.14421447,  3.06824759,  0.47489015,
         1.17487838, -0.10538663]])

In [393]:
coef_dict = dict(zip(predictors2 , logistic_regression.coef_[0]))

In [394]:
coef_dict

{'OHU2010': -0.3070050471694519,
 'NUMGQTRS': 0.9095863780454777,
 'MedianFamilyIncome': -4.750738577049845,
 'TractKids': -1.116204105878588,
 'TractSeniors': 0.8227416746306037,
 'TractWhite': -1.344284849362822,
 'TractBlack': 0.054804805562803705,
 'TractAsian': -1.2214366008591375,
 'TractNHOPI': 0.7404579083086088,
 'TractAIAN': -0.07309539048826902,
 'TractOMultir': 2.14019401542369,
 'TractHispanic': 2.128937274725728,
 'TractHUNV': 1.144214469965708,
 'TractSNAP': 3.0682475892882843,
 'distance': 0.47489015322554595,
 'bus_stops': 1.1748783793322015,
 'store_count': -0.10538662709581492}

In [397]:
#save the model
filename = 'logistic_regression.sav'
scalename = 'x_scale.sav'

pickle.dump(logistic_regression, open(filename, 'wb'))
pickle.dump(X_scaler, open(scalename, 'wb'))