In [1]:
# import dependencies
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Import our input dataset
socio_health_df = pd.read_csv('Resources/us_county_sociohealth_data.csv')
socio_health_df.head()

Unnamed: 0,SN,fips,state,county,lat,lon,total_population,area_sqmi,population_density_per_sqmi,num_deaths,...,percentile_rank_minorities,percentile_rank_limited_english_abilities,percentile_rank_minority_status_and_language_theme,percentile_rank_multi_unit_housing,percentile_rank_mobile_homes,percentile_rank_overcrowding,percentile_rank_no_vehicle,percentile_rank_institutionalized_in_group_quarters,percentile_rank_housing_and_transportation,percentile_rank_social_vulnerability
0,1,1001,Alabama,Autauga,32.534928,-86.642748,55049,594.44612,92.605533,791.0,...,0.6339,0.5355,0.5976,0.6791,0.7268,0.2477,0.3298,0.1251,0.2881,0.3773
1,2,1003,Alabama,Baldwin,30.727489,-87.722575,199510,1589.807425,125.493187,2967.0,...,0.5253,0.5282,0.5294,0.9733,0.5387,0.2639,0.0872,0.3438,0.3324,0.2757
2,3,1005,Alabama,Barbour,31.869589,-85.393213,26614,884.875776,30.076538,472.0,...,0.9042,0.6979,0.8558,0.2814,0.937,0.4438,0.8816,0.9427,0.9312,0.9847
3,4,1007,Alabama,Bibb,32.998634,-87.12648,22572,622.582355,36.255444,471.0,...,0.645,0.3553,0.5018,0.4072,0.9249,0.0248,0.5645,0.9156,0.6663,0.5737
4,5,1009,Alabama,Blount,33.980878,-86.567383,57704,644.806508,89.490412,1085.0,...,0.4238,0.7482,0.5992,0.1344,0.8465,0.5056,0.1907,0.1515,0.1827,0.4986


In [2]:
# add a categorical column for socio-economic score
se_score = []
for row in socio_health_df['social_economic_score']:
    if row > 150:
        se_score.append('low')
    else:
        se_score.append('high')
socio_health_df['se_score'] = se_score

In [3]:
# keep only relevant columns
se_category_df = socio_health_df[['lat', 'lon', 'se_score']].dropna(axis=0)
se_category_df.head()

Unnamed: 0,lat,lon,se_score
0,32.534928,-86.642748,high
1,30.727489,-87.722575,high
2,31.869589,-85.393213,low
3,32.998634,-87.12648,low
4,33.980878,-86.567383,low


In [4]:
# encode categorical data

se_category_encoded = pd.get_dummies(se_category_df, columns=['se_score'])
se_category_encoded.head()

Unnamed: 0,lat,lon,se_score_high,se_score_low
0,32.534928,-86.642748,1,0
1,30.727489,-87.722575,1,0
2,31.869589,-85.393213,0,1
3,32.998634,-87.12648,0,1
4,33.980878,-86.567383,0,1


In [5]:
# import dependencies for machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [6]:
# Split data into feature and target
y = se_category_encoded['se_score_high']
X = se_category_encoded.drop('se_score_low', axis=1)

In [7]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [10]:
# Create a LogisticRegression model
classifier = LogisticRegression(solver='saga',
   max_iter=500,
   random_state=1)

In [11]:
# Train the model with the training data
classifier.fit(X_train, y_train)



LogisticRegression(max_iter=500, random_state=1, solver='saga')

In [12]:
# test the model with the test data
y_pred = classifier.predict(X_test)

In [13]:
# Calculate accuracy score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

1.0


In [14]:
# Get confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[376   0]
 [  0 410]]
