In [1]:
# import dependencies
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Import our input dataset
hospitals_df = pd.read_csv('Resources/us_hospital_locations.csv')
hospitals_df.head()

Unnamed: 0,X,Y,FID,ID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,...,VAL_DATE,WEBSITE,STATE_ID,ALT_NAME,ST_FIPS,OWNER,TTL_STAFF,BEDS,TRAUMA,HELIPAD
0,-13318890.0,4346975.0,1,5793230,CENTRAL VALLEY GENERAL HOSPITAL,1025 NORTH DOUTY STREET,HANFORD,CA,93230,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.hanfordhealth.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,49,NOT AVAILABLE,N
1,-13226510.0,4049626.0,2,53391362,LOS ROBLES HOSPITAL & MEDICAL CENTER - EAST CA...,150 VIA MERIDA,WESTLAKE VILAGE,CA,91362,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.losrobleshospital.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,62,NOT AVAILABLE,N
2,-13156200.0,4031978.0,3,11190023,EAST LOS ANGELES DOCTORS HOSPITAL,4060 WHITTIER BOULEVARD,LOS ANGELES,CA,90023,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.elalax.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,127,NOT AVAILABLE,N
3,-13171900.0,4041752.0,4,17090028,SOUTHERN CALIFORNIA HOSPITAL AT HOLLYWOOD,6245 DE LONGPRE AVENUE,HOLLYWOOD,CA,90028,NOT AVAILABLE,...,2014/02/10 00:00:00,http://sch-hollywood.com/,NOT AVAILABLE,HOLLYWOOD COMMUNITY HOSPITAL OF HOLLYWOOD,6,PROPRIETARY,-999,100,NOT AVAILABLE,N
4,-13132080.0,4037270.0,5,23691706,KINDRED HOSPITAL BALDWIN PARK,14148 FRANCISQUITO AVENUE,BALDWIN PARK,CA,91706,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.khbaldwinpark.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,95,NOT AVAILABLE,N


In [2]:
# Extract Acute Care Hospitals form DataFrame
acute_care_df = hospitals_df[hospitals_df['TYPE']=='GENERAL ACUTE CARE']
acute_care_df.head()

Unnamed: 0,X,Y,FID,ID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,...,VAL_DATE,WEBSITE,STATE_ID,ALT_NAME,ST_FIPS,OWNER,TTL_STAFF,BEDS,TRAUMA,HELIPAD
0,-13318890.0,4346975.0,1,5793230,CENTRAL VALLEY GENERAL HOSPITAL,1025 NORTH DOUTY STREET,HANFORD,CA,93230,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.hanfordhealth.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,49,NOT AVAILABLE,N
1,-13226510.0,4049626.0,2,53391362,LOS ROBLES HOSPITAL & MEDICAL CENTER - EAST CA...,150 VIA MERIDA,WESTLAKE VILAGE,CA,91362,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.losrobleshospital.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,62,NOT AVAILABLE,N
2,-13156200.0,4031978.0,3,11190023,EAST LOS ANGELES DOCTORS HOSPITAL,4060 WHITTIER BOULEVARD,LOS ANGELES,CA,90023,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.elalax.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,127,NOT AVAILABLE,N
3,-13171900.0,4041752.0,4,17090028,SOUTHERN CALIFORNIA HOSPITAL AT HOLLYWOOD,6245 DE LONGPRE AVENUE,HOLLYWOOD,CA,90028,NOT AVAILABLE,...,2014/02/10 00:00:00,http://sch-hollywood.com/,NOT AVAILABLE,HOLLYWOOD COMMUNITY HOSPITAL OF HOLLYWOOD,6,PROPRIETARY,-999,100,NOT AVAILABLE,N
4,-13132080.0,4037270.0,5,23691706,KINDRED HOSPITAL BALDWIN PARK,14148 FRANCISQUITO AVENUE,BALDWIN PARK,CA,91706,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.khbaldwinpark.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,95,NOT AVAILABLE,N


In [3]:
# Find how many acute care locations
len(acute_care_df.index)

4481

In [4]:
# Keep only OPEN acute care hospitals
acute_care_df = acute_care_df[acute_care_df['STATUS']=='OPEN']
len(acute_care_df.index)

4268

In [5]:
 # Keep only relevant columns
cond_acute_care_df = acute_care_df[['NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'LATITUDE', 'LONGITUDE']]
cond_acute_care_df.head()

Unnamed: 0,NAME,ADDRESS,CITY,STATE,ZIP,LATITUDE,LONGITUDE
1,LOS ROBLES HOSPITAL & MEDICAL CENTER - EAST CA...,150 VIA MERIDA,WESTLAKE VILAGE,CA,91362,34.154939,-118.815736
2,EAST LOS ANGELES DOCTORS HOSPITAL,4060 WHITTIER BOULEVARD,LOS ANGELES,CA,90023,34.023647,-118.184165
3,SOUTHERN CALIFORNIA HOSPITAL AT HOLLYWOOD,6245 DE LONGPRE AVENUE,HOLLYWOOD,CA,90028,34.096391,-118.325235
4,KINDRED HOSPITAL BALDWIN PARK,14148 FRANCISQUITO AVENUE,BALDWIN PARK,CA,91706,34.063039,-117.967438
5,LAKEWOOD REGIONAL MEDICAL CENTER,3700 EAST SOUTH STREET,LAKEWOOD,CA,90712,33.859707,-118.148403


In [6]:
# make new column from turple of 'LATITUDE' and 'LONGITUDE'
cond_acute_care_df['GEOLOCATION'] = list(zip(cond_acute_care_df['LATITUDE'], cond_acute_care_df['LONGITUDE']))
cond_acute_care_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cond_acute_care_df['GEOLOCATION'] = list(zip(cond_acute_care_df['LATITUDE'], cond_acute_care_df['LONGITUDE']))


Unnamed: 0,NAME,ADDRESS,CITY,STATE,ZIP,LATITUDE,LONGITUDE,GEOLOCATION
1,LOS ROBLES HOSPITAL & MEDICAL CENTER - EAST CA...,150 VIA MERIDA,WESTLAKE VILAGE,CA,91362,34.154939,-118.815736,"(34.1549388720001, -118.815736391)"
2,EAST LOS ANGELES DOCTORS HOSPITAL,4060 WHITTIER BOULEVARD,LOS ANGELES,CA,90023,34.023647,-118.184165,"(34.023647302, -118.184164805)"
3,SOUTHERN CALIFORNIA HOSPITAL AT HOLLYWOOD,6245 DE LONGPRE AVENUE,HOLLYWOOD,CA,90028,34.096391,-118.325235,"(34.0963913570001, -118.325234871)"
4,KINDRED HOSPITAL BALDWIN PARK,14148 FRANCISQUITO AVENUE,BALDWIN PARK,CA,91706,34.063039,-117.967438,"(34.063038932, -117.967437788)"
5,LAKEWOOD REGIONAL MEDICAL CENTER,3700 EAST SOUTH STREET,LAKEWOOD,CA,90712,33.859707,-118.148403,"(33.8597066200001, -118.148402965)"


In [7]:
# generate random locations in USA
def random_lat_lon(n, lat_min, lat_max, lon_min, lon_max):
    """
    this code produces an array with pairs lat, lon
    """
    lat = np.random.uniform(lat_min, lat_max, n)
    lon = np.random.uniform(lon_min, lon_max, n)
    
    return list(zip(lat, lon))

#    return np.array(tuple(zip(lat, lon))).tolist()
rand_loc = random_lat_lon(1000, 26.5, 47.0, -70.0, -130.0)
print(rand_loc)

[(27.783614388618506, -81.24981944601676), (37.114955989647065, -125.66822873732477), (32.559801338883574, -83.48043864291694), (31.796704750477282, -124.0792911494604), (39.87327099773287, -83.3659601007121), (39.12467837793843, -93.32652380198472), (29.59790064435151, -79.56584735652802), (31.397788896408343, -83.51957737952358), (40.096748323429864, -103.38980483401629), (28.232831314964148, -110.09466890455386), (43.521834521336906, -98.37406674498162), (35.3313370191259, -82.6664478788968), (39.36821815396229, -84.85781304289095), (42.312379853414484, -112.41331716180026), (44.447441112513985, -85.94254653588511), (34.02610867663159, -125.52817401635565), (43.921953418714835, -119.72112611554064), (33.13363403255432, -74.20168437181484), (43.232417759224745, -108.22927809442507), (37.39168372861033, -85.9003235674934), (30.04744371216457, -72.08509326848322), (28.324606734989462, -73.71359376597222), (40.543379003200414, -96.20343057805941), (37.51744881741379, -89.48686592307854)

In [8]:
# Find closest acute care hospital from each random location
import geopy.distance
from vincenty import vincenty

min_dist = []
med_desert = []
for i in rand_loc:
    dist_list = []
    for row in cond_acute_care_df['GEOLOCATION']:
        dist = vincenty(i, row, miles=True)
        dist_list.append(dist)
    min_dist.append(min(dist_list))
    if min(dist_list)>60:
        med_desert.append('Yes')
    else:
        med_desert.append('No')
        

In [9]:
# make series into dataframe columns

medical_desert_df = pd.DataFrame()
medical_desert_df['Random_Location'] = rand_loc
medical_desert_df['Min_Distance_From_Acute_Hospital'] = min_dist
medical_desert_df['Is_Medical_Desert'] = med_desert
medical_desert_df[['latitude','longitude']] = pd.DataFrame(medical_desert_df['Random_Location'].tolist(),index=medical_desert_df.index)
medical_desert_df.head()

Unnamed: 0,Random_Location,Min_Distance_From_Acute_Hospital,Is_Medical_Desert,latitude,longitude
0,"(27.783614388618506, -81.24981944601676)",20.807054,No,27.783614,-81.249819
1,"(37.114955989647065, -125.66822873732477)",176.380978,Yes,37.114956,-125.668229
2,"(32.559801338883574, -83.48043864291694)",9.640488,No,32.559801,-83.480439
3,"(31.796704750477282, -124.0792911494604)",287.62579,Yes,31.796705,-124.079291
4,"(39.87327099773287, -83.3659601007121)",4.87799,No,39.873271,-83.36596


In [10]:
# Encode categorical data
medical_desert_encoded = pd.get_dummies(medical_desert_df, columns=['Is_Medical_Desert'])
medical_desert_encoded.head()

Unnamed: 0,Random_Location,Min_Distance_From_Acute_Hospital,latitude,longitude,Is_Medical_Desert_No,Is_Medical_Desert_Yes
0,"(27.783614388618506, -81.24981944601676)",20.807054,27.783614,-81.249819,1,0
1,"(37.114955989647065, -125.66822873732477)",176.380978,37.114956,-125.668229,0,1
2,"(32.559801338883574, -83.48043864291694)",9.640488,32.559801,-83.480439,1,0
3,"(31.796704750477282, -124.0792911494604)",287.62579,31.796705,-124.079291,0,1
4,"(39.87327099773287, -83.3659601007121)",4.87799,39.873271,-83.36596,1,0


In [11]:
# backup dataframe
medical_desert_encoded_bkup = medical_desert_encoded

In [12]:
# import dependencies for machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [13]:
# Split data into feature and target
y = medical_desert_encoded['Is_Medical_Desert_Yes']
X = medical_desert_encoded.drop(['Is_Medical_Desert_No', 'Is_Medical_Desert_Yes', 'Random_Location', 'Min_Distance_From_Acute_Hospital'], axis=1)

In [14]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [29]:
# Create a LogisticRegression model
classifier = LogisticRegression(solver='saga',
   max_iter=500,
   random_state=1)

In [30]:
# Train the model with the training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=500, random_state=1, solver='saga')

In [31]:
# test the model with the test data
y_pred = classifier.predict(X_test)

In [32]:
# Calculate accuracy score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.736


In [33]:
# Get confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[143  15]
 [ 51  41]]


In [34]:
# print classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.91      0.81       158
           1       0.73      0.45      0.55        92

    accuracy                           0.74       250
   macro avg       0.73      0.68      0.68       250
weighted avg       0.74      0.74      0.72       250

