In [14]:
import lime
import lime.lime_tabular

import pandas as pd
import numpy as np

# Read the Excel file
df = pd.read_excel('FoodAccessResearchAtlasData2019.xlsx', sheet_name='Food Access Research Atlas')

# Convert to CSV and save
df.to_csv('Food_Access_Research_Atlas_Raw_Data.csv', index=False)

In [15]:
# Open the CSV file and read it into a DataFrame
df = pd.read_csv('Food_Access_Research_Atlas_Raw_Data.csv')


In [16]:
# Create a new DataFrame with only the columns relevant to research
new_df = df[['CensusTract', 'Urban', 'Pop2010','LILATracts_1And10', 'LILATracts_halfAnd10', 'LILATracts_1And20', 'LILATracts_Vehicle',
            'HUNVFlag', 'LowIncomeTracts', 'PovertyRate', 'MedianFamilyIncome', 'LA1and10', 'LAhalfand10', 'LA1and20', 'LATracts_half',
             'LATracts1', 'LATracts10', 'LATracts20', 'LATractsVehicle_20', 'LAPOP1_10', 'LAPOP05_10', 'LAPOP1_20', 'LALOWI1_10',
             'LALOWI05_10', 'LALOWI1_20', 'lapophalf', 'lapophalfshare', 'lalowihalf', 'lalowihalfshare', 'lalowi1', 'lalowi1share',
             'lapop10', 'lapop10share', 'lalowi10', 'lalowi10share', 'lahunv10', 'lahunv10share', 'lapop20', 'lapop20share', 'lalowi20', 
             'lalowi20share', 'lahunv20', 'lahunv20share', 'TractLOWI'
]]

# Save the new DataFrame to a CSV file
new_df.to_csv('Food_Access_Research_Atlas_Restricted_Data.csv', index=False)

In [17]:
#Check for missing values in each column
#print(new_df.isnull().sum())

In [18]:
#get rid of missing values
new_df = new_df.dropna()

In [19]:
#check for values again
#print(new_df.isnull().sum())

In [20]:

train_features = new_df[['CensusTract', 'Pop2010', 'PovertyRate', 'MedianFamilyIncome', 'LAPOP1_10', 'LAPOP05_10', 'LAPOP1_20', 'LALOWI1_10',
             'LALOWI05_10', 'LALOWI1_20', 'lapophalf', 'lapophalfshare', 'lalowihalf', 'lalowihalfshare', 'lalowi1', 'lalowi1share',
             'lapop10', 'lapop10share', 'lalowi10', 'lalowi10share', 'lahunv10', 'lahunv10share', 'lapop20', 'lapop20share', 'lalowi20', 
             'lalowi20share', 'lahunv20', 'lahunv20share']]

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Create a new outcome column based on the combination of features
new_df['NewOutcome'] = ((new_df['PovertyRate'] >= 20) & 
                        (new_df['LAPOP1_10'] >= 50) & 
                        (new_df['lapophalfshare'] >= 10)).astype(int)

# Separate the data into features and target
X = train_features
Y = new_df['NewOutcome']

# Split data into training/testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Evaluate
accuracy = model.score(X_test, Y_test)
print(accuracy)
print(classification_report(Y_test, Y_pred))


0.8047138047138047
              precision    recall  f1-score   support

           0       0.80      1.00      0.89       239
           1       0.00      0.00      0.00        58

    accuracy                           0.80       297
   macro avg       0.40      0.50      0.45       297
weighted avg       0.65      0.80      0.72       297



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
print(new_df['CensusTract'])
def predict_food_desert(census_tract):
    # Find the row corresponding to the given census tract
    row = train_features.loc[train_features['CensusTract'] == census_tract]
    # Check if the row was found
    if row.empty:
        return "Error: Census tract not found"
    # Extract the features from the row
    features = row.drop(['CensusTract'], axis=1)
    # Make a prediction using the trained model
    prediction = model.predict(train_features)
    # Return the prediction
    if prediction[0] == 0:
        return "Not a food desert"
    else:
        return "Food desert"

tract_code = 45015020401
if tract_code in new_df["CensusTract"].values:
    print("Census tract found in dataset")
else:
    print("Census tract not found in dataset")
predict_food_desert(45015020401)

251       1047957000
284       1053969800
633       1085781100
866       1099075600
998       1113031100
            ...     
72519    56039967600
72523    56041975200
72524    56041975300
72526    56043000200
72529    56045951100
Name: CensusTract, Length: 1483, dtype: int64
Census tract found in dataset


'Not a food desert'