In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Read the CSV and Perform Basic Data Cleaning

In [8]:
# Read the food-access dataset from a CSV file into a DataFrame.
file = "FoodAccess.csv"
df = pd.read_csv(filepath_or_buffer=file)

# Display the dtype of every column as the cell output.
df.dtypes

CensusTract             int64
Urban                   int64
Pop2010                 int64
OHU2010                 int64
PovertyRate           float64
MedianFamilyIncome    float64
LIPct                 float64
LA1and10                int64
dtype: object

In [9]:
# Count missing values per column once, then report each column's total.
null_counts = df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column CensusTract has 0 null values
Column Urban has 0 null values
Column Pop2010 has 0 null values
Column OHU2010 has 0 null values
Column PovertyRate has 3 null values
Column MedianFamilyIncome has 748 null values
Column LIPct has 4 null values
Column LA1and10 has 0 null values


In [6]:
# Drop every row that contains at least one missing value.
# MedianFamilyIncome is responsible for most of them (748 rows, roughly 1% of
# the data). That loss is small compared with the value of the feature for
# prediction, so we drop the affected rows rather than the whole column.
df = df.dropna(axis=0, how="any")

In [7]:
# Count rows that are exact repeats of an earlier row.
duplicate_count = df.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


# Define Model

In [6]:
# Features: every column except the tract identifier and the label column.
X = df.drop(columns=["CensusTract", "LA1and10"])

# Target: the LA1and10 column.
y = df["LA1and10"]

# Split the data into train and test sets. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Learn the scaling parameters from the training split only, so no
# information from the test split leaks into preprocessing.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaling to both splits.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



### Logistic Regression

In [7]:
# Baseline: logistic regression trained on the raw (unscaled) features.
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)

# Predict the held-out TEST split (not the training data) exactly once.
# Both names are bound because other cells read `predictions` and `y_pred`.
predictions = model.predict(X_test)
y_pred = predictions

# Evaluate the model on the test split.
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.639


In [44]:
# Confusion matrix for the unscaled logistic regression
# (scikit-learn convention: rows are actual classes, columns are predicted classes).
cm_logreg = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_logreg

array([[10389,   736],
       [ 5747,  1074]])

In [9]:
# Logistic regression on the standardized features.
# NOTE: this refits the existing `model` object in place, so the fit on the
# unscaled features is replaced from this cell onwards.
model.fit(X_train_scaled, y_train)

# Predict the held-out TEST split (not the training data) exactly once.
# Both names are bound because other cells read `predictions_scaled` and
# `y_pred_scaled`.
predictions_scaled = model.predict(X_test_scaled)
y_pred_scaled = predictions_scaled

# Evaluate the model on the test split.
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred_scaled):.3f}")

 Logistic regression model accuracy: 0.688


In [10]:
# Confusion matrix for the logistic regression trained on scaled features.
confusion_matrix(y_true=y_test, y_pred=y_pred_scaled)

array([[9562, 1563],
       [4045, 2776]])

In [11]:
# Side-by-side view of the unscaled and scaled predictions against the truth.
# The frame gets its own name: binding it to `pd` would shadow the pandas
# module, so any later `pd.*` call (and re-running this very cell) would fail.
predictions_df = pd.DataFrame(
    {
        "Prediction": predictions,
        "Scaled Prediction": predictions_scaled,
        "Actual": y_test,
    }
)
predictions_df.head(15)

Unnamed: 0,Prediction,Scaled Prediction,Actual
36858,0,0,1
39902,0,0,0
45495,0,1,0
8231,0,1,0
64321,0,0,0
39259,0,1,1
51030,0,0,0
29719,0,0,1
24646,1,1,1
36362,0,0,1


### Undersampling

In [12]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [13]:
# Tally how many rows fall in each target class across the full (pre-split)
# target series, to see how imbalanced the classes are.
Counter(y)

Counter({1: 27282, 0: 44500})

In [14]:
# Randomly undersample the (unscaled) training split with a fixed seed.
# Only training data is resampled; the test split is left untouched.
rus = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
# Show the class counts after resampling.
Counter(y_resampled)

Counter({0: 20461, 1: 20461})

In [15]:
# Refit the logistic regression on the undersampled (unscaled) training data
# and score it against the original, untouched test split.
model.fit(X_resampled, y_resampled)
y_pred_resamp = model.predict(X_test)

resampled_accuracy = accuracy_score(y_test, y_pred_resamp)
print(f" Resampled logistic regression model accuracy: {resampled_accuracy:.3f}")
confusion_matrix(y_true=y_test, y_pred=y_pred_resamp)

 Resampled logistic regression model accuracy: 0.584


array([[6095, 5030],
       [2434, 4387]])

### SVM

In [16]:
# Linear-kernel support vector classifier, trained on the standardized
# features (fit() returns the estimator, so build and train in one step).
svm = SVC(kernel='linear').fit(X_train_scaled, y_train)

# Score on the held-out test split.
y_pred_svm = svm.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f" SVM model accuracy: {svm_accuracy:.3f}")

confusion_matrix(y_true=y_test, y_pred=y_pred_svm)

 SVM model accuracy: 0.639


array([[10292,   833],
       [ 4918,  1903]])

In [49]:
# Support vector classifier with SVC's default kernel, trained on the
# standardized features (fit() returns the estimator itself).
svm_rbf = SVC().fit(X_train_scaled, y_train)

# Score on the held-out test split.
y_pred_svmrbf = svm_rbf.predict(X_test_scaled)
svm_rbf_accuracy = accuracy_score(y_test, y_pred_svmrbf)
print(f" SVM model accuracy: {svm_rbf_accuracy:.3f}")

confusion_matrix(y_true=y_test, y_pred=y_pred_svmrbf)

 SVM model accuracy: 0.702


array([[9693, 1432],
       [3912, 2909]])

### Random Forest

In [22]:
# Create the Random Forest classifier: an ensemble of 100 trees with a fixed
# seed so that repeated runs build the same forest.
rf_model = RandomForestClassifier(n_estimators=100, random_state=78) 

In [23]:
# Train the forest on the standardized training features. fit() returns the
# estimator itself, so `rf_model` does not need to be rebound.
rf_model.fit(X_train_scaled, y_train)

# Predict the held-out test split and report accuracy.
rf_y_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f"Random Forest model accuracy: {rf_accuracy:.3f}")

Random Forest model accuracy: 0.690


In [19]:
# Confusion matrix for the Random Forest predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=rf_y_pred)

array([[9194, 1931],
       [3649, 3172]])

In [30]:
# Per-feature importance scores from the fitted forest. The model was trained
# on a plain scaled array, so the values carry no labels; they follow the
# column order of X (df minus 'CensusTract' and 'LA1and10').
rf_model.feature_importances_


array([0.06541107, 0.20399479, 0.19462424, 0.16802995, 0.18453502,
       0.18340494])

### Neural Network

In [26]:
import tensorflow as tf

In [27]:
# Define neural network model
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=15, activation="relu", input_dim=6))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
561/561 - 1s - loss: 0.5722 - accuracy: 0.7031
Loss: 0.5721873044967651, Accuracy: 0.7031093239784241


In [48]:
# Accuracy recap on the held-out test split, one line per model, in the order
# the models were evaluated above.
summary_predictions = {
    'Logistic Regression': y_pred,
    'Scaled Logistic Regression': y_pred_scaled,
    'Undersampled Logistic Regression': y_pred_resamp,
    'SVM Accuracy': y_pred_svm,
}
for label, model_predictions in summary_predictions.items():
    print(f'{label}: {accuracy_score(y_test, model_predictions):.3f}')

Logistic Regression: 0.639
Scaled Logistic Regression: 0.688
Undersampled Logistic Regression: 0.584
SVM Accuracy: 0.680
