In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the dataset.
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapper
# is needed around it.
data = pd.read_csv('EPA_SmartLocationDatabase_V3_Jan_2021_Final.csv')
print(data.shape)  # (rows, columns); the original author recorded (220740, 117)

# Drop columns that are not used as model inputs.
# errors='ignore' skips any listed column that is absent instead of raising.
columns_to_drop = ['OBJECTID', 'Shape_Length', 'Shape_Area', 'CBSA_Name', 'CSA_Name']
data = data.drop(columns=columns_to_drop, errors='ignore')

# Drop every row that has a missing value in any remaining column
data = data.dropna()

# Separate features and target: NatWalkInd is the value the model predicts,
# every other remaining column is a feature
X = data.drop(columns=['NatWalkInd'])
y = data['NatWalkInd']

# Identify categorical columns (object dtype) among the features
categorical_cols = X.select_dtypes(include=['object']).columns

# One-hot encode those columns so the feature matrix is fully numeric
X = pd.get_dummies(X, columns=categorical_cols)

# Train-test split: 20% held out, fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Seed TensorFlow's global random generator so weight initialisation and
# batch shuffling start from the same state on every run.
tf.random.set_seed(42)

# Build MLP model: two hidden ReLU layers and a single linear output unit.
# The input width is declared with an explicit keras.Input rather than the
# `input_shape=` argument on the first Dense layer, which current Keras
# releases discourage for Sequential models.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),  # Output layer for regression
])

# Mean squared error is the training loss; mean absolute error is reported
# alongside it as a metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model.
# Validation data is carved out of the training data (last 10% of the rows,
# taken before shuffling) instead of reusing the test set, so the test set
# stays unseen until the final evaluation below.
# np.asarray gives Keras a plain array for the targets regardless of whether
# y_train is a pandas Series or already an ndarray.
history = model.fit(
    X_train,
    np.asarray(y_train),
    epochs=50,
    batch_size=32,
    validation_split=0.1,
)

# Evaluate the model once on the held-out test set
loss, mae = model.evaluate(X_test, np.asarray(y_test))
print(f'Test MAE: {mae}')

# Display the first few rows to understand the structure.
# At this point `data` is the cleaned frame (listed columns dropped, rows with
# missing values removed), not the raw CSV. As a bare expression this only
# produces visible output when it is the last statement of a notebook cell.
data.head()
