In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

# Load the raw inspection records (each CSV row becomes one DataFrame row)
data = pd.read_csv('ARCHIVED__Restaurant_Inspection_Scores__2016-2019__20250107desk.csv')

# Aggregate violations by zip code.
# count() tallies only non-null violation_id values, so rows without a
# violation_id add nothing to their zip code's total.
violation_counts = data.groupby('business_postal_code')['violation_id'].count().reset_index()
violation_counts.columns = ['postal_code', 'violation_count']

# Distinct (zip code, score, risk category, inspection type) combinations.
# The explicit .copy() makes `features` an independent frame, so the column
# assignments below never write through to (or warn about) a slice of `data`.
features = (
    data[['business_postal_code', 'inspection_score', 'risk_category', 'inspection_type']]
    .drop_duplicates()
    .copy()
)

# Encode categorical variables with one LabelEncoder per column.
# Re-fitting a single shared encoder discards the classes_ learned for the
# first column, which makes its integer codes impossible to decode later.
encoders = {}
for column in ('inspection_type', 'risk_category'):
    encoders[column] = LabelEncoder()
    features[column] = encoders[column].fit_transform(features[column])

# Backward compatibility: `le` used to end up fitted on risk_category.
le = encoders['risk_category']

# Attach each zip code's violation total to every feature row for that zip code.
# validate='many_to_one' asserts that violation_counts has one row per postal code.
merged_data = pd.merge(
    features,
    violation_counts,
    how='inner',
    left_on='business_postal_code',
    right_on='postal_code',
    validate='many_to_one',
)

# Prepare data for model training:
# X holds the per-row features, y the zip-code-level violation count.
X = merged_data[['inspection_score', 'inspection_type', 'risk_category']]
y = merged_data['violation_count']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the random forest (100 trees, depth capped at 10) and fit it on the training rows
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Predict violation counts for the held-out rows
y_pred = rf.predict(X_test)

# Report the mean absolute error between the actual and predicted counts
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Score every row of the merged frame (the rows used for training are included)
predicted_violations = rf.predict(X)

# Store the predictions next to the features, then select the single row
# with the largest predicted violation count.
# NOTE(review): this is a per-row maximum, not a per-zip-code aggregate —
# confirm that is the intended definition of "most violations".
merged_data['predicted_violations'] = predicted_violations
top_row_label = merged_data['predicted_violations'].idxmax()
most_violations_zip_code = merged_data.loc[top_row_label]

print(f'Zip code with most violations: {most_violations_zip_code["business_postal_code"]}')

Mean Absolute Error: 831.4375970973234
Zip code with most violations: 94110
