# Exploratory Data Analysis

We do some exploratory data analysis of the cleaned rat sightings data and rat inspection data.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
import missingno as msno
import glob
import os


## EDA on Rat Sightings Data

We begin with some EDA on the rat sightings data.

### EDA: Bar Charts and Missingness

In [None]:
# Load the cleaned rat sightings table into `rs` (relative path, resolved
# against the current working directory).
rs = pd.read_csv('../scr/data/cleaned_rat_sightings_data/cleaned_rat_sightings.csv')

In [None]:
# Bar chart of sightings per borough, most frequent borough first,
# with the exact count printed on top of every bar.
plt.figure(figsize=(14, 5))
sns.set_style("whitegrid")

borough_order = rs["borough"].value_counts().index
ax = sns.countplot(x=rs["borough"], order=borough_order)

# write each bar's height just above its top edge, centred horizontally
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f"{int(bar_top)}",
        (bar_center, bar_top),
        ha="center",
        va="bottom",
        fontsize=11,
    )

ax.set_xlabel("Boroughs", fontsize=14)
ax.set_ylabel("Number of Rat Sightings", fontsize=14)
ax.set_title("Distribution Rat Sightings by Boroughs", fontsize=18)

plt.tight_layout()
plt.show()

In [None]:
# Stacked horizontal bars: one bar per ZIP code, one coloured segment per borough.
# Rows without a ZIP or borough are dropped; borough labels are trimmed and
# upper-cased so that they match the keys of the colour map below.
df_plot = rs.dropna(subset=['zip', 'borough']).assign(
    zip=lambda frame: frame['zip'].astype(str),
    borough=lambda frame: frame['borough'].str.strip().str.upper(),
)

# ZIP x borough table of sighting counts (0 where a pair never occurs)
pair_counts = df_plot.groupby(['zip', 'borough']).size()
agg = pair_counts.unstack(fill_value=0)

colors = {
    'MANHATTAN': '#1f77b4',
    'BROOKLYN': '#ff7f0e',
    'QUEENS': '#2ca02c',
    'BRONX': '#d62728',
    'STATEN ISLAND': '#9467bd',
    'UNSPECIFIED': "#000000",
}

agg.plot(
    kind='barh',
    stacked=True,
    figsize=(20,50),
    color=[colors[name] for name in agg.columns],
    width=0.9,
)

plt.title("Distribution of Rat Sightings by Zip Code and Borough", fontsize=30)
plt.ylabel("Zip Code", fontsize=18)
plt.xlabel("Frequency of Rat Sightings", fontsize=20)
plt.show()

In [None]:
# Same chart as the all-ZIP version, restricted to the n busiest ZIP codes.
# Clean first: drop rows without ZIP/borough and normalise both columns.
df_plot = rs.dropna(subset=['zip', 'borough']).assign(
    zip=lambda frame: frame['zip'].astype(str),
    borough=lambda frame: frame['borough'].str.strip().str.upper(),
)

# aggregate counts into a ZIP x borough table
agg = df_plot.groupby(['zip', 'borough']).size().unstack(fill_value=0)

# keep the n ZIP codes with the largest totals, ordered from smallest to
# largest so the biggest bar ends up at the top of the horizontal chart
n=20
zip_totals = agg.sum(axis=1)
agg_top5 = agg.loc[zip_totals.nlargest(n).sort_values().index]

# colors for bars, keyed by upper-cased borough name
colors = {
    'MANHATTAN': '#1f77b4',
    'BROOKLYN': '#ff7f0e',
    'QUEENS': '#2ca02c',
    'BRONX': '#d62728',
    'STATEN ISLAND': '#9467bd',
    'UNSPECIFIED': '#000000',
}

# plot
fig, ax = plt.subplots(figsize=(14, 6))
agg_top5.plot(
    kind='barh',
    stacked=True,
    ax=ax,
    color=[colors[name] for name in agg_top5.columns],
    width=0.9,
)

# print the overall total just to the right of each stacked bar
totals = agg_top5.sum(axis=1)
label_gap = totals.max() * 0.01  # small offset
for row_pos, zip_total in enumerate(totals):
    ax.text(
        zip_total + label_gap,
        row_pos,
        f"{int(zip_total)}",
        va="center",
        fontsize=11,
        fontweight="bold",
    )

# labels and axes
ax.set_xlabel("Frequency of Rat Sightings", fontsize=14)
ax.set_ylabel("ZIP Code", fontsize=14)
ax.set_title(f"Top {n} ZIP Codes with Highest Rat Sightings with Colors by Borough", fontsize=18)

plt.tight_layout()
plt.show()

In [None]:
# Missingness overview of the sightings table, drawn with missingno:
# first the matrix view, then the heatmap view.
msno.matrix(rs)
msno.heatmap(rs)

In [None]:
# Horizontal stacked histogram: number of rat sightings per location type,
# with every bar split (and coloured) by borough.
plt.figure(figsize=(10, 6))
sns.histplot(
    data=rs,
    y='location_type',
    hue='borough',
    multiple="stack",
)
plt.title('Distribution of Rat Sightings by Location Type')
plt.xlabel('Count of Rat Sightings')
plt.ylabel('Location Type')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Derive the month number (for ordering) and month name (for the bar labels)
rs['created_date'] = pd.to_datetime(rs['created_date'])
created = rs['created_date'].dt
rs['month'] = created.month
rs['month_name'] = created.month_name()

# count sightings per month across all years, kept in calendar order
monthly_counts = (
    rs.groupby(['month', 'month_name'])
    .size()
    .reset_index(name='count')
    .sort_values('month')
)

# plot the bar chart
plt.figure(figsize=(12,6))
plt.bar(monthly_counts['month_name'], monthly_counts['count'], color='teal', alpha=0.8)
plt.title("Rat Sightings in NYC by Month (All Years Combined)", fontsize=16)
plt.xlabel("Month", fontsize=14)
plt.ylabel("Number of Rat Sightings", fontsize=14)
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

### EDA: Time Series

We do some basic analysis of the time series given by the number of rat sightings per day.

In [None]:
# Parse the three timestamp columns of the sightings table
for date_col in ('created_date', 'closed_date', 'resolution_action_updated_date'):
    rs[date_col] = pd.to_datetime(rs[date_col])

# Citywide sightings per calendar day (one row per date that has at least one sighting)
created_day = rs['created_date'].dt.date
cdate_rat = rs.groupby(created_day).size().reset_index(name='count')

plt.figure(figsize=(35,20))
plt.plot(cdate_rat['created_date'], cdate_rat['count'], 'o', alpha=0.75)
plt.title("Rat Sightings in NYC Over Time", fontsize=24)
plt.xlabel("Date", fontsize=20)
plt.ylabel("Number of Rat Sightings", fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()

In [None]:
# Daily sighting counts per (date, borough) pair
cdate_borough = rs.groupby([rs['created_date'].dt.date, 'borough']).size().reset_index(name='count')

# Boroughs to draw: every label that is neither missing nor 'Unspecified'
boroughs = [b for b in rs['borough'].unique() if pd.notnull(b) and b != 'Unspecified']

# One panel per borough, stacked vertically in a single column. The grid never
# has fewer than five rows (so the usual five-borough layout is unchanged) but
# grows if the data contains extra borough labels.
fig = plt.figure(figsize=(50,80))
gs = gridspec.GridSpec(max(len(boroughs), 5), 1, figure=fig, wspace=0.3, hspace=0.3)

colors = ["r", "b", "g", "purple", "b"]

for i, borough in enumerate(boroughs):
    ax = fig.add_subplot(gs[i])
    borough_data = cdate_borough[cdate_borough['borough'] == borough]
    ax.plot(
        borough_data['created_date'],
        borough_data['count'],
        'o',
        color=colors[i % len(colors)],  # cycle instead of failing on a 6th label
        markersize=10,
    )
    ax.set_title(f"{borough}", fontsize=35)
    ax.set_xlabel("Date", fontsize=15)
    ax.set_ylabel("Number of Rat Sightings", fontsize=25)
    ax.grid(True)
    ax.set_ylim(0,70)  # shared y-range so panels are comparable
    ax.tick_params(axis='x', labelsize=24)
    ax.tick_params(axis='y', labelsize=24)

plt.suptitle("Rat Sightings in NYC Over Time by Borough", fontsize=36)
plt.show()

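The following graph needs a brief explanation. For each borough we compute a rolling average of the daily counts with a window of 30 and then shift it forward by 365, so each date is compared against the smoothed level from roughly one year earlier.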

In [None]:
# For every borough, overlay the observed daily counts with a rolling mean
# taken one season earlier ("seasonal naive" reference line).
rs_window = 30   # width of the rolling mean, in calendar days
rs_season = 365  # lag applied to the rolling mean, in calendar days

fig = plt.figure(figsize=(50,80))
# at least the original five rows; more only if extra borough labels exist
gs = gridspec.GridSpec(max(len(boroughs), 5), 1, figure=fig, wspace=0.3, hspace=0.3)

colors = ["r", "b", "g", "purple", "b"]

for i, borough in enumerate(boroughs):
    ax = fig.add_subplot(gs[i])

    borough_data = (
        cdate_borough[cdate_borough['borough'] == borough]
        .sort_values("created_date")
        .copy()
    )

    # cdate_borough only has rows for days with at least one sighting, so a
    # row-based window/shift would not correspond to 30 / 365 days. Put the
    # counts on a gap-free daily calendar first (missing days count as 0).
    observed_days = pd.DatetimeIndex(pd.to_datetime(borough_data["created_date"]))
    daily_counts = pd.Series(
        borough_data["count"].to_numpy(), index=observed_days
    ).asfreq("D", fill_value=0)

    # Seasonal naive: rolling mean of the previous year, read back on the observed days
    seasonal_avg = (
        daily_counts
        .rolling(window=rs_window, min_periods=0)
        .mean()
        .shift(rs_season)
    )
    borough_data["seasonal_avg_prev_year"] = seasonal_avg.reindex(observed_days).to_numpy()

    # Original data
    ax.plot(
        borough_data["created_date"],
        borough_data["count"],
        "o",
        color=colors[i % len(colors)],
        markersize=10,
        label="Observed"
    )

    # Seasonal average line
    ax.plot(
        borough_data["created_date"],
        borough_data["seasonal_avg_prev_year"],
        color="black",
        linewidth=4,
        label=f"Seasonal Avg (Window = {rs_window} and Shift = {rs_season})"
    )

    ax.set_title(f"{borough}", fontsize=35)
    ax.set_xlabel("Date", fontsize=15)
    ax.set_ylabel("Number of Rat Sightings", fontsize=25)
    ax.grid(True)
    ax.set_ylim(0,70)
    ax.tick_params(axis='x', labelsize=24)
    ax.tick_params(axis='y', labelsize=24)
    ax.legend(fontsize=22)

plt.suptitle("Rat Sightings in NYC Over Time by Borough", fontsize=36)
plt.show()

In [None]:
def seasonal_average_forecast(data, target_dates, years_back=5, day_window=5):
    """Forecast each target date as the mean count of nearby days in earlier years.

    For every target date, the forecast is the mean of ``count`` over all rows
    whose calendar year lies in ``[target_year - years_back, target_year)`` and
    whose day-of-year is within ``day_window`` days of the target's day-of-year.
    The day-of-year distance wraps around New Year, so a target in early
    January also uses late-December rows (and vice versa).

    Parameters
    ----------
    data : DataFrame with columns ['created_date', 'count']; ``created_date``
        may hold anything ``pd.to_datetime`` accepts. The input is not modified.
    target_dates : DatetimeIndex of future dates
    years_back : number of calendar years before the target year to draw from
    day_window : half-width of the day-of-year window, in days

    Returns
    -------
    pd.Series indexed by ``target_dates``; NaN where no historical row matches.
    """
    df = data.copy()

    # ensure datetime type
    df["created_date"] = pd.to_datetime(df["created_date"])
    df["doy"] = df["created_date"].dt.dayofyear
    df["year"] = df["created_date"].dt.year
    # Length of each row's own year; used to measure the distance "the other
    # way round" the calendar. Across a leap/non-leap boundary this can be off
    # by one day, which is small compared with the window.
    year_len = np.where(df["created_date"].dt.is_leap_year, 366, 365)

    forecasts = []
    for target_date in target_dates:
        target_doy = target_date.dayofyear
        target_year = target_date.year

        in_lookback_years = (
            (df["year"] >= target_year - years_back) &
            (df["year"] < target_year)
        )
        # circular day-of-year distance: shortest way around the calendar
        straight_gap = np.abs(df["doy"] - target_doy)
        circular_gap = np.minimum(straight_gap, year_len - straight_gap)

        mask = in_lookback_years & (circular_gap <= day_window)
        forecasts.append(df.loc[mask, "count"].mean())

    return pd.Series(forecasts, index=target_dates)


# ensure global dataframe is datetime
cdate_borough["created_date"] = pd.to_datetime(cdate_borough["created_date"])

# forecast settings, named once so the legend and title always match the
# values actually passed to seasonal_average_forecast
years_to_forecast = 1     # number of years to forecast
forecast_years_back = 5   # how many earlier calendar years feed each forecast
forecast_day_window = 5   # half-width (days) of the day-of-year window

# future dates: the day after the last observation onwards, one point per day
last_date = cdate_borough["created_date"].max()
future_dates = pd.date_range(
    start=last_date + pd.Timedelta(days=1),
    periods=365 * years_to_forecast,
    freq="D"
)

fig = plt.figure(figsize=(50,80))
gs = gridspec.GridSpec(5,1, figure=fig, wspace=0.3, hspace=0.3)

colors = ["r", "b", "g", "purple", "b"]

for i, borough in enumerate(boroughs):
    ax = fig.add_subplot(gs[i])

    borough_data = (
        cdate_borough[cdate_borough["borough"] == borough]
        .sort_values("created_date")
        .copy()
    )

    # compute seasonal-average forecast
    # due to possibly sparse data, we use a window
    forecast = seasonal_average_forecast(
        borough_data,
        future_dates,
        years_back=forecast_years_back,
        day_window=forecast_day_window
    )

    # plot observed data
    ax.plot(
        borough_data["created_date"],
        borough_data["count"],
        "o",
        color=colors[i],
        markersize=10,
        label="Observed"
    )

    # plot forecast; the label is built from the settings above
    # ("\u00b1" is the plus-minus sign, escaped to avoid encoding damage)
    ax.plot(
        forecast.index,
        forecast.values,
        color="black",
        linewidth=5,
        linestyle="--",
        label=f"{forecast_years_back}-Year Seasonal Avg Forecast (\u00b1{forecast_day_window} days)"
    )

    ax.set_title(f"{borough}", fontsize=35)
    ax.set_xlabel("Date", fontsize=15)
    ax.set_ylabel("Number of Rat Sightings", fontsize=25)
    ax.grid(True)
    ax.set_ylim(0,70)
    ax.tick_params(axis='x', labelsize=24)
    ax.tick_params(axis='y', labelsize=24)
    ax.legend(fontsize=22)

plt.suptitle(f"Rat Sightings in NYC: {forecast_years_back}-Year Seasonal Average Forecast", fontsize=36)
plt.show()

### EDA: Cumulative Rat Sightings and Linear Regression Test

We consider the cumulative number of rat sightings over time. We quickly fit a linear regression model (without any cross-validation!) to see whether there are any trends worth considering.

In [None]:
# Running total of sightings: order the table by creation time, then number
# the rows 1..N so that every sighting carries its cumulative rank.
rs['created_date'] = pd.to_datetime(rs['created_date'])
rs = rs.sort_values('created_date')
n_sightings = len(rs)
rs['cumulative_count'] = np.arange(1, n_sightings + 1)

plt.figure(figsize=(40,20))
plt.plot(rs['created_date'], rs['cumulative_count'], 'o', alpha=0.75)
plt.title("Cumulative Rat Sightings in NYC Over Time", fontsize=24)
plt.xlabel("Date", fontsize=20)
plt.ylabel("Cumulative Number of Rat Sightings", fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()

In [None]:
# A basic linear regression of the cumulative sighting count against time,
# fitted twice: on the full table and on the 2020-2022 subset. Both fits are
# evaluated on their own training data only (no hold-out set).

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Convert created_date to ordinal for regression
rs['created_date_ordinal'] = rs['created_date'].apply(lambda x: x.toordinal())
# Prepare the data for regression (scikit-learn expects a 2-D feature array)
X = rs['created_date_ordinal'].values.reshape(-1, 1)
y = rs['cumulative_count'].values
# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)
# Predict the values and calculate R^2 score
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
print(f"R^2 score of the linear regression model (2020-present): {r2:.4f}")
print(f"The model's linear equation (2020-present) is: cumulative_count = {model.coef_[0]:.4f} * created_date_ordinal + {model.intercept_:.4f}")
plt.figure(figsize=(40,20))
plt.plot(rs['created_date'], rs['cumulative_count'], 'o', alpha=0.75, label='Actual Data')
plt.plot(rs['created_date'], y_pred, color='red', label='Linear Fit')
plt.xlabel("Date", fontsize=20)
plt.ylabel("Cumulative Number of Rat Sightings", fontsize=20)
plt.title("Cumulative Rat Sightings in NYC Over Time with Linear Fit from 2020-2026", fontsize=24)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend()
plt.show()

# Second fit: sightings created in 2020-2022 only. Take an explicit copy so
# that the column assignment below writes to an independent frame instead of
# a slice of `rs` (which triggers pandas' SettingWithCopyWarning).
mask = (rs['created_date'].dt.year >= 2020) & (rs['created_date'].dt.year <= 2022)
filtered_data = rs[mask].copy()

# Convert created_date to ordinal for regression
filtered_data['created_date_ordinal'] = filtered_data['created_date'].apply(lambda x: x.toordinal())

# Reshape
X_filtered = filtered_data['created_date_ordinal'].values.reshape(-1, 1)
y_filtered = filtered_data['cumulative_count'].values

# Fit the linear regression model
model_filtered = LinearRegression()
model_filtered.fit(X_filtered, y_filtered)

# Predict values and calculate R^2 score
y_pred_filtered = model_filtered.predict(X_filtered)
r2_filtered = r2_score(y_filtered, y_pred_filtered)
print(f"R^2 score of the linear regression model (2020-2022): {r2_filtered:.4f}")
print(f"The model's linear equation (2020-2022) is: cumulative_count = {model_filtered.coef_[0]:.4f} * created_date_ordinal + {model_filtered.intercept_:.4f}")

# Plot original data and regression line
plt.figure(figsize=(40,20))
plt.plot(filtered_data['created_date'], filtered_data['cumulative_count'], 'o', alpha=0.75, label='Actual Data (2020-2022)')
plt.plot(filtered_data['created_date'], y_pred_filtered, color='red', label='Linear Fit (2020-2022)')
plt.xlabel("Date", fontsize=20)
plt.ylabel("Cumulative Number of Rat Sightings", fontsize=20)
plt.title("Cumulative Rat Sightings in NYC Over Time with Linear Fit (2020-2022)", fontsize=24)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend()
plt.show()

In [None]:
# Cumulative rat sightings over time, one curve per borough.
plt.figure(figsize=(35,20))
# dropna(): a missing borough can never equal itself in the filter below, so
# it would only add an empty curve with a "nan" entry to the legend.
for borough in rs['borough'].dropna().unique():
    borough_data = rs[rs['borough'] == borough]
    borough_data = borough_data.sort_values('created_date')
    # number this borough's sightings 1..k in chronological order
    borough_data['cumulative_count'] = np.arange(1, len(borough_data) + 1)
    plt.plot(borough_data['created_date'], borough_data['cumulative_count'], 'o', alpha=0.75, label=borough)
plt.xlabel("Date", fontsize=20)
plt.ylabel("Cumulative Number of Rat Sightings", fontsize=20)
plt.title("Cumulative Rat Sightings in NYC Over Time by Borough", fontsize=24)
plt.legend()
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()

### EDA: ACF and PACF Graphs for Rat Sightings Data

In [None]:
import statsmodels.api as sm

In [None]:
# Correlogram (up to lag 700) followed by the partial correlogram (up to
# lag 365) of the citywide daily sighting counts. A lag is a number of rows
# of cdate_rat, i.e. of dates that have at least one sighting.
for draw_correlogram, n_lags, y_label in (
    (sm.graphics.tsa.plot_acf, 700, "Autocorrelation"),
    (sm.graphics.tsa.plot_pacf, 365, "Partial Autocorrelation"),
):
    fig, ax = plt.subplots(1, 1, figsize=(55,36))
    draw_correlogram(cdate_rat['count'], lags=n_lags, ax=ax)
    ax.set_xlabel("Lag", fontsize=30)
    ax.set_ylabel(y_label, fontsize=50)
    plt.xticks(fontsize=24)
    plt.yticks(fontsize=24)
    plt.show()

The autocorrelation graph trends toward zero, which is consistent with the seasonality we observed in the rat sightings data (more rat sightings in the summer than in the winter).

### EDA: Holt-Winters Forecast for Rat Sightings Data

In [None]:
from statsmodels.tsa.api import ExponentialSmoothing
from datetime import date


In [None]:
# Chronological split: everything before 2025-01-01 trains the model,
# everything from that day on is held out for testing.
split_day = date.fromisoformat("2025-01-01")
created_day = cdate_rat['created_date']
cdate_rat_test = cdate_rat[created_day >= split_day].copy()
cdate_rat_train = cdate_rat[created_day < split_day].copy()

# Exponential smoothing with a multiplicative seasonal component of period
# 365; the smoothing parameters are fixed by hand (optimized=False).
hw_model = ExponentialSmoothing(
    cdate_rat_train['count'].values,
    seasonal='mul',
    seasonal_periods=365,
)
holt_winter = hw_model.fit(
    smoothing_level=0.1,
    smoothing_trend=0.1,
    smoothing_seasonal=.8,
    optimized=False,
)

In [None]:
# Training data, in-sample fit, test data and out-of-sample forecast on one axis.
plt.figure(figsize=(150,40))

# one forecast step per row of the test set
hw_forecast = holt_winter.forecast(len(cdate_rat_test['created_date']))

curves = [
    (cdate_rat_train, cdate_rat_train['count'], 'b-', {"label": "Training Data"}),
    (cdate_rat_train, holt_winter.fittedvalues, 'g-', {"label": "Holt-Winter Fit"}),
    (cdate_rat_test, cdate_rat_test['count'], 'r-', {"label": "Test Data", "alpha": 0.2}),
    (cdate_rat_test, hw_forecast, 'r--', {"label": "Holt-Winters Predictions"}),
]
for frame, y_values, line_fmt, line_kwargs in curves:
    plt.plot(frame['created_date'], y_values, line_fmt, **line_kwargs)

plt.title("Holt-Winters Prediction tested on 2020-2025", fontsize=90)
plt.xlabel("Date", fontsize=80)
plt.ylabel("Rat Sightings by Day", fontsize=80)

plt.xticks(fontsize=64)
plt.yticks(fontsize=60)

plt.legend(fontsize=60)

plt.show()

In [None]:
# Holt-Winters again, but trained only on 2023-01-01 .. 2024-12-31 and
# tested on everything from 2025-01-01 onwards.
train_start = date.fromisoformat("2023-01-01")
test_start = date.fromisoformat("2025-01-01")

cdate_rat_test = cdate_rat[cdate_rat['created_date']>=test_start].copy()

# Both bounds are evaluated on the frame that is being filtered. (Indexing the
# already-filtered training frame with a mask built from the full `cdate_rat`
# makes pandas silently re-align a mask of a different length.)
in_train_window = (
    (cdate_rat['created_date'] >= train_start) &
    (cdate_rat['created_date'] < test_start)
)
cdate_rat_train = cdate_rat[in_train_window].copy()

# multiplicative seasonality with period 365, hand-set smoothing parameters
holt_winter = ExponentialSmoothing(cdate_rat_train['count'].values,
                                      seasonal='mul',
                                      seasonal_periods=365).fit(smoothing_level=0.1,
                                                                  smoothing_trend=0.1,
                                                                  smoothing_seasonal=.8,
                                                                  optimized=False)


plt.figure(figsize=(30,3))

plt.plot(cdate_rat_train['created_date'], 
         cdate_rat_train['count'],
         'b-', alpha=0.4,
         label="Training Data")
plt.plot(cdate_rat_train['created_date'], 
         holt_winter.fittedvalues, 
         'g-',
         label="Holt-Winter Fit", alpha = 0.5)

plt.plot(cdate_rat_test['created_date'], 
         cdate_rat_test['count'],
         'r-',
         label="Test Data", alpha=0.2)

# one forecast step per row of the test set
plt.plot(cdate_rat_test['created_date'], 
         holt_winter.forecast(len(cdate_rat_test['created_date'])),
         'r--',
         label="Holt-Winters Predictions")


plt.xlabel("Date", fontsize=10)
plt.ylabel("Rat Sightings by Day", fontsize=10)
         
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.title("Holt-Winters Prediction tested on 2023-2025", fontsize=20)
         
plt.legend(fontsize=10)

plt.show()

In [None]:
# Holt-Winters trained on 2023-01-01 .. 2025-12-31 and tested on everything
# from 2026-01-01 onwards.
train_start = date.fromisoformat("2023-01-01")
test_start = date.fromisoformat("2026-01-01")

cdate_rat_test = cdate_rat[cdate_rat['created_date']>=test_start].copy()

# Build the whole training mask on `cdate_rat` itself; filtering the reduced
# training frame with a mask computed on the full frame relies on pandas
# re-aligning a mismatched boolean index.
in_train_window = (
    (cdate_rat['created_date'] >= train_start) &
    (cdate_rat['created_date'] < test_start)
)
cdate_rat_train = cdate_rat[in_train_window].copy()

# multiplicative seasonality with period 365, hand-set smoothing parameters
holt_winter = ExponentialSmoothing(cdate_rat_train['count'].values,
                                      seasonal='mul',
                                      seasonal_periods=365).fit(smoothing_level=0.1,
                                                                  smoothing_trend=0.1,
                                                                  smoothing_seasonal=.8,
                                                                  optimized=False)


plt.figure(figsize=(15,5))

plt.plot(cdate_rat_train['created_date'], 
         cdate_rat_train['count'],
         'b-', alpha=0.8,
         label="Training Data")
plt.plot(cdate_rat_train['created_date'], 
         holt_winter.fittedvalues, 
         'g-',
         label="Holt-Winter Fit", alpha = 0.5)

plt.plot(cdate_rat_test['created_date'], 
         cdate_rat_test['count'],
         'r-',
         label="Test Data", alpha=0.2)

# one forecast step per row of the test set
plt.plot(cdate_rat_test['created_date'], 
         holt_winter.forecast(len(cdate_rat_test['created_date'])),
         'r--',
         label="Holt-Winters Predictions")


plt.xlabel("Date", fontsize=10)
plt.ylabel("Rat Sightings by Day", fontsize=10)
         
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.title("(by day) Holt-Winters Prediction tested on 2023-2026", fontsize=20)
         
plt.legend(fontsize=10)

plt.show()

There's a lot of noise in the above data, and the Holt-Winters forecast seems to do just "okay".
Perhaps grouping entries by week instead of by day would lead to less noise.
It is also worth asking the following question: if we focused on data by borough or ZIP code, would we get "better" forecasts and make better inferences?

### EDA: By Weekly Number of Sightings

In [None]:
# Weekly citywide totals: tag every sighting with its ISO week number and a
# constant 1, then add the ones up in weekly bins anchored on Mondays.

rs['week'] = rs['created_date'].dt.isocalendar().week
rs['dummy_count']=1

sightings_by_time = rs.set_index('created_date')
weekly_city = sightings_by_time['dummy_count'].resample('W-MON').sum()


weekly_city


In [None]:
# Weekly totals (line plus translucent fill) with an 8-week rolling mean on top.
weekly_city.index = pd.to_datetime(weekly_city.index)
rolling_8w = weekly_city.rolling(window=8, min_periods=1).mean()

plt.figure(figsize=(12, 4))

plt.plot(
    weekly_city,
    label="Weekly Count",
    linewidth=1.2,
    color="steelblue",
)
plt.fill_between(
    weekly_city.index,
    weekly_city.values,
    alpha=0.25,   # lower opacity
    label="Weekly Count (filled)",
)
plt.plot(
    rolling_8w,
    color="crimson",
    linewidth=2,
    label="8-wk rolling avg",
)

plt.title("Weekly Counts with 8-Week Rolling Average")
plt.xlabel("Week")
plt.tick_params(axis="x", rotation=30)
plt.ylabel("Sightings")

plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


## EDA on Rat Inspection Data

We do some EDA on the rat inspection data.

In [None]:
# Read every CSV in the cleaned inspection folder and stack them into one
# table with a fresh 0..N-1 index.
path = r'../scr/data/cleaned_rat_inspection_data' 
csv_pattern = os.path.join(path , "*.csv")
all_files = glob.glob(csv_pattern)
inspection_frames = [pd.read_csv(f) for f in all_files]
ri = pd.concat(inspection_frames, ignore_index=True)

# quick look: five random rows and the column names
display(ri.sample(5))
print(f"\n The columns are\n{ri.columns}")

In [None]:
# Make sure inspection_date is in datetime format; later cells use the
# `.dt` accessor on this column.
ri['inspection_date'] = pd.to_datetime(ri['inspection_date']) 

### EDA: Time Series

In [None]:
# Daily inspection counts, split into inspections whose result is
# 'Failed for Rat Act' and all remaining inspections.
# .copy(): later cells add and drop columns on failed_rat_act, which must not
# happen on a slice of `ri`.
failed_rat_act = ri[ri['result'] == 'Failed for Rat Act'].copy()
failedidate = failed_rat_act.groupby(failed_rat_act['inspection_date'].dt.date).size().reset_index(name='count')
notfail = ri[ri['result'] != 'Failed for Rat Act']
idate = notfail.groupby(notfail['inspection_date'].dt.date).size().reset_index(name='count')


plt.figure(figsize=(35,20))
# The blue series is every inspection that did NOT fail for rat activity (it
# is not "all inspections", and not necessarily only passes), so the legend
# and title describe it that way.
plt.plot(idate['inspection_date'], idate['count'], 'o', color="b", alpha=0.50, label='Inspections Not Failed for Rat Activity')
plt.plot(failedidate['inspection_date'], failedidate['count'], 'o', color="r", alpha=0.50, label='Failed due to Rat Activity')
plt.xlabel('Inspection Date', size= 18)
plt.ylabel('Count of Inspections', size= 18)
plt.legend(prop={'size': 20})
plt.title('Count of Inspections Over Time (Blue = Not Failed for Rat Activity, Red = Failed due to Rat Activity)', size =20)
plt.show()

In [None]:
# Daily number of inspections that failed for rat activity, on a wide canvas.
plt.figure(figsize=(60,15))
plt.plot(
    failedidate['inspection_date'],
    failedidate['count'],
    'o',
    color="r",
    alpha=0.50,
    label='Failed Inspections',
)
plt.title('Failed Inspections Over Time', size =30)
plt.xlabel('Inspection Date', size= 24)
plt.ylabel('Count of Inspections', size= 24)
plt.show()

In [None]:
# Daily counts of inspections failed for rat activity, per (date, borough)
failed_rat_act_by_borough = failed_rat_act.groupby([failed_rat_act['inspection_date'].dt.date, 'borough']).size().reset_index(name='count')

# Boroughs to draw: every label that is neither missing nor 'Unspecified'.
# NOTE: this rebinds the notebook-level `boroughs` list.
boroughs = [b for b in failed_rat_act_by_borough['borough'].unique() if pd.notnull(b) and b != 'Unspecified']

# One panel per borough in a single column; at least the original five rows,
# more only if additional borough labels are present.
fig = plt.figure(figsize=(120,70))
gs = gridspec.GridSpec(max(len(boroughs), 5), 1, figure=fig, wspace=0.3, hspace=0.3)

colors = ["r", "b", "g", "purple", "b"]

for i, borough in enumerate(boroughs):
    ax = fig.add_subplot(gs[i])
    borough_data = failed_rat_act_by_borough[failed_rat_act_by_borough['borough'] == borough]
    ax.plot(
        borough_data['inspection_date'],
        borough_data['count'],
        'o',
        color=colors[i % len(colors)],  # cycle instead of failing on a 6th label
        markersize=10,
    )
    ax.set_title(f"{borough}", fontsize=35)
    ax.set_xlabel("Date", fontsize=15)
    ax.set_ylabel("Number of Failed Rat Inspections by Borough", fontsize=25)
    ax.grid(True)
    ax.set_ylim(0,70)  # shared y-range so panels are comparable
    ax.tick_params(axis='x', labelsize=24)
    ax.tick_params(axis='y', labelsize=24)

# the figure shows failed inspections, not sightings
plt.suptitle("Failed Rat Inspections in NYC Over Time by Borough", fontsize=36)
plt.show()

### EDA: Hexbin Maps

In [None]:
import plotly.figure_factory as ff

# Hexagonal density map of all inspections. A temporary column of ones is
# added so that summing it inside each hexagon yields the inspection count;
# the column is removed again once the figure has been shown.
ri['dummy_count'] = 1

hexbin_options = dict(
    data_frame=ri,
    lat="latitude",
    lon="longitude",
    nx_hexagon=80,             # Number of hexagons in x direction
    color="dummy_count",       # Sum of dummy_count = number of occurrences
    agg_func=np.sum,           # Sum the dummy column
    opacity=0.85,
    labels={"color": "Number of Inspections"},
)
fig = ff.create_hexbin_mapbox(**hexbin_options)

fig.update_layout(
    mapbox_style="open-street-map",
    margin={"b": 0, "t": 0, "l": 0, "r": 0},
)
fig.show()

ri.drop(columns=['dummy_count'], inplace=True)


In [None]:
# Hexagonal density map of the inspections that failed for rat activity.
# The column of ones is attached to a temporary copy (DataFrame.assign), so
# failed_rat_act itself is never modified: nothing is written to a filtered
# slice and no stray column is left behind if plotting fails.
failed_for_map = failed_rat_act.assign(dummy_count=1)

fig = ff.create_hexbin_mapbox(
    data_frame=failed_for_map,
    lat="latitude",
    lon="longitude",
    nx_hexagon=80,             # Number of hexagons in x direction
    color="dummy_count",       # Sum of dummy_count = number of occurrences
    agg_func=np.sum,           # Sum the dummy column
    opacity=0.85,
    labels={"color": "Number of Failed Inspections"},  # this map shows failures only
)

fig.update_layout(
    mapbox_style="open-street-map",
    margin=dict(b=0, t=0, l=0, r=0),
)
fig.show()


### EDA: Cumulative Failed Rat Inspections

In [None]:
# Running total of failed rat inspections: sort chronologically, then number
# the rows 1..N.
failed_rat_act = failed_rat_act.sort_values('inspection_date')
n_failed = len(failed_rat_act)
failed_rat_act['cumulative_count'] = np.arange(1, n_failed + 1)

plt.figure(figsize=(40,20))
plt.plot(
    failed_rat_act['inspection_date'],
    failed_rat_act['cumulative_count'],
    'o',
    alpha=0.75,
)
plt.title("Cumulative Number of Failed Rat Inspections in NYC Over Time", fontsize=24)
plt.xlabel("Date", fontsize=20)
plt.ylabel("Cumulative Number of Failed Rat Inspections", fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()

In [None]:
# Cumulative failed rat inspections over time, one curve per borough.
plt.figure(figsize=(35,20))
# dropna(): a missing borough never matches the equality filter below and
# would only contribute an empty curve labelled "nan" to the legend.
for borough in failed_rat_act['borough'].dropna().unique():
    borough_data = failed_rat_act[failed_rat_act['borough'] == borough]
    borough_data = borough_data.sort_values('inspection_date')
    # number this borough's failed inspections 1..k in chronological order
    borough_data['cumulative_failed'] = np.arange(1, len(borough_data) + 1)
    plt.plot(borough_data['inspection_date'], borough_data['cumulative_failed'], 'o', alpha=0.75, label=borough)
plt.xlabel("Date", fontsize=20)
plt.ylabel("Cumulative Number of Failed Inspections due to Rat Activity", fontsize=20)
plt.title("Cumulative Failed Inspections due to Rat Activity in NYC Over Time by Borough", fontsize=24)
plt.legend()
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()

In [None]:
# Cumulative failed rat inspections over time for the ten ZIP codes with the
# most failed inspections.

failed_rat_act['dummy_count'] = 1  # one unit per failed inspection
fri = failed_rat_act

top_zips_list = fri.groupby("zip_code")["dummy_count"].sum().nlargest(10).index.tolist()

# failed inspections per (timestamp, ZIP); a row can stand for several inspections
fri_pivot = fri[fri["zip_code"].isin(top_zips_list)].groupby(
    ["inspection_date", "zip_code"]
)["dummy_count"].sum().reset_index()

fig, ax = plt.subplots(figsize=(14, 8))

for zip_code in sorted(top_zips_list):
    zip_data = fri_pivot[fri_pivot["zip_code"] == zip_code].sort_values("inspection_date")
    # Accumulate the per-timestamp counts. Numbering the rows 1..k would
    # under-count whenever several inspections share one timestamp, because
    # those were merged into a single row by the aggregation above.
    zip_data = zip_data.assign(cumu_sum=zip_data["dummy_count"].cumsum())
    ax.plot(zip_data["inspection_date"], zip_data["cumu_sum"], marker='.', label=f"ZIP {zip_code}", markersize=0.1)

ax.set_xlabel("Inspection Date", fontsize=12)
ax.set_ylabel("Cumulative Number of Failed Inspections", fontsize=12)
ax.set_title("Cumulative Number of Failed Inspections in Zip Codes with most Failed Inspections over Time", fontsize=14, fontweight='bold')
ax.legend(loc='best', ncol=2, fontsize=12)
ax.grid(True)
plt.show()


### EDA: Missingness Analysis

In [None]:
# Missingness overview of the inspection table, drawn with missingno:
# first the matrix view, then the heatmap view.
msno.matrix(ri)
msno.heatmap(ri)

### EDA: Autocorrelation Graphs for Rat Inspections.

In [None]:
# Failed rat inspections per calendar day (one row per date with at least one failure)
inspection_day = failed_rat_act['inspection_date'].dt.date
failed_cdate_rat = failed_rat_act.groupby(inspection_day).size().reset_index(name='count')

# Correlogram (up to lag 700) followed by the partial correlogram (up to
# lag 365); a lag is a number of rows of failed_cdate_rat.
for draw_correlogram, n_lags, y_label in (
    (sm.graphics.tsa.plot_acf, 700, "Autocorrelation"),
    (sm.graphics.tsa.plot_pacf, 365, "Partial Autocorrelation"),
):
    fig, ax = plt.subplots(1, 1, figsize=(55,36))
    draw_correlogram(failed_cdate_rat['count'], lags=n_lags, ax=ax)
    ax.set_xlabel("Lag", fontsize=30)
    ax.set_ylabel(y_label, fontsize=30)
    plt.xticks(fontsize=24)
    plt.yticks(fontsize=24)
    plt.show()

## EDA on IRS ZIP Data

We do some EDA on the IRS zip code data. It has already been cleaned to extract only the adjusted gross income, zip code, and ranges for adjusted gross income.

In [None]:
# Load the cleaned IRS ZIP-level AGI table.
irs = pd.read_csv("../scr/data/cleaned_irs_zip/cleaned_2011_2022_ZIP_AGI.csv")
# The cleaned IRS data covers areas that are not relevant to us, so keep only
# the rows whose ZIP code also appears in the rat sightings data.
irs = irs[irs['zip'].isin(rs['zip'].unique())]
# Show two random rows as a sanity check.
irs.sample(2)

In [None]:
## ZIP code vs AGI trend over the years (top 10 ZIP codes by total AGI)
df = irs.copy()

agi_by_zip = df.groupby("zip")["agi"].sum()
top_zips_list = agi_by_zip.nlargest(10).index.tolist()

# yearly AGI totals for the selected ZIP codes only
in_top_zips = df["zip"].isin(top_zips_list)
df_pivot = (
    df[in_top_zips]
    .groupby(["year", "zip"])["agi"]
    .sum()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(14, 8))

# one line per ZIP code, drawn in ascending ZIP order
for zip_code in sorted(top_zips_list):
    zip_data = df_pivot[df_pivot["zip"] == zip_code].sort_values("year")
    ax.plot(
        zip_data["year"],
        zip_data["agi"],
        marker='o',
        label=f"ZIP {zip_code}",
        linewidth=2,
    )

ax.set_title("AGI Trends Over Years - Top 10 ZIP Codes in NYC", fontsize=14, fontweight='bold')
ax.set_xlabel("year", fontsize=12)
ax.set_ylabel("Adjusted Gross Income (AGI)", fontsize=12)
ax.legend(loc='best', ncol=2, fontsize=10)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
## Grouped bars: total AGI per income bracket, side by side, for the top 10 ZIP codes
agi_by_zip = df.groupby("zip")["agi"].sum()
top_zips_overall = agi_by_zip.nlargest(10).index.tolist()

# ZIP x bracket table of summed AGI (0 where a bracket is absent)
in_top_zips = df["zip"].isin(top_zips_overall)
pivot_data = (
    df[in_top_zips]
    .groupby(["zip", "Size of adjusted gross income"])["agi"]
    .sum()
    .unstack(fill_value=0)
)

# put the bracket columns in ascending income order, skipping any that are missing
bracket_order = [
    "$1 under $25,000",
    "$25,000 under $50,000",
    "$50,000 under $75,000",
    "$75,000 under $100,000",
    "$100,000 under $200,000",
    "$200,000 or more",
]
present_brackets = [col for col in bracket_order if col in pivot_data.columns]
pivot_data = pivot_data[present_brackets]

fig, ax = plt.subplots(figsize=(16, 8))
x = range(len(pivot_data))
width = 0.13 
n_brackets = len(pivot_data.columns)
colors = plt.cm.Set3(range(n_brackets))

# shift every bracket's bars so that the group is centred on the ZIP's tick
for i, bracket in enumerate(pivot_data.columns):
    offset = (i - n_brackets / 2) * width + width / 2
    bar_positions = [p + offset for p in x]
    ax.bar(bar_positions, pivot_data[bracket].values, width, label=bracket, color=colors[i])

ax.set_title("Income Distribution by All Brackets - Using Top 10 ZIP Codes in NYC")
ax.set_xlabel("ZIP Code")
ax.set_ylabel("Total AGI (2011-2022)")
ax.set_xticks(x)
ax.set_xticklabels(pivot_data.index, rotation=45, ha='right')
ax.legend(loc='upper left', fontsize=9)
plt.tight_layout()
plt.show()


In [None]:
# ZIP code vs AGI, one small horizontal bar chart per year (top 10 ZIPs each)
years = sorted(df['year'].unique())
n_years = len(years)

# Four panels per row. Keep the original 3x4 layout (and figure size) for up
# to 12 years, and add rows instead of overflowing the grid if there are more.
n_cols = 4
n_rows = max(3, -(-n_years // n_cols))  # -(-a // b) is ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 14 * n_rows / 3))
axes = axes.flatten()

for idx, year in enumerate(years):
    df_year = df[df['year'] == year]
    zip_agi_year = df_year.groupby("zip")["agi"].sum().sort_values(ascending=False)
    top_zips = zip_agi_year.head(10)
    axes[idx].barh(range(len(top_zips)), top_zips.values)
    axes[idx].set_yticks(range(len(top_zips)))
    axes[idx].set_yticklabels(top_zips.index)
    axes[idx].set_xlabel("Total AGI")
    axes[idx].set_title(f"Top 10 ZIP Codes in NYC - {year}")
    axes[idx].invert_yaxis()  # largest AGI at the top

# hide the panels that have no year assigned
for idx in range(len(years), len(axes)):
    axes[idx].axis('off')

plt.tight_layout()
plt.show()


In [None]:
## Income Bracket Trends by County (All Brackets)
df_county = df.copy()

# The six counties (excluding the "Other" bucket) with the largest total AGI.
named_counties = df_county[df_county["county"] != "Other"]
major_counties = named_counties.groupby("county")["agi"].sum().nlargest(6).index

bracket_order = [
    "$1 under $25,000",
    "$25,000 under $50,000",
    "$50,000 under $75,000",
    "$75,000 under $100,000",
    "$100,000 under $200,000",
    "$200,000 or more",
]

# One panel per income bracket, one line per county in each panel.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

colors = plt.cm.tab10(range(len(major_counties)))

for idx, bracket in enumerate(bracket_order):
    ax = axes[idx]
    in_bracket = df_county["Size of adjusted gross income"] == bracket
    bracket_data = df_county[in_bracket]

    for color_idx, county in enumerate(major_counties):
        in_county = bracket_data["county"] == county
        county_data = bracket_data[in_county].groupby("year")["agi"].sum()
        # Skip counties that have no rows for this bracket.
        if county_data.empty:
            continue
        ax.plot(
            county_data.index,
            county_data.values,
            marker='o',
            label=county,
            color=colors[color_idx],
            linewidth=2,
        )

    ax.set_xlabel("Year")
    ax.set_ylabel("Total AGI")
    ax.set_title(f"Trend: {bracket}")
    ax.grid(True, alpha=0.3)
    ax.legend(fontsize=8)
    ax.tick_params(axis='x', rotation=45)

plt.suptitle("Income Trends by County - Each Income Bracket Separately (2011-2022)", fontsize=14, y=1.00)
plt.tight_layout()
plt.show()


In [None]:
## Year-over-Year Growth Rates
# Top panel: average year-over-year AGI growth across the ten ZIP codes with
# the largest total AGI. Bottom panel: per-ZIP growth between three-year
# windows (latest vs. the three years before it, and those three years vs.
# the first three years in the data).


def _pct_growth(current, previous):
    """Return the percent change from `previous` to `current`, aligned on index.

    Entries that are undefined (index missing on either side, or a zero
    baseline) come back as NaN rather than inf.
    """
    growth = (current - previous) / previous * 100
    return growth.replace([np.inf, -np.inf], np.nan)


def _span(span_years):
    """Format a list of years as 'first-last' for use in a legend label."""
    return f"{span_years[0]}-{span_years[-1]}" if span_years else "n/a"


top_10_zips = df.groupby("zip")["agi"].sum().nlargest(10).index.tolist()
all_years = sorted(df['year'].unique())
years_present = set(all_years)

yoy_growth = []
for year in all_years[1:]:
    prev_year = year - 1
    # Without data for the preceding year there is no baseline; skip the year
    # instead of reporting a misleading 0% growth.
    if prev_year not in years_present:
        continue

    df_prev = df[df['year'] == prev_year].groupby('zip')['agi'].sum()
    df_curr = df[df['year'] == year].groupby('zip')['agi'].sum()

    growth = _pct_growth(df_curr, df_prev)
    # .mean() skips NaN, so a ZIP missing in either year does not drag the
    # average towards zero.
    yoy_growth.append({'year': year, 'Growth_Rate': growth[growth.index.isin(top_10_zips)].mean()})

# Explicit columns keep the frame usable even when no year pair was available.
yoy_df = pd.DataFrame(yoy_growth, columns=['year', 'Growth_Rate'])

fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# YoY Growth Rate Trend
colors = ['green' if x > 0 else 'red' for x in yoy_df['Growth_Rate']]
axes[0].bar(yoy_df['year'], yoy_df['Growth_Rate'], color=colors, alpha=0.7)
axes[0].axhline(y=0, color='black', linestyle='-', linewidth=0.8)
axes[0].set_xlabel("Year")
axes[0].set_ylabel("Average YoY Growth (%)")
axes[0].set_title("Year-over-Year Growth Rate (Top 10 ZIP Codes)", fontweight='bold')
axes[0].grid(axis='y', alpha=0.3)

# Growth Rate by Top 10 ZIPs between three-year windows
recent_years = all_years[-3:]
earlier_years = all_years[-6:-3]
earliest_years = all_years[:3]

df_recent = df[df['year'].isin(recent_years)].groupby('zip')['agi'].sum()
df_earlier = df[df['year'].isin(earlier_years)].groupby('zip')['agi'].sum()
df_earliest = df[df['year'].isin(earliest_years)].groupby('zip')['agi'].sum()

# Labels are built from the windows actually used, so the legend states which
# period each series is measured against.
# NOTE(review): the "earlier" series uses the first three years as its
# baseline, which is not adjacent to the earlier window when the data spans
# more than nine years - confirm this is the intended comparison.
recent_label = f"Recent ({_span(recent_years)} vs {_span(earlier_years)})"
earlier_label = f"Earlier ({_span(earlier_years)} vs {_span(earliest_years)})"

# Undefined growth is drawn as a zero-length bar.
growth_comparison = pd.DataFrame({
    recent_label: _pct_growth(df_recent, df_earlier),
    earlier_label: _pct_growth(df_earlier, df_earliest),
}).fillna(0)

growth_comparison = growth_comparison[growth_comparison.index.isin(top_10_zips)].sort_values(recent_label, ascending=True)

growth_comparison[[recent_label, earlier_label]].plot(kind='barh', ax=axes[1], color=['steelblue', 'lightcoral'])
axes[1].set_xlabel("Growth Rate (%)")
axes[1].set_ylabel("ZIP Code")
axes[1].set_title("Growth Rates Comparison: Recent Period vs Earlier Period", fontweight='bold')
axes[1].grid(axis='x', alpha=0.3)
axes[1].axvline(x=0, color='black', linestyle='--', linewidth=0.8)

plt.tight_layout()
plt.show()


## Comparing Rat Sightings Data to Rat Inspection Data

In [None]:
# Daily counts of failed rat inspections (2020 onwards) and of rat sightings,
# drawn as two point series on a shared date axis.
is_failed = ri['result'] == 'Failed for Rat Act'
since_2020 = ri['inspection_date'] >= '2020-01-01'
failed_rat_act = ri[is_failed & since_2020]

inspection_day = failed_rat_act['inspection_date'].dt.date
failedidate = failed_rat_act.groupby(inspection_day).size().reset_index(name='count')

# Parse the sightings timestamp columns in place so that .dt accessors work.
for date_col in ('created_date', 'closed_date', 'resolution_action_updated_date'):
    rs[date_col] = pd.to_datetime(rs[date_col])

created_day = rs['created_date'].dt.date
cdate_rat = rs.groupby(created_day).size().reset_index(name='count')

plt.figure(figsize=(70, 35))
plt.plot(
    failedidate['inspection_date'],
    failedidate['count'],
    'o',
    color="r",
    alpha=0.75,
    label='Failed Inspections',
)
plt.plot(
    cdate_rat['created_date'],
    cdate_rat['count'],
    'o',
    alpha=0.75,
    label='Rat Sightings',
)
plt.xlabel('Date', size=18)
plt.ylabel('Count', size=18)
plt.legend(prop={'size': 40})
plt.title('Count of Rat Sightings and Failed Rat Inspections Over Time', size=20)
plt.xticks(fontsize=24)
plt.yticks(fontsize=24)
plt.show()



In [None]:
# Let's see whether rat sightings and failed rat inspections occur in the same places.
# We overlay both datasets as point clouds on longitude/latitude axes:
# failed inspections in blue, sightings in semi-transparent red on top.
plt.figure(figsize=(20, 20))
sns.scatterplot(y=failed_rat_act['latitude'], x=failed_rat_act['longitude'], color='blue', alpha=1, s=1, label='Failed Rat Inspections')
sns.scatterplot(y=rs['latitude'], x=rs['longitude'], color='red', alpha=0.5, s=1, label='Rat Sightings')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Rat Sightings and Rat Inspections in NYC')
plt.legend()
plt.show()

## Comparing IRS ZIP Data with Rat Sightings Data

In [None]:
# Total IRS AGI and number of rat-sighting rows per ZIP code, drawn as paired
# bars on two y-axes (AGI on the left, sighting counts on the right).

# --- Per-ZIP aggregates ---
irs_agi_by_zip = irs.groupby('zip', as_index=True)['agi'].sum()
rs_counts_by_zip = rs.groupby('zip').size()

# --- Align on ZIP; a ZIP missing from either source gets 0 ---
zip_data = pd.DataFrame({
    'IRS_AGI_SUM': irs_agi_by_zip,
    'RS_COUNT': rs_counts_by_zip,
}).fillna(0)

# Order ZIPs by total AGI, largest first.
# (To limit the chart to the top N ZIPs, take zip_data.head(N) here.)
zip_data = zip_data.sort_values('IRS_AGI_SUM', ascending=False)

# --- Plot with dual y-axes ---
fig, ax1 = plt.subplots(figsize=(75, 40))

# IRS AGI (left axis); position=0 and position=1 place the two bars of a
# ZIP on opposite sides of its tick.
zip_data['IRS_AGI_SUM'].plot(
    kind='bar', ax=ax1, color='tab:blue', position=0, width=0.4, label='IRS Total AGI'
)
ax1.set_ylabel('IRS Total AGI', size=30)
ax1.set_xlabel('ZIP Code', size=30)
ax1.tick_params(axis='x', labelsize=24)
ax1.tick_params(axis='y', labelsize=24)

# RS counts (right axis)
ax2 = ax1.twinx()
zip_data['RS_COUNT'].plot(
    kind='bar', ax=ax2, color='tab:orange', position=1, width=0.4, label='RS Count'
)
ax2.set_ylabel('RS Entry Count', size=30)
ax2.tick_params(axis='y', labelsize=24)

# --- Title & legend ---
plt.title('IRS Total AGI (Summed) vs RS Entry Counts by ZIP Code')

# Each axis only knows its own bars, so merge both sets into one legend.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(handles1 + handles2, labels1 + labels2, loc='upper right', fontsize=30)

plt.tight_layout()
plt.show()

## Comparing IRS ZIP Data with Rat Inspection Data
