# Rat sightings data cleaning

This notebook is a first exploration and clean-up of the NYC rat sightings data.

In [None]:
## Importing Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno


In [None]:
# Load the raw rat-sightings CSV; the path is relative to the current working directory.
# NOTE(review): the top-level folder is spelled "scr" — confirm this is not a typo for "src".
rat_sighting = pd.read_csv("scr/data/rat_sightings_data/Rat_Sightings_NYC.csv")

In [None]:
# Show five randomly chosen rows to get a feel for the raw data.
display(rat_sighting.sample(n=5))

# List every column name of the raw frame.
print("Below are the columns in the dataframe.\n")
display(rat_sighting.columns)

In [None]:
import plotly.figure_factory as ff

# Every row gets a weight of 1, so summing the weights that fall inside a
# hexagon gives the number of sightings in that hexagon.
rat_sighting['dummy_count'] = 1

# Parse the creation timestamp and derive the calendar month, so that one
# hexbin map per month can be drawn to look for seasonal patterns.
rat_sighting['Created Date'] = pd.to_datetime(rat_sighting['Created Date'])
rat_sighting['Month'] = rat_sighting['Created Date'].dt.month

# Layout settings shared by all twelve maps.
map_layout = dict(
    mapbox_style="open-street-map",
    margin=dict(b=0, t=0, l=0, r=0),
)

for month in range(1, 13):
    print(f"Hexbin map for month: {month}")
    month_data = rat_sighting.loc[rat_sighting['Month'] == month]
    fig = ff.create_hexbin_mapbox(
        data_frame=month_data,
        lat="Latitude",
        lon="Longitude",
        nx_hexagon=100,  # number of hexagons in the x direction
        color="dummy_count",  # column aggregated per hexagon
        agg_func=np.sum,  # sum of ones == number of sightings
        opacity=0.85,
        labels={"color": f"Number of Sightings for Month {month}"},
        color_continuous_scale="Viridis",
        zoom=10,
    )
    fig.update_layout(**map_layout)
    fig.show()

# The helper column was only needed for the maps; remove it again.
rat_sighting = rat_sighting.drop(columns=['dummy_count'])



General cleaning & neatness

In [None]:
# Normalise the column headers: keep only the text before any '(', trim
# surrounding whitespace, lowercase it and use underscores instead of spaces.
rat_sighting.columns = rat_sighting.columns.map(
    lambda header: header.partition('(')[0].strip().lower().replace(' ', '_')
)

# Bring the location_type values into the same trimmed / underscored / lowercase form.
location_labels = rat_sighting['location_type'].str.strip()
location_labels = location_labels.str.replace(' ', '_')
rat_sighting['location_type'] = location_labels.str.lower()

In [None]:
# Print the frame summary (column names, non-null counts, dtypes) after the header clean-up.
rat_sighting.info()

In [None]:
# Count the distinct values of every column (NaN counts as a value) and
# collect the columns that hold one single value throughout.
distinct_per_column = rat_sighting.nunique(dropna=False)
cols_to_drop = distinct_per_column.index[distinct_per_column == 1].tolist()

# Remove all of those constant columns with a single call.
rat_sighting = rat_sighting.drop(columns=cols_to_drop)

In [None]:
# Parse every date-like column into pandas datetimes.
for date_column in ('created_date', 'closed_date', 'resolution_action_updated_date'):
    rat_sighting[date_column] = pd.to_datetime(rat_sighting[date_column])


Consolidate redundant location_type values that have very similar names

In [None]:
# Frequency of each location_type label before consolidation, to spot near-duplicate names.
rat_sighting['location_type'].value_counts()

In [None]:
# Canonical location_type label -> the variant spellings that should be folded into it.
canonical_aliases = {
    '3+_family_apt._building': [
        '3+_family_apartment_building',
        '3+family_apt.',
        '3+_family_apt.',
        '3+_family_apt',
        'residential_building',
        'residence',
        'apartment',
    ],
    '1-2_family_dwelling': ['1-2_familydwelling'],
    'school/pre-school/nursery': [
        'school',
        'school/pre-school',
        'day_care_or_nursery',
        'day_care/nursery',
    ],
    'street_area': ['street'],
    'restaurant/bar/deli/bakery': ['restaurant'],
    'catch_basin/sewer': ['catch_basin_or_sewer'],
    'parking_lot/garage': ['parking_lot_or_garage'],
    'office/government_building': [
        'government_building',
        # NOTE(review): this alias contains a space, but spaces in location_type
        # were already replaced by underscores earlier, so it cannot match — confirm
        # the intended spelling.
        'office/government_ building',
    ],
    'other': ['other_(explain_below)'],
}

# Invert the table into the {variant: canonical} form that Series.replace expects.
mapping = {
    alias: canonical
    for canonical, aliases in canonical_aliases.items()
    for alias in aliases
}

# Rewrite every variant spelling to its canonical label.
rat_sighting['location_type'] = rat_sighting['location_type'].replace(mapping)

In [None]:
# Frequency of each location_type label again, to verify the consolidation.
rat_sighting['location_type'].value_counts()

Consolidate redundant columns

In [None]:
# Check whether park_borough and borough are redundant:
# Series.equals prints True only if the two columns are identical.
print(rat_sighting['park_borough'].equals(rat_sighting['borough']))

In [None]:
# Drop park_borough as a duplicate of borough.
# NOTE(review): the drop is unconditional — it does not depend on the result of the equality check.
rat_sighting = rat_sighting.drop(columns='park_borough')

In [None]:
# The `location` column repeats the coordinates as a "POINT (<lon> <lat>)"
# string; verify that it agrees with the longitude / latitude columns.

# Rows for which the point string and both coordinates are present.
not_null_mask = rat_sighting[['location', 'longitude', 'latitude']].notnull().all(axis=1)

# Capture group 0 is the longitude and group 1 the latitude,
# e.g. POINT (-73.9685 40.7540).
coords_from_point = rat_sighting['location'].str.extract(r'POINT \(([^ ]+) ([^)]+)\)').astype(float)

# Compare on the complete rows only, allowing a small absolute tolerance.
parsed = coords_from_point.loc[not_null_mask]
recorded = rat_sighting.loc[not_null_mask, ['longitude', 'latitude']]
lon_matches = np.isclose(parsed[0], recorded['longitude'], atol=1e-4)
lat_matches = np.isclose(parsed[1], recorded['latitude'], atol=1e-4)

# A row counts as a match only when both coordinates agree.
final_matches = np.logical_and(lon_matches, lat_matches)

print(pd.Series(final_matches).value_counts())

In [None]:
# Drop the `location` column since it is redundant with latitude and longitude.

rat_sighting = rat_sighting.drop(columns=['location'])

In [None]:
# This block is disabled because it does not run: it refers to `matches`, which is never defined in the cells above.

# # Create a temporary table of just the rows that don't match
# mismatches = rat_sighting[matches == False]

# # Show the community_board and borough columns side-by-side
# mismatches[['community_board', 'borough']].head(20)

In [None]:
# 1. Remove the word 'Unspecified' (any capitalisation) and trim the leftover whitespace
board_text = rat_sighting['community_board'].str.replace('Unspecified', '', case=False)
cleaned_board = board_text.str.strip()

# 2. Skip leading digits and whitespace and capture the rest (e.g. '03 BRONX' -> 'BRONX')
extracted_borough_cleaned = cleaned_board.str.extract(r'\d*\s*(.*)')

# 3. Compare the captured name with the borough column, row by row
matches_new = extracted_borough_cleaned[0].eq(rat_sighting['borough'])

# 4. Tally agreements and disagreements
print(matches_new.value_counts())

In [None]:
# Keep the rows whose community board does not agree with the borough column
mismatches = rat_sighting[~matches_new]

# Put the two columns next to each other for the first 20 such rows
mismatches[['community_board', 'borough']].head(n=20)

Investigate the dates reports were created vs closed

In [None]:
# Rows in which both timestamps are present
both_dates_exist = rat_sighting[['created_date', 'closed_date']].notnull().all(axis=1)

# Of those, the rows whose closing time differs from the creation time
dates_differ = rat_sighting['created_date'].ne(rat_sighting['closed_date'])
real_date_mismatches = rat_sighting[dates_differ & both_dates_exist]

print(f"Actual mismatches (excluding NaT): {len(real_date_mismatches)}")
real_date_mismatches[['created_date', 'closed_date']].head()

In [None]:
# Number of rows that have a non-null closed_date (Series.count skips missing values).
closed_count = rat_sighting['closed_date'].count()
closed_count

In [None]:
# How many reports carry each status value.
print(rat_sighting['status'].value_counts())

In [None]:
# Select the rows whose status is exactly 'Unspecified'
is_unspecified = rat_sighting['status'].eq('Unspecified')
unspecified_status_rows = rat_sighting.loc[is_unspecified]

# Show them
unspecified_status_rows

In [None]:
# Rows still marked 'In Progress' even though a closed_date is recorded
still_open = rat_sighting['status'].eq('In Progress')
has_closed_date = rat_sighting['closed_date'].notna()
in_progress_with_dates = rat_sighting.loc[still_open & has_closed_date]

# Show the identifying columns of those rows
in_progress_with_dates[['unique_key', 'status', 'created_date', 'closed_date']]

In [None]:
# Flag the rows that are 'In Progress' yet already carry a closed_date
inconsistent = (rat_sighting['status'] == 'In Progress') & rat_sighting['closed_date'].notnull()
status_mismatch = rat_sighting[inconsistent]

# Report how many there are, then list their key columns
shown_columns = ['unique_key', 'status', 'created_date', 'closed_date']
print(f"Rows that are 'In Progress' but have a date: {len(status_mismatch)}")
status_mismatch[shown_columns]

In [None]:
# Days between creation and closing. `.dt.days` is the (floored) days
# component of the time difference, so any report closed before it was
# created gets a negative value; rows with a missing date get NaN.
rat_sighting['days_to_close'] = (rat_sighting['closed_date'] - rat_sighting['created_date']).dt.days

# Reports that were apparently closed before they were created.
time_travelers = rat_sighting[rat_sighting['days_to_close'] < 0]

# Show the key columns so the date conflict is visible.
time_travelers[['unique_key', 'created_date', 'closed_date', 'days_to_close']]

How long it took for cases to be closed

In [None]:
# Work on an independent copy of the rows whose status is 'Closed'
closed_only = rat_sighting.loc[rat_sighting['status'] == 'Closed'].copy()

# Fractional days from creation to closing (86400 seconds per day)
elapsed = closed_only['closed_date'] - closed_only['created_date']
closed_only['days_to_close'] = elapsed.dt.total_seconds() / 86400

# Summary statistics of the time to close
print(closed_only['days_to_close'].describe())

In [None]:
# Save the cleaned rat_sighting data to a new CSV file (the row index is not written).
# The frame is written as it stands at this point, including the helper columns added in earlier cells.

rat_sighting.to_csv("scr/data/rat_sightings_data/Rat_Sightings_Cleaned.csv", index=False)


In [None]:
# Show five randomly chosen rows of the cleaned data.

rat_sighting.sample(5)

# Data Visualization

We try to visualize some of the data involved.

In [None]:
# Horizontal bar chart: number of sightings per location type.
plt.figure(figsize=(20, 14))

sns.set_style("whitegrid")
# location_type holds category labels, so the plot should have one bar per
# label. No `bins` argument is passed: a range over the number of rows is not
# a meaningful bin specification for labels.
sns.histplot(y=rat_sighting["location_type"])
plt.title("Distribution of Location Types of Rat Sightings")
plt.xlabel("Frequency of Rat Sightings")
plt.ylabel("Location Types")
plt.show()

In [None]:
# Bar chart: number of sightings per borough.
plt.figure(figsize=(20, 14))

sns.set_style("whitegrid")
# borough holds category labels, so the plot should have one bar per label.
# No `bins` argument is passed: a range over the number of rows is not a
# meaningful bin specification for labels.
sns.histplot(rat_sighting["borough"])
plt.title("Distribution of Boroughs of Rat Sightings")
plt.ylabel("Frequency of Rat Sightings")
plt.xlabel("Borough")
plt.show()

In [None]:
# Check the missingness of the data with missingno's matrix plot of the whole frame.

msno.matrix(rat_sighting)
plt.show()

In [None]:
# Heatmap of the missingness, drawn with missingno on the whole frame.

msno.heatmap(rat_sighting)
plt.show()

In [None]:

# Number of sightings per calendar day
sightings_by_day = rat_sighting.groupby(rat_sighting['created_date'].dt.date)
cdate_rat = sightings_by_day.size().reset_index(name='count')

# Scatter of the daily totals over time
plt.figure(figsize=(35, 20))
plt.plot(cdate_rat['created_date'], cdate_rat['count'], 'o', alpha=0.75)
plt.title("Rat Sightings in NYC Over Time", fontsize=24)
plt.xlabel("Date", fontsize=20)
plt.ylabel("Number of Rat Sightings", fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()

In [None]:
# Plot the daily number of rat sightings separately for each borough to see
# whether there are any borough-level trends.

# City-wide daily totals (one row per calendar day).
cdate_rat = rat_sighting.groupby(rat_sighting['created_date'].dt.date).size().reset_index(name='count')

# Daily totals per borough (one row per day and borough). The per-borough
# selection has to be made on this table: `cdate_rat` has one row per day,
# so a row-level mask built from `rat_sighting` does not line up with it.
borough_daily = (
    rat_sighting
    .groupby([rat_sighting['created_date'].dt.date, 'borough'])
    .size()
    .reset_index(name='count')
)

for borough in rat_sighting['borough'].unique():
    # Skip missing and placeholder borough values.
    if pd.isnull(borough) or borough == 'Unspecified':
        continue

    borough_data = borough_daily[borough_daily['borough'] == borough]

    plt.figure(figsize=(40, 20))
    # A random RGB colour is drawn for every figure.
    plt.plot(borough_data['created_date'], borough_data['count'], 'o', color=np.random.rand(3,), alpha=1, markersize=10, label=borough)
    plt.xlabel("Date", fontsize=20)
    plt.ylabel("Number of Rat Sightings", fontsize=20)
    plt.title(f"Rat Sightings in NYC Over Time in {borough}", fontsize=30)
    plt.grid(True)
    plt.xticks(fontsize=24)
    plt.yticks(fontsize=24)
    plt.show()

In [None]:
# Cumulative number of rat sightings over time: order the reports
# chronologically and number them 1..N, which is the running total at each report.
rat_sighting['created_date'] = pd.to_datetime(rat_sighting['created_date'])
rat_sighting = rat_sighting.sort_values(by='created_date')
rat_sighting['cumulative_count'] = np.arange(len(rat_sighting)) + 1

plt.figure(figsize=(35, 20))
plt.plot(rat_sighting['created_date'], rat_sighting['cumulative_count'], 'o', alpha=0.75)
plt.title("Cumulative Rat Sightings in NYC Over Time", fontsize=24)
plt.xlabel("Date", fontsize=20)
plt.ylabel("Cumulative Number of Rat Sightings", fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()

In [None]:
# Fit a straight line to the cumulative count as a function of the creation
# date, to see how well a constant rate of new sightings describes the data.
# created_date is the independent variable, cumulative_count the dependent one.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Express each date as its day ordinal so it can be used as a numeric regressor
rat_sighting['created_date_ordinal'] = [ts.toordinal() for ts in rat_sighting['created_date']]

# Feature matrix with a single column, and the target vector
X = rat_sighting[['created_date_ordinal']].to_numpy()
y = rat_sighting['cumulative_count'].to_numpy()

# Fit the model
model = LinearRegression().fit(X, y)

# Evaluate the fit on the same data it was trained on
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
print(f"R^2 score of the linear regression model: {r2:.4f}")
print(f"The model's linear equation is: cumulative_count = {model.coef_[0]:.4f} * created_date_ordinal + {model.intercept_:.4f}")

# Observed running total together with the fitted line
plt.figure(figsize=(50, 50))
plt.plot(rat_sighting['created_date'], rat_sighting['cumulative_count'], 'o', alpha=0.75, label='Actual Data')
plt.plot(rat_sighting['created_date'], y_pred, color='red', label='Linear Fit')
plt.title("Cumulative Rat Sightings in NYC Over Time with Linear Fit from 2020-2026", fontsize=24)
plt.xlabel("Date", fontsize=20)
plt.ylabel("Cumulative Number of Rat Sightings", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend()
plt.show()

In [None]:
# Repeat the linear fit on the sightings created in 2020 through 2022 only.
created_year = rat_sighting['created_date'].dt.year
mask = (created_year >= 2020) & (created_year <= 2022)

# Take an explicit copy so that adding a column below modifies an independent
# frame rather than a slice of `rat_sighting` (avoids SettingWithCopyWarning).
filtered_data = rat_sighting[mask].copy()

# Convert created_date to ordinal for regression
filtered_data['created_date_ordinal'] = filtered_data['created_date'].apply(lambda x: x.toordinal())

# Single-column feature matrix and target vector. cumulative_count is the
# running total computed on the full frame; it is not restarted for this window.
X_filtered = filtered_data['created_date_ordinal'].values.reshape(-1, 1)
y_filtered = filtered_data['cumulative_count'].values

# Fit the linear regression model
model_filtered = LinearRegression()
model_filtered.fit(X_filtered, y_filtered)

# Predict values and calculate R^2 score (on the same data used for fitting)
y_pred_filtered = model_filtered.predict(X_filtered)
r2_filtered = r2_score(y_filtered, y_pred_filtered)
print(f"R^2 score of the linear regression model (2020-2022): {r2_filtered:.4f}")
print(f"The model's linear equation (2020-2022) is: cumulative_count = {model_filtered.coef_[0]:.4f} * created_date_ordinal + {model_filtered.intercept_:.4f}")

# Plot original data and regression line
plt.figure(figsize=(50, 50))
plt.plot(filtered_data['created_date'], filtered_data['cumulative_count'], 'o', alpha=0.75, label='Actual Data (2020-2022)')
plt.plot(filtered_data['created_date'], y_pred_filtered, color='red', label='Linear Fit (2020-2022)')
plt.xlabel("Date", fontsize=20)
plt.ylabel("Cumulative Number of Rat Sightings", fontsize=20)
plt.title("Cumulative Rat Sightings in NYC Over Time with Linear Fit (2020-2022)", fontsize=24)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend()
plt.show()

In [None]:
# Distinct incident ZIP codes, as the sorted array returned by np.unique
zipcodes = np.unique(rat_sighting['incident_zip'].values)

In [None]:
# Print the distinct ZIP codes for a visual check.
print(zipcodes)

In [None]:
# Number of distinct ZIP code values.
len(zipcodes)

In [None]:
# Rows with the ZIP code 12345 are invalid: the ZIP does not match the actual location given.
# List them for inspection.

rat_sighting[rat_sighting['incident_zip']== 12345]