## Rat Inspection Data Cleaning and EDA

This notebook is an initial study of the rat inspection data.

In [None]:
## Importing Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os
import glob

In [None]:
## Imports the rat inspection data from the split-up csv files and concatenates them into one dataframe called rat_insp.

path = r'data/split_up_rat_inspection_data'

# glob returns matches in arbitrary, filesystem-dependent order. Sorting keeps the
# row order of rat_insp (and anything index-based downstream) stable between runs.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Fail with a clear message instead of pd.concat's generic "No objects to concatenate".
if not all_files:
    raise FileNotFoundError(
        f"No csv files found in '{path}'. Check the notebook's working directory."
    )

rat_insp = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [None]:
# Peek at three randomly chosen rows to get a feel for the raw data.
display(rat_insp.sample(n=3))

# Show the column headers as they currently stand in the dataframe.
print("Below are the columns in the dataframe.\n")
display(rat_insp.columns)

In [None]:
# Normalize the column headers:
#   1. keep only the text before the first '(' (if any),
#   2. trim surrounding whitespace,
#   3. lowercase everything,
#   4. turn the remaining spaces into underscores.
rat_insp.columns = [
    header.split('(', 1)[0].strip().lower().replace(' ', '_')
    for header in rat_insp.columns
]


In [None]:
# Print a summary of the dataframe: column names, dtypes, and non-null counts.
rat_insp.info()

In [None]:
# boro_code and borough look like two encodings of the same information.
# Show the frequency table of each so the code <-> name mapping can be compared by eye.
display(
    rat_insp['boro_code'].value_counts(),
    rat_insp['borough'].value_counts(),
)

In [None]:
# boro_code 9 appears to stand for the 'Unspecified' borough.
# List the borough values found on the boro_code == 9 rows to verify that.
rat_insp.loc[rat_insp['boro_code'] == 9, 'borough'].value_counts()

In [None]:
# To be safe, force the borough label to 'Unspecified' on every row whose boro_code is 9.
rat_insp.loc[rat_insp['boro_code'].eq(9), 'borough'] = 'Unspecified'

In [None]:
# The borough column is the more descriptive of the two, so boro_code can go.
rat_insp.drop('boro_code', axis=1, inplace=True)

In [None]:
# Convert inspection_date to a real datetime column.
# Later cells use the .dt accessor and compare against date strings, so this
# conversion has to run; leaving it commented out makes those cells fail on a
# fresh kernel (Restart & Run All).
# errors='coerce' turns unparseable values into NaT instead of aborting the whole conversion.
# NOTE(review): if this is slow on the full dataset, pass an explicit format=
# matching the date strings in the csv files -- confirm the format against the raw data.
rat_insp['inspection_date'] = pd.to_datetime(rat_insp['inspection_date'], errors='coerce')

In [None]:
# location looks redundant with latitude/longitude.
# Show the three columns side by side for five random rows to compare them.
display(rat_insp.loc[:, ['location', 'latitude', 'longitude']].sample(n=5))

In [None]:
# Here, we drop the extra columns we probably do not need for our analysis.
# We can always add them back in later if we find that we need them.
columns_to_drop = [
    # job_ticket_or_work_order_id, job_id, and job_progress all describe the same job record.
    'job_ticket_or_work_order_id', 'job_id', 'job_progress',
    # x_coord, y_coord, community board, council district, and census tract are all location-related.
    'x_coord', 'y_coord', 'community_board', 'council_district', 'census_tract',
    # Address details; may be worth keeping depending on what we focus on.
    'house_number', 'street_name',
    # Same for block, lot, and nta.
    'block', 'lot', 'nta',
    # We probably won't be using bbl or bin for anything either.
    'bbl', 'bin',
]

# One non-in-place drop replaces five in-place ones. errors='ignore' skips columns
# that are already gone, so re-running this cell no longer raises a KeyError.
rat_insp = rat_insp.drop(columns=columns_to_drop, errors='ignore')




In [None]:
# Print the dataframe summary again to see which columns remain and their non-null counts.
rat_insp.info()

In [None]:
# Let's look at the "results" of the inspections.
# value_counts() lists each distinct result with its row count (missing values are excluded by default).

rat_insp['result'].value_counts()

# "Failed for Other R" seems to be irrelevant if we are focused on inspections involving rats.
# "Bait applied" could indicate that there were rats, but it could also indicate that there were just signs of rats.
# We will keep it for now and see if we can find more information about it later.

# It is not clear what "Stoppage Done" and "Cleanup Done" mean. We need to look into this more later as well.

# TODO: Clean up this column based on what we intend to do with the data later.

In [None]:
# TODO: Clean up the "inspection_type" column as well.
# Count the rows per inspection type to see whether there are any types of
# inspections that we might want to focus on or exclude.
rat_insp['inspection_type'].value_counts()

In [None]:
# Daily inspection counts, split by whether the inspection failed for rat activity.
# Requires inspection_date to be a datetime column (the .dt accessor is used below).

# Inspections whose result is exactly 'Failed for Rat Act', counted per calendar day.
failed_rat_act = rat_insp[rat_insp['result'] == 'Failed for Rat Act']
failedidate = failed_rat_act.groupby(failed_rat_act['inspection_date'].dt.date).size().reset_index(name='count')

# Every other inspection (rows with a missing result also land here), counted per calendar day.
notfail = rat_insp[rat_insp['result'] != 'Failed for Rat Act']
idate = notfail.groupby(notfail['inspection_date'].dt.date).size().reset_index(name='count')

plt.figure(figsize=(35,20))
plt.plot(idate['inspection_date'], idate['count'], 'o', color="b", alpha=0.50,
         label='Not failed for rat activity')
plt.plot(failedidate['inspection_date'], failedidate['count'], 'o', color="r", alpha=0.50,
         label='Failed due to rat activity')
plt.xlabel('Inspection Date')
plt.ylabel('Count of Inspections')
# The blue series is the non-failed subset, not all inspections; the title now says so
# and has its closing parenthesis.
plt.title('Count of Inspections Over Time (Blue = Not Failed for Rat Activity, Red = Failed due to Rat Activity)')
plt.legend()
plt.show()

In [None]:
# The plot above seems to have some strange data points.
# For example, we have an entry for 2045-08-28, which is in the future.
# We also have some very old data points from before the mid 2010s.

# Summary statistics of the inspection dates, to see the range they span.
rat_insp['inspection_date'].describe()

In [None]:
# Rows dated after 2026-02-13, i.e. in the future relative to the cut-off used for this dataset.
display(rat_insp[rat_insp['inspection_date'] > '2026-02-13'])

# Rows dated before 2010. The comparison must be '<': the previous '> 2009-01-01'
# selected nearly the whole dataset instead of the old records we want to inspect.
display(rat_insp[rat_insp['inspection_date'] < '2010-01-01'])

In [None]:
# Let's drop the data points that are in the future and before 2010,
# since they are likely to be errors and outliers.
# Rows whose inspection_date is missing (NaT) fail both comparisons and are dropped too.
# .copy() makes the result an independent dataframe, so later .loc assignments on
# rat_insp do not write to a slice of the unfiltered frame (SettingWithCopyWarning).
rat_insp = rat_insp[
    (rat_insp['inspection_date'] >= '2010-01-01')
    & (rat_insp['inspection_date'] <= '2026-02-13')
].copy()

In [None]:
# Daily inspection counts after the date filter: passed vs. failed for rat activity.
# Requires inspection_date to be a datetime column (the .dt accessor is used below).

# Inspections whose result is exactly 'Failed for Rat Act', counted per calendar day.
failed_rat_act = rat_insp[rat_insp['result'] == 'Failed for Rat Act']
failedidate = failed_rat_act.groupby(failed_rat_act['inspection_date'].dt.date).size().reset_index(name='count')

# Inspections whose result is exactly 'Passed', counted per calendar day.
passed = rat_insp[rat_insp['result'] == 'Passed']
passidate = passed.groupby(passed['inspection_date'].dt.date).size().reset_index(name='count')

plt.figure(figsize=(35,20))
# Plot passidate here. The earlier version plotted idate, a stale variable from a
# previous cell holding the unfiltered, non-failed counts, which contradicted the title.
plt.plot(passidate['inspection_date'], passidate['count'], 'o', color="b", alpha=0.50,
         label='Passed')
plt.plot(failedidate['inspection_date'], failedidate['count'], 'o', color="r", alpha=0.50,
         label='Failed due to rat activity')
plt.xlabel('Inspection Date')
plt.ylabel('Count of Inspections')
plt.title('Count of Inspections Over Time (Blue = Passed, Red = Failed due to Rat Activity)')
plt.legend()
plt.show()

In [None]:
# import plotly.figure_factory as ff


# # Add a dummy column to count each row
# rat_insp['dummy_count'] = 1

# fig = ff.create_hexbin_mapbox(
#     data_frame=rat_insp,
#     lat="latitude",
#     lon="longitude",
#     nx_hexagon=20,             # Number of hexagons in x direction
#     color="dummy_count",       # Sum of dummy_count = number of occurrences
#     agg_func=np.sum,           # Sum the dummy column
#     opacity=0.85,
#     labels={"color": "Number of Inspections"},
# )

# fig.update_layout(
#     mapbox_style="open-street-map",
#     margin=dict(b=0, t=0, l=0, r=0),
# )
# fig.show()



In [None]:
# The above map has points not in New York City.
# This is very weird and suggests that there are some errors in the latitude and longitude data.

# Let's check the latitude and longitude data to see if there are any obvious errors or outliers.
# display(rat_insp[['latitude', 'longitude']].describe())

In [None]:
# Let's look at the rows with the minimum and maximum latitude and longitude values to see if there are any obvious errors or outliers.
# display(rat_insp[rat_insp['latitude'] == rat_insp['latitude'].min()])
# display(rat_insp[rat_insp['latitude'] == rat_insp['latitude'].max()])
# display(rat_insp[rat_insp['longitude'] == rat_insp['longitude'].min()])
# display(rat_insp[rat_insp['longitude'] == rat_insp['longitude'].max()])

In [None]:
# Set latitude/longitude to NaN for rows whose coordinates are likely to be errors.
#
# Blanking whichever rows hold the current min/max is not idempotent: each re-run
# removes the next-most-extreme (and probably valid) rows, and it removes valid
# extremes even when no bad value exists. An explicit bounding box avoids both.
#
# NOTE(review): the bounds below are a padded, approximate box around New York City --
# confirm them against the describe() output before relying on this.
NYC_LAT_MIN, NYC_LAT_MAX = 40.4, 41.0
NYC_LON_MIN, NYC_LON_MAX = -74.3, -73.6

# between() is inclusive and evaluates to False for NaN, so rows with a missing
# coordinate are treated as outside the box and end up with both values set to NaN.
in_nyc = (
    rat_insp['latitude'].between(NYC_LAT_MIN, NYC_LAT_MAX)
    & rat_insp['longitude'].between(NYC_LON_MIN, NYC_LON_MAX)
)
rat_insp.loc[~in_nyc, ['latitude', 'longitude']] = np.nan