In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os 
import json
import holidays
import folium
import matplotlib.image as mpimg
from folium.plugins import TimeSliderChoropleth
import datetime as dt
from ipywidgets import interact, widgets
from scipy.stats import pearsonr
from folium.plugins import MarkerCluster



In [None]:
# Working directory of the running kernel; the data path is built relative to it.
notebooks = os.getcwd()
crime_csv_path = notebooks + '/../data/processed/sf_crime_cleaned.csv'
df_main = pd.read_csv(crime_csv_path)
df_main.info()

In [None]:
# Read the stored dtype specification (JSON) for the incident table.
dtype_json_path = notebooks + '/../data/processed/sf_incident_dtypes.json'
with open(dtype_json_path, 'r') as dtype_file:
    data_types = json.load(dtype_file)
data_types

In [None]:
# Cast the columns to the dtypes loaded from the JSON file, then re-check the schema.
df_main = df_main.astype(data_types)
df_main.info()


In [None]:
# Preview the first rows of the typed frame.
df_main.head()

### Let's check how the crime rate changes over time

In [None]:
# Build one timestamp per incident from the date column and the time-of-day text.
incident_day_text = df_main['incident_date'].dt.strftime("%Y-%m-%d")
timestamp_text = incident_day_text + " " + df_main['incident_time']
df_main['date_time'] = pd.to_datetime(timestamp_text)


In [None]:
# Index the frame by the incident timestamp.
df_main = df_main.set_index('date_time')

In [None]:
# Confirm that date_time is now the index.
df_main.head(3)

In [None]:
# Incidents per year. A Series passed on its own is plotted with its index (the year) as bar labels.
# The Axes is kept under a real name: assigning to `_` would shadow IPython's "last output" variable.
year_ax = sns.barplot(df_main.incident_year.value_counts())
year_ax.set_title("Incident count for Years (2018-2024)")
# Text-less arrow drawn across the bars to highlight the overall trend.
year_ax.annotate(
    '',                     # no label, only the arrow is drawn
    xy=(0, 97000),          # one end of the arrow (x is the bar position, y is in count units)
    xytext=(5, 86000),      # other end of the arrow
    arrowprops=dict(color='green', arrowstyle='<-'),  # arrow properties
    color='red',            # text color; has no visible effect while the text is empty
)

plt.show()

## We can see the effect of the Covid-19 lockdown in 2020 and 2021.

# We know that Covid-19 had a great impact on crime in 2020 and at the beginning of 2021. Apart from that, what we see is a decrease in total crimes from year to year.

In [None]:
# Incident totals per category, most frequent first.
df_cat = (
    df_main['category']
    .value_counts()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)
df_cat.columns = ['category', 'count']

plt.figure(figsize=(10, 8))
# `order` pins the bars to the count-sorted sequence of the frame.
sns.barplot(data=df_cat, x='category', y='count', order=df_cat['category'])
plt.xticks(rotation=90)
plt.title('Total counts for each Type of Crime From 2018 to 2024')
plt.show()

## Clearly, Larceny Theft is the number one crime in San Francisco

## Let's continue with the top 5 crimes

In [None]:
# Drop the temporary category-count frame from the namespace.
del df_cat

In [None]:
# Names of the five categories with the most incidents.
category_totals = df_main.value_counts(subset='category').sort_values(ascending=False)
top5_cat = category_totals.iloc[:5].reset_index()['category'].tolist()
top5_cat

In [None]:
fig, ax = plt.subplots()
# Overlay one bar series per top category on the same axes.
for cat in top5_cat:
    yearly_counts = df_main.loc[df_main.category == cat, 'incident_year'].value_counts()
    sns.barplot(yearly_counts, label=cat, ax=ax)
ax.legend()
ax.set_title('Yearly Incident Counts for Top 5 Crime Categories')
plt.show()
    

# It seems that while a significant number of assaults were reported before 2020, there were fewer reports after 2020. Conversely, motor vehicle thefts increased after 2021.

## What is the average number of 'Larceny Theft' incidents per day?

In [None]:
# Incidents per (date, category) pair.
daily_counts = (
    df_main
    .groupby(['incident_date', 'category'])
    .size()
    .reset_index(name='count')
)

# Mean of those daily counts for every category, largest first.
daily_avg = (
    daily_counts
    .groupby('category')['count']
    .mean()
    .reset_index(name='daily_average')
    .sort_values('daily_average', ascending=False)
)
daily_avg


## What is the distribution from Monday to Sunday?

In [None]:
# Weekday name of every distinct calendar date in the data.
df_weekday = df_main.incident_date.unique().strftime("%A")
# How many distinct dates fall on each weekday.
weekday_names, weekday_totals = np.unique(df_weekday, return_counts=True)
df_days = pd.DataFrame({"incident_day": weekday_names, "day_count": weekday_totals})

In [None]:
# Incident totals per (weekday, category) pair.
df_day_cat = (
    df_main
    .groupby('incident_day')['category']
    .value_counts()
    .reset_index()
)
# Attach the number of calendar days behind each weekday and normalise the totals by it.
df_merged = pd.merge(df_day_cat, df_days, how='left', on="incident_day")
df_merged['daily_avg'] = df_merged['count'].div(df_merged['day_count'])
df_merged.head()

In [None]:
# Per-weekday daily average of the five most frequent categories, overlaid on one axes.
# Assumes incident_day holds full English weekday names, as the merge with strftime("%A") implies.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

plt.figure(figsize=(10, 10))
for cat in top5_cat:
    sns.barplot(
        x='incident_day',
        y='daily_avg',
        data=df_merged[df_merged.category == cat],
        order=weekday_order,  # calendar order instead of whatever order the data happens to have
        label=cat,
    )
plt.title("Daily Average of 5 Top Crime")
# Without an explicit legend the `label` of each series is never shown.
plt.legend()
plt.show()

### The daily averages show that more crime happens on Fridays and Saturdays. We will check whether that difference is statistically significant.

In [None]:
# Split the incidents into a "weekend" bucket (Friday and Saturday) and all remaining days.
weekend_mask = df_main.incident_date.dt.strftime('%A').isin(['Friday', 'Saturday'])
location_cols = ['incident_date', 'neighborhood', 'latitude', 'longitude', 'category', 'subcategory']
df_weekdays = df_main.loc[~weekend_mask, location_cols]
df_weekends = df_main.loc[weekend_mask, location_cols]

In [None]:
# "Weekend" here means Friday and Saturday; every other day counts as a weekday.
weekend_mask = df_main.incident_date.dt.strftime('%A').isin(['Friday', 'Saturday'])
location_cols = ['incident_date', 'neighborhood', 'latitude', 'longitude', 'category', 'subcategory']
df_weekdays = df_main.loc[~weekend_mask, location_cols]
df_weekends = df_main.loc[weekend_mask, location_cols]

# Share of all incidents that falls on one average weekend day vs. one average weekday.
# len() counts rows. DataFrame.size is rows * columns, and the subsets have fewer columns
# than df_main, so using .size does not give a share of incidents.
prob_weekend = (len(df_weekends) / 2) / len(df_main)  # 2 weekend days (Fri, Sat)
prob_weekday = (len(df_weekdays) / 5) / len(df_main)  # 5 remaining days
print(f'Weekend: {prob_weekend}')
print(f'Weekday: {prob_weekday}')

## When we compare weekends to weekdays, we see a slightly higher probability of crime on weekends. Is that statistically significant?
## Let's find out ...

In [None]:
# Null hypothesis: there is no difference between weekdays and weekends.
# It is checked with a resampling test: 100 days are sampled from the data, 1000 times.
# Seed NumPy's global RNG so the resampling is reproducible. This must be a call:
# `np.random.seed = 42` would replace the function with an int and seed nothing.
np.random.seed(42)

# Total number of incident reports for each calendar day.
df_daily = df_main.groupby('incident_date').size().reset_index()
df_daily.columns = ['incident_date', 'incident_count']

# Flag Friday and Saturday, the "weekend" bucket used in this analysis.
df_daily['is_weekend'] = df_daily.incident_date.dt.strftime('%A').isin(['Friday', 'Saturday'])
df_daily.head()

In [None]:
# Sampling distribution of the mean daily count: 1000 draws of 100 random days each.
# NOTE(review): the observed weekend mean is taken over all weekend days, not over 100 days,
# so the spread of this distribution does not match the statistic it is compared with - confirm intent.
daily_mean_sample_list = [
    df_daily.sample(100)['incident_count'].mean()
    for _ in range(1000)
]
percentiles = np.percentile(daily_mean_sample_list, [2.5, 97.5])
overall_daily_mean = df_daily['incident_count'].mean()
observed_weekend_average = df_daily.loc[df_daily['is_weekend'], 'incident_count'].mean()

plt.figure(figsize=(10, 6))
plt.hist(daily_mean_sample_list, bins=30, color='orange', alpha=0.7)
plt.axvline(x=overall_daily_mean, color='blue', linestyle='--', label='Daily Average')
plt.axvline(x=percentiles[0], color='red', linestyle='--', label='2.5th Percentile')
plt.axvline(x=percentiles[1], color='red', linestyle='--', label='97.5th Percentile')
plt.axvline(x=observed_weekend_average, color='green', linestyle='-', label='Weekend Average')
plt.legend()
plt.xlabel('Incident Count')
plt.ylabel('Frequency')
plt.title('Permutation Test - Incident Count Distribution')
plt.grid(True)
plt.show()

# Observing the weekend average under the null hypothesis is very unlikely. For that reason we can reject the null hypothesis.
## Let's calculate the p-value (the selected threshold is 0.05).

In [None]:
# Share of resampled means that are at least as large as the observed weekend mean.
resampled_means = np.asarray(daily_mean_sample_list)
p_value = (resampled_means >= observed_weekend_average).mean()
print(f"p-value: {p_value}")

## Let's look at the daily total crime counts for each year

In [None]:
df_daily['year'] = df_daily.incident_date.dt.year

fig, ax = plt.subplots(2, 3, figsize=(20, 10))
ax = ax.flatten()
# One panel per year; range(2018, 2024) covers 2018 through 2023.
for panel_idx, (panel, year) in enumerate(zip(ax, range(2018, 2024))):
    year_rows = df_daily.loc[df_daily['year'] == year]
    panel.scatter(
        year_rows['incident_date'],
        year_rows['incident_count'],
        c=year_rows['incident_count'],
        cmap='viridis_r',
    )
    panel.set_title(f"Daily incident counts for {year}")
    panel.set_xlabel("Date")
    # Only the left-most panel of each row gets a y label.
    if panel_idx % 3 == 0:
        panel.set_ylabel("Total Crime Amount")

plt.tight_layout()
plt.show()

# New Year's Day consistently has significantly higher crime than the average.

### In 2019, 2022 and 2023 there is one day with more crime than any other day. Let's find out which days those are.

In [None]:
# For each year, print the first day with more than 400 incidents together with its weekday.
# Date and weekday come from the same row; previously the weekday for 2022 and 2023 was read
# from the 2019 query.
for peak_year in (2019, 2022, 2023):
    peak_dates = df_daily.query('(year == @peak_year) & (incident_count > 400)')['incident_date']
    first_peak = peak_dates.iloc[0]
    print(first_peak.strftime("%y-%m-%d"), first_peak.strftime('%A'))


# What do these days have in common? 
# They are Pride Days! It looks like there is a dramatic increase in crime reports on those days.

In [None]:
# Let's see which crimes increase on Pride days specifically.

def _category_counts_on(day_text):
    """Return per-category incident counts for the calendar day given as 'YYYY-MM-DD'."""
    same_day = df_main['incident_date'].dt.strftime("%Y-%m-%d") == day_text
    return df_main.loc[same_day, 'category'].value_counts().reset_index()


df_2019_pride = _category_counts_on('2019-06-30')
df_2022_pride = _category_counts_on('2022-06-26')
df_2023_pride = _category_counts_on('2023-06-25')

# Inner-join the overall daily averages with the three Pride-day counts, one after another.
pride_merged = daily_avg
for pride_counts in (df_2019_pride, df_2022_pride, df_2023_pride):
    pride_merged = pd.merge(pride_merged, pride_counts, on='category', how='inner')
# The merges leave suffixed count columns; rename all columns by position.
pride_merged.columns = ['category', 'daily_average', 'count_2019', 'count_2022', 'count_2023']

pride_merged['pride_average'] = pride_merged[['count_2019', 'count_2022', 'count_2023']].mean(axis=1)
pride_merged['difference'] = pride_merged['pride_average'] - pride_merged['daily_average']
pride_merged = pride_merged.sort_values(by='difference', ascending=False)
pride_merged.category = pride_merged.category.astype(str)
pride_merged


In [None]:
# Long format: one row per (category, kind of average) so seaborn can draw grouped bars.
prd_merged_long = pride_merged.melt(
    id_vars='category',
    value_vars=['daily_average', 'pride_average'],
    var_name='averages',
    value_name='value',
)

plt.figure(figsize=(20, 10))
sns.barplot(data=prd_merged_long, x='category', y='value', hue='averages')
plt.xticks(rotation=90)
plt.title('Crime Amount Comparison Pride Day vs Other Days')
plt.show()

# On average pride days Larceny Theft shows significant increase. From 108.6 to 291 on average.

In [None]:
# Remove the intermediate frames of the weekday and Pride analyses from the namespace.
del (
    df_merged,
    df_days,
    df_day_cat,
    df_weekday,
    df_2019_pride,
    df_2022_pride,
    df_2023_pride,
    daily_avg,
)


## Let's check the monthly distributions of the top 5 crimes

In [None]:
def monthly_distribution(years=(2018, 2024), df=None, super_title="Title"):
    """Draw one bar chart of monthly incident counts per year, three charts per row.

    Parameters
    ----------
    years : tuple of (int, int)
        First and last year to plot, both inclusive.
    df : pandas.DataFrame, optional
        Incident rows with a datetime index and an ``incident_date`` datetime column.
        Defaults to the global ``df_main`` as it is at call time.
    super_title : str
        Title placed above the whole grid.

    Notes
    -----
    Switches the global matplotlib style to 'ggplot' as a side effect.
    """
    if df is None:
        # Resolved at call time so the default follows later re-assignments of df_main.
        df = df_main

    # Number of 3-wide rows needed for the inclusive year range.
    n_years = years[1] - years[0] + 1
    n_rows = ((years[1] - years[0]) // 3) + 1
    plt.style.use('ggplot')
    fig, ax = plt.subplots(n_rows, 3, figsize=(20, 5 * n_rows))
    ax = ax.flatten()

    for i, year in enumerate(range(years[0], years[1] + 1)):
        # Rows whose index timestamp lies inside the calendar year.
        in_year = (df.index >= pd.Timestamp(str(year))) & (df.index < pd.Timestamp(str(year + 1)))
        df_year = df.loc[in_year].copy()
        ax[i].set_title(f"{year}")
        if df_year.empty:
            # Nothing to draw for this year; keep the titled, empty panel.
            continue

        df_year['year_month_text'] = df_year.incident_date.dt.strftime('%Y-%m')
        # Keep the month as a regular column so seaborn can reference it by name.
        monthly_counts = df_year.groupby("year_month_text").size().reset_index(name="count")

        sns.barplot(x='year_month_text', y='count', data=monthly_counts, ax=ax[i], hue='count')
        ax[i].tick_params(axis='x', rotation=90)
        # The hue legend only repeats the bar heights; drop it if seaborn created one.
        hue_legend = ax[i].get_legend()
        if hue_legend is not None:
            hue_legend.remove()

    # Hide the unused panels of the last row.
    for j in range(n_years, len(ax)):
        ax[j].set_visible(False)
    plt.suptitle(super_title, y=1)
    plt.tight_layout()

    plt.show()

# Example usage
monthly_distribution(years=(2018, 2024), df=df_main, super_title='Monthly Crime Counts per Year')

In [None]:
# Same monthly grid, restricted to the 'Larceny Theft' category.
monthly_distribution(years=(2018, 2024), df=df_main[df_main.category == 'Larceny Theft'], super_title="Monthly Larceny Theft Counts per Year")

In [None]:
# Same monthly grid, restricted to the 'Robbery' category.
monthly_distribution(years=(2018, 2024), df=df_main[df_main.category == 'Robbery'], super_title="Monthly Robbery Counts per Year")

## Let's add a categorical `is_holiday` column that tells whether the date is a US holiday, and a `holiday_type` column that identifies which holiday it is

In [None]:
# US holiday calendar from the `holidays` package.
us_holidays = holidays.US()
incident_dates = df_main.incident_date
# True where the incident date is in the holiday calendar.
df_main['is_holiday'] = [day in us_holidays for day in incident_dates]
# Holiday name for the date, or None when the date is not a holiday.
df_main['holiday_type'] = np.array([us_holidays.get(day) for day in incident_dates])
    

In [None]:
# List the distinct subcategory values; these are the names accepted by the helpers below.
print(df_main.subcategory.unique().tolist())

In [None]:
def yearly_dist(year=2022, df=None, cat='Theft From Vehicle'):
    """Plot the daily incident counts of one subcategory for one year.

    Each day is drawn as a point labelled with its holiday name, or with 'Weekend'
    (Friday/Saturday) or 'Weekday' when it is not a holiday. A line connects the days
    and a horizontal line marks the mean daily count.

    Parameters
    ----------
    year : int
        Calendar year to plot.
    df : pandas.DataFrame, optional
        Incident rows with ``incident_date`` and ``subcategory`` columns.
        Defaults to the global ``df_main`` as it is at call time.
    cat : str
        Value of the ``subcategory`` column to plot.
    """
    if df is None:
        df = df_main

    # Build the mask from the frame that was passed in, not from the global df_main.
    in_scope = (df.incident_date.dt.year == year) & (df.subcategory == cat)
    df_year = df.loc[in_scope].groupby('incident_date').size().reset_index(name='count')

    # Holiday name where there is one; otherwise fall back to Weekend/Weekday.
    df_year['holiday_type'] = np.array([us_holidays.get(day) for day in df_year.incident_date])
    is_weekend = df_year['incident_date'].dt.strftime('%A').isin(['Friday', 'Saturday'])
    day_kind = is_weekend.map({True: 'Weekend', False: 'Weekday'})
    df_year['holiday_type'] = df_year['holiday_type'].fillna(day_kind)

    plt.figure(figsize=(25, 13))
    sns.scatterplot(x='incident_date', y='count', hue="holiday_type", data=df_year, palette='bright', size='holiday_type')
    sns.lineplot(x='incident_date', y='count', data=df_year)
    plt.axhline(df_year['count'].mean(), label='Average', color='blue')

    plt.ylabel('count')
    plt.title(f"Daily {cat} incident counts for {year}")
    plt.show()


In [None]:
# Residential burglaries per day, using the function's default year and data frame.
yearly_dist(cat= 'Burglary - Residential')

## The horizontal line shows the average daily crime count for that year. My hypothesis is that holidays and the following couple of days have more residential burglaries.
 

In [None]:
# Daily residential-burglary counts, computed once. Only the resampling needs to be repeated;
# filtering and grouping inside the comprehension would redo the same work 1000 times.
burglary_daily_counts = (
    df_main[df_main['subcategory'] == 'Burglary - Residential']
    .groupby('incident_date')
    .size()
)
# Sampling distribution of the mean daily count: 1000 draws of 100 random days each.
sample_mean_list = [burglary_daily_counts.sample(100).mean() for _ in range(1000)]


# Let's find out the average on holidays and on the 3 days that follow them

In [None]:
# Daily residential-burglary counts with a holiday flag.
df_burglary = (
    df_main[df_main['subcategory'] == 'Burglary - Residential']
    .groupby('incident_date')
    .size()
    .reset_index(name='count')
)
df_burglary['is_holiday'] = df_burglary.incident_date.apply(lambda x: x in (us_holidays))

# Extend the flag to the 3 calendar days after each holiday. Matching on dates (rather than on
# the next 3 rows) stays correct when a date is missing from the frame, also covers holidays
# that fall inside another holiday's window, and does not skip the last rows of the frame.
holiday_dates = df_burglary.loc[df_burglary['is_holiday'], 'incident_date']
holiday_window = pd.concat([holiday_dates + pd.Timedelta(days=offset) for offset in range(4)])
df_burglary['is_holiday'] = df_burglary['incident_date'].isin(holiday_window)

# Mean daily count over holidays and the days that follow them.
observed = np.mean(df_burglary[df_burglary['is_holiday']]['count'])
percentiles = np.percentile(sample_mean_list, [2.5, 97.5])

plt.figure(figsize=(20, 10))
sns.histplot(sample_mean_list, stat='probability', bins=30, color='orange')
plt.axvline(x=percentiles[0], color='red', linestyle='--', label='2.5th Percentile')
plt.axvline(x=percentiles[1], color='red', linestyle='--', label='97.5th Percentile')
plt.axvline(x=observed, c='green', label='Observed Mean on Holidays')
plt.legend()
plt.title('Daily Average Burglary-Residential Distribution')
plt.xlabel('Burglary - Residential')
plt.show()

# Since the observed value lies inside the confidence interval, we cannot reject the null hypothesis. To make it official, let's get the p-value (the pre-selected threshold is 0.05).

In [None]:
# Share of resampled means that are at least as large as the observed holiday mean.
# The list is converted explicitly so the comparison is element-wise whatever type `observed` has.
p_value = np.mean(np.asarray(sample_mean_list) >= observed)
# Report the comparison that actually holds instead of a fixed sentence.
verdict = "larger than" if p_value > 0.05 else "not larger than"
print(f"P-value: {p_value} \nP-value is {verdict} 0.05")

## Holidays have no significant impact on residential burglary!

# Lets create a table to keep only daily constant details

In [None]:

# One row per calendar date (the first incident of each date), reduced to the
# columns that are selected below.
df_eachday = (
    df_main
    .drop_duplicates(subset='incident_date', keep='first')
    .loc[:, ['incident_date', 'is_holiday', 'holiday_type', 'precipitation']]
    .reset_index(drop=True)
)
df_eachday.sample(20)

In [None]:
# Rows of the five most frequent categories, with every column cast to text.
top5_rows = df_main.loc[df_main['category'].isin(top5_cat)].astype(str)
# Count per (date, category), then spread the categories into columns.
top5_daily = top5_rows.groupby('incident_date')['category'].value_counts().reset_index()
df_top5 = top5_daily.pivot_table(index='incident_date', columns='category', values='count').reset_index()
# The date was turned into text by astype(str); parse it back.
df_top5['incident_date'] = pd.to_datetime(df_top5['incident_date'], format="%Y-%m-%d")
df_top5.head()

In [None]:
# Attach the per-category daily counts to the per-day table (all days are kept).
df_day_merged = pd.merge(df_eachday, df_top5, how='left', on='incident_date')
df_day_merged.head()

In [None]:
# Only days with precipitation of at most 2 are plotted.
low_rain_days = df_day_merged.loc[df_day_merged['precipitation'] <= 2]
sns.scatterplot(data=low_rain_days, x='precipitation', y='Larceny Theft', hue='Larceny Theft')
plt.title('Impact of Precipitation on Larceny Theft Incidents')
plt.show()

In [None]:
# Daily counts of the five most frequent subcategories, one column per subcategory.
top5_sub = df_main.subcategory.value_counts()[:5].reset_index()['subcategory'].tolist()
df_top5 = df_main[df_main['subcategory'].isin(top5_sub)].astype(str)
df_top5 = df_top5.groupby('incident_date')['subcategory'].value_counts().reset_index()
df_top5 = df_top5.pivot_table(values='count', index='incident_date', columns='subcategory').reset_index()
# astype(str) turned the date into text; parse it back before merging.
df_top5['incident_date'] = pd.to_datetime(df_top5['incident_date'], format="%Y-%m-%d")
df_day_merged = df_eachday.merge(df_top5, on='incident_date', how='left')

plt.figure(figsize=(25, 10))
for sub in top5_sub:
    plt.scatter(df_day_merged['precipitation'], df_day_merged[sub], alpha=0.7, label=sub)
# One title for the whole figure. Setting it inside the loop left only the last
# subcategory's name on a plot that shows all five.
plt.title('Impact of Precipitation on Top 5 Subcategories')
plt.xlabel("Rain Level")
plt.ylabel("Incident Count")
plt.legend()
plt.show()

# My hypothesis is that rain and Larceny - From Vehicle have a negative correlation. This is not clearly visible in the plot. To check if this relationship is statistically significant, I will perform a hypothesis test. Let's start by comparing the mean number of reported incidents on rainy versus non-rainy days.

In [None]:
# Split the days at a precipitation of 0.3 and compare the mean daily counts.
vehicle_larceny = df_day_merged['Larceny - From Vehicle']
is_rainy = df_day_merged['precipitation'] >= 0.3
is_dry = df_day_merged['precipitation'] < 0.3
mean_no_rain = vehicle_larceny[is_dry].mean()
mean_rain = vehicle_larceny[is_rainy].mean()
print('Average Larceny from vehicle on a sunny day is', mean_no_rain)
print('Average Larceny from vehicle on a rainy day is', mean_rain)

# The difference between sunny and rainy days is very small; it looks like rain does not stop people from going out and stealing from cars.

## Lets check if there is a location pattern for certain types of crime 

In [None]:
def neighborhood_crime_bar(data=None, year=None, category='Larceny - From Vehicle'):
    """Bar-plot the number of incidents of one subcategory per neighborhood.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Incident rows with ``neighborhood``, ``subcategory`` and ``incident_date`` columns.
        Defaults to the global ``df_main`` as it is at call time.
    year : int, optional
        When given, only incidents whose ``incident_date`` falls in that year are counted.
        ``None`` (the default) counts all years.
    category : str
        Value of the ``subcategory`` column to count.
    """
    if data is None:
        data = df_main

    # Filter first, then count: only the requested subcategory needs to be aggregated.
    in_scope = data['subcategory'] == category
    if year is not None:
        in_scope = in_scope & (data['incident_date'].dt.year == year)
    df_sub_neig = (
        data.loc[in_scope]
        .groupby('neighborhood')
        .size()
        .reset_index(name='count')
        .sort_values(by='count', ascending=False)
    )

    title = f'{category} Count for Neighborhoods'
    if year is not None:
        title += f' ({year})'

    plt.figure(figsize=(20, 10))
    sns.barplot(
        x=df_sub_neig['neighborhood'],
        y=df_sub_neig['count'],
        hue=df_sub_neig['neighborhood'],
        order=df_sub_neig['neighborhood'].tolist(),  # keep the count-sorted order on the axis
    )
    plt.xticks(rotation=90)
    plt.title(title)
    plt.show()

In [None]:
# Residential burglaries per neighborhood.
neighborhood_crime_bar(category='Burglary - Residential')

In [None]:
# Pickpocketing incidents per neighborhood.
neighborhood_crime_bar(category='Larceny Theft - Pickpocket')

### Let's create a function that calculates and visualizes the ratio of a specific crime subcategory in each neighborhood

In [None]:
def crime_ratio_by_neighborhood(category='Prostitution'):
    """Bar-plot, per neighborhood, the share of its incidents that belong to ``category``.

    ``category`` is compared against the ``subcategory`` column of the global ``df_main``.
    Rows without a neighborhood are left out of the computation.
    """
    # One boolean per incident; its mean within a neighborhood is the wanted ratio.
    # A single grouped pass replaces one full-frame filter per neighborhood.
    flags = pd.DataFrame({
        'neighborhood': df_main['neighborhood'].to_numpy(),
        'is_target': (df_main['subcategory'] == category).to_numpy(),
    })
    ratios = (
        flags.groupby('neighborhood')['is_target']
        .mean()
        .sort_values(ascending=False)
    )
    neighborhood_list = ratios.index.tolist()
    ratio_list = ratios.tolist()

    plt.style.use('ggplot')
    plt.figure(figsize=(20, 10))
    sns.barplot(x=neighborhood_list, y=ratio_list, hue=ratio_list)
    plt.xticks(rotation=90)
    plt.title(f"{category} ratio for Neighborhoods")
    plt.xlabel('Neighborhood')
    plt.ylabel('Density')
    plt.show()

In [None]:
# Share of residential burglaries among all incidents of each neighborhood.
crime_ratio_by_neighborhood(category='Burglary - Residential')

## How to read the plot above?  
### The crime category 'Burglary - Residential' comprises around 8.5% of all crimes in Noe Valley.

### Let's find out the most common crime in each neighborhood

In [None]:
# Subcategory counts inside every neighborhood; the first row kept per neighborhood
# is treated as its most frequent subcategory.
subcategory_counts = df_main.groupby('neighborhood')['subcategory'].value_counts().reset_index()
df_neighborhood = subcategory_counts.drop_duplicates(subset='neighborhood', keep='first')
df_neighborhood['subcategory'] = df_neighborhood['subcategory'].astype(str)

plt.figure(figsize=(20, 10))
sns.barplot(data=df_neighborhood, x='neighborhood', y='count', hue='subcategory')
plt.xticks(rotation=90)
plt.title("Most Common Crimes for Neighborhoods")
plt.show()


# It looks like Larceny - From Vehicle is the number one crime almost all over the city. Which crimes would appear if we filtered out 'Larceny - From Vehicle'?

In [None]:
# Same ranking as before, after leaving out 'Larceny - From Vehicle'.
other_rows = df_main.loc[df_main['subcategory'] != 'Larceny - From Vehicle']
subcategory_counts = other_rows.groupby('neighborhood')['subcategory'].value_counts().reset_index()
# The first row kept per neighborhood is treated as its most frequent remaining subcategory.
df_neighborhood = subcategory_counts.drop_duplicates(subset='neighborhood', keep='first')
df_neighborhood['subcategory'] = df_neighborhood['subcategory'].astype(str)

plt.figure(figsize=(20, 10))
sns.barplot(data=df_neighborhood, x='neighborhood', y='count', hue='subcategory')
plt.xticks(rotation=90)
plt.title("Most Common Crimes for Neighborhoods (Larceny - From Vehicle filtered)")
plt.show()

In [None]:
# Incidents of the subcategory shown on the map below.
df_sub = df_main[df_main['subcategory'] == 'Larceny - From Vehicle']
day = pd.Timestamp('2018-01-01')
# Neighborhood map used as the plot background; the path is relative to the working directory.
background_image = mpimg.imread('./SanFranciscoNeighborhoods.jpg')


def plot_map(day):
    """Scatter the 'Larceny - From Vehicle' incidents of one day over the neighborhood map.

    Parameters
    ----------
    day : date-like or None
        Day to show; anything ``pd.Timestamp`` accepts. ``None`` (a DatePicker with no
        date selected) draws nothing.

    Reads the globals ``df_sub`` (incident rows) and ``background_image``.
    Points are colored by the hour taken from the frame's datetime index.
    """
    if day is None:
        # No date selected in the widget: there is nothing to plot.
        return

    # Longitude/latitude box the background image is stretched over.
    lon_min, lon_max, lat_min, lat_max = -122.52, -122.35, 37.68, 37.83

    df_sub_date = df_sub[df_sub['incident_date'] == pd.Timestamp(day)]
    df_sub_date = df_sub_date.sort_values(by='incident_time')

    plt.figure(figsize=(10, 10))

    # Display the background image
    plt.imshow(background_image, extent=[lon_min, lon_max, lat_min, lat_max], aspect='auto')
    # Plot the scatterplot on top of the background
    sns.scatterplot(y='latitude', x='longitude', data=df_sub_date, s=100, alpha=0.8, hue=df_sub_date.index.hour, palette='OrRd')
    # Adjust axes limits to match the map
    plt.xlim([lon_min, lon_max])
    plt.ylim([lat_min, lat_max])
    plt.title('Larceny - From Vehicle on Map')
    # Display the plot
    plt.show()


# Initial date shown in the picker.
start_day = pd.Timestamp('2018-01-01')
end_day = pd.Timestamp('2024-05-01') 

# Date picker that supplies the `day` argument of plot_map.
day_slider = widgets.DatePicker(
    description='Select Date',
    value=start_day,
    disabled=False
)

# Display the interactive plot; plot_map is re-run whenever the selected date changes.
interact(plot_map, day=day_slider)


# Let's find out where in the city the center (mean location) of certain crimes lies.

In [None]:
def plot_weight_center_for_crime(ax, crime_sub="Larceny Theft - From Building"):
    """Mark the mean latitude/longitude of one crime category on ``ax``.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw the marker on.
    crime_sub : str
        Value matched against the ``category`` column of the global ``df_main``;
        it is also used as the legend label.
        NOTE(review): the default looks like a subcategory name although the filter
        uses the ``category`` column - confirm which column is intended.
    """
    coords = df_main.loc[df_main['category'] == crime_sub, ['latitude', 'longitude']]
    # Column-wise mean; DataFrame.mean avoids the deprecated `.agg(np.mean)` callable form.
    center = coords.mean()
    ax.scatter(
        [center['longitude']],
        [center['latitude']],
        marker='o',
        alpha=1,
        label=crime_sub,
        s=100,
        edgecolors='white',
        linewidth=3,
    )

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
# Neighborhood map as the background, stretched over the given longitude/latitude box.
ax.imshow(background_image, extent=[-122.52, -122.35, 37.68, 37.83], aspect='auto')
# One marker per top category.
for cat in top5_cat:
    plot_weight_center_for_crime(ax, cat)
ax.legend()
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title("The center of Top 5 crimes")
plt.show()


In [None]:
def find_on_map_with_subcategory(cat_type):
    """Build a folium map with one clustered marker per incident of a subcategory.

    Parameters
    ----------
    cat_type : str
        Value of the ``subcategory`` column of the global ``df_main`` to show.

    Returns
    -------
    folium.Map
        Map centered on [37.7749, -122.4194]; every marker's popup shows the incident date.
        Incidents without coordinates are skipped.
    """
    sf_map = folium.Map(location=[37.7749, -122.4194], zoom_start=12)
    marker_cluster = MarkerCluster().add_to(sf_map)

    incidents = (
        df_main.loc[df_main['subcategory'] == cat_type, ['latitude', 'longitude', 'incident_date']]
        # A marker cannot be placed at a NaN location, so rows without coordinates are dropped.
        .dropna(subset=['latitude', 'longitude'])
    )
    # Iterating the three columns directly avoids building a Series per row (iterrows).
    for lat, lon, when in zip(incidents['latitude'], incidents['longitude'], incidents['incident_date']):
        folium.Marker(
            location=[lat, lon],
            popup=when.strftime('%B %d,%Y')
        ).add_to(marker_cluster)
    return sf_map

sf_map = find_on_map_with_subcategory('Larceny - From Vehicle')
sf_map.save('larceny_from_vehicle_on_map.html')

In [None]:
# Render the folium map inline in the notebook.
display(sf_map)

# Let's visualize the hourly distribution of crimes to identify any potential clusters at specific times of the day.

In [None]:
df_main.info()
# Hour of day: the first two characters of the time text, as an integer.
df_main['incident_hour'] = df_main['incident_time'].str[:2].astype('int64')

In [None]:
fig, ax = plt.subplots(figsize=(25, 10))
# Stacked histogram of the incident hour, one color per category.
sns.histplot(data=df_main, x='incident_hour', hue='category', multiple='stack', bins=24, ax=ax)
plt.xticks(rotation=45)
ax.set_title('Hourly Distribution of Crime')
plt.show()

# Let's create a function that draws the same plot for a single crime category

In [None]:
def dist_by_time(cat='Assault'):
    """Plot a stacked hourly histogram of one category, split by subcategory.

    ``cat`` is matched against the ``category`` column of the global ``df_main``,
    which must already contain the ``incident_hour`` column.
    """
    category_rows = df_main.loc[df_main['category'] == cat].copy()
    # Cast the subcategory to plain text before it is used as hue.
    category_rows['subcategory'] = category_rows['subcategory'].astype(str)

    fig, hist_ax = plt.subplots(figsize=(25, 10))
    sns.histplot(
        data=category_rows,
        x='incident_hour',
        hue='subcategory',
        multiple='stack',
        bins=24,
        ax=hist_ax,
    )
    plt.xticks(rotation=45)
    hist_ax.set_title(f'Hourly Distribution of {cat}')
    plt.show()

In [None]:
# Hourly distribution for the default category ('Assault').
dist_by_time()

In [None]:
# Hourly distribution for 'Larceny Theft'.
dist_by_time('Larceny Theft')

In [None]:
fig, ax = plt.subplots(figsize=(25, 8))
# Spread of the incident hour within every category.
sns.boxplot(data=df_main, x='category', y='incident_hour', ax=ax)
plt.xticks(rotation=90)
ax.set_title("Hourly Distribution of each Crime Category")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(25, 8))
# Spread of the incident hour within every subcategory, one color per subcategory.
sns.boxplot(data=df_main, x='subcategory', y='incident_hour', hue='subcategory', ax=ax)
plt.xticks(rotation=90)
ax.set_title("Hourly Distribution of each Crime Subcategory")
plt.show()

# Let's look at the correlation between the max and min temperature to decide whether we need to keep both

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))
# Regression of max on min temperature over every incident row.
# NOTE(review): the frame has one row per incident, so each day's temperatures are
# repeated once per incident - confirm this weighting is intended.
sns.regplot(data=df_main, x='min_temperature', y='max_temperature', ax=ax)
# Mid-point of the min and max temperature, drawn against itself.
average_temp = df_main['min_temperature'].add(df_main['max_temperature']).div(2)
ax.scatter(average_temp, average_temp, color='b')

In [None]:
# Pearson correlation between min and max temperature; note that p_value overwrites the earlier test result.
correlation_coefficient, p_value= pearsonr(df_main['min_temperature'], df_main['max_temperature'])
correlation_coefficient

In [None]:
# Store the min/max mid-point computed above as a single temperature feature.
df_main['avg_temp'] = average_temp

In [None]:
# Check that the new avg_temp column is present.
df_main.head(1)

In [None]:
# Keep only the columns listed here; every other column is dropped from df_main.
columns_to_keep = [
    'incident_date',
    'incident_time',
    'incident_day',
    'category',
    'subcategory',
    'latitude',
    'longitude',
    'avg_temp',
    'precipitation',
    'is_holiday',
    'holiday_type',
]
df_main = df_main.loc[:, columns_to_keep]


In [None]:
# Preview the reduced frame.
df_main.head()