In [1]:
from tqdm import tqdm
import time
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob
#import tick customization tools
import matplotlib.ticker as mticks
import matplotlib.dates as mdates
## Figure defaults suited to wide time-series plots
plt.rcParams.update({
    'figure.figsize': (12, 4),
    'figure.facecolor': 'white',
})
sns.set_context("talk", font_scale=0.9)
# Seed NumPy's global random generator so random draws are repeatable
SEED = 321
np.random.seed(SEED)
# Show up to 50 columns when a wide DataFrame is displayed
pd.set_option('display.max_columns', 50)



# Loading Data

In [None]:
folder = "Data/Chicago/"
# A later cell writes Chicago_crime_counts.csv into this same folder. Skip it here so that
# re-running the notebook does not concatenate the derived daily counts with the raw records.
DERIVED_FILES = {"Chicago_crime_counts.csv"}
crime_files = sorted(
    f for f in glob.glob(folder + "*.csv")
    if os.path.basename(f) not in DERIVED_FILES
)
# ignore_index gives the combined frame a single clean row index instead of repeating
# each file's own row labels.
chicago_df = pd.concat([pd.read_csv(f) for f in crime_files], ignore_index=True)
chicago_df

## Chicago Dataframe

In [None]:
# Number of missing values in each column
chicago_df.isnull().sum()

In [None]:
# Keep only the rows that have a value in every column
chicago_df = chicago_df.dropna()
chicago_df

In [None]:
# The format ends with an AM/PM marker (%p), so the hour is on a 12-hour clock and must be
# parsed with %I. When paired with %H the %p marker is ignored, which would turn every
# afternoon/evening time into a morning time.
chicago_df['Datetime'] = pd.to_datetime(chicago_df['Date'], format="%m/%d/%Y %I:%M:%S %p")
# Order the rows chronologically and use the parsed timestamps as the index
chicago_df = chicago_df.sort_values('Datetime').set_index('Datetime')
chicago_df.info()
chicago_df.head()

## Holiday Dataframe

In [None]:
# Read every CSV that sits directly in Data/ and stack them into one frame.
# NOTE(review): this rebinds `crime_files`, which held the Chicago file list until now.
crime_files = sorted(glob.glob("Data/"+"*.csv"))
holiday_frames = [pd.read_csv(csv_path) for csv_path in crime_files]
holiday_df = pd.concat(holiday_frames)
holiday_df

In [None]:
# Number of missing values in each holiday_df column
holiday_df.isnull().sum()

In [None]:
# Drop rows that contain any null, then remove the "State Holidays" column.
# NOTE(review): rows are dropped for nulls in any column, "State Holidays" included, before
# that column is removed; confirm this order is intended.
holiday_df = holiday_df.dropna().drop(columns=["State Holidays"])
holiday_df

In [None]:
!pip install holidays

In [None]:
import holidays
import datetime as dt
from holidays import country_holidays

In [None]:
## making a date range that covers full dataset
# 'Date' holds the raw text read from the CSVs, so .min()/.max() on it compare strings
# alphabetically rather than chronologically. The parsed Datetime index gives the true
# first and last timestamps; normalize() moves them to midnight so the range is whole days.
first_day = chicago_df.index.min().normalize()
last_day = chicago_df.index.max().normalize()
all_days = pd.date_range(first_day, last_day, freq="D")
all_days



In [None]:
## Holiday lookup object for the United States as a whole
us_holidays = holidays.country_holidays(country='US')
us_holidays

In [None]:
## Try the lookup on the first day of the range
first_date = all_days[0]
print(first_date)
us_holidays.get(first_date)

In [None]:
## Look up the US holiday for every day in the range
holiday_list = list(map(us_holidays.get, all_days))
holiday_list[:5]

In [None]:
# Holidays for a specific subdivision (e.g. a state or province).
# NOTE(review): 'CO' is Colorado, but this notebook analyses Chicago data, so subdiv='IL'
# looks like what was intended. Changing it also means renaming `co_holidays` and the
# 'CO Holiday' column used in later cells, so confirm before switching.
co_holidays = country_holidays('US', subdiv='CO')
co_holidays

In [None]:
## Saving both holiday types as columns
# Look each calendar day up once and reuse the answer for every crime on that day, instead of
# handing the holiday lookup the raw 'Date' text once per row. The days come from the parsed
# Datetime index, normalized to midnight.
crime_days = chicago_df.index.normalize()
unique_days = crime_days.unique()
us_by_day = {day: us_holidays.get(day) for day in unique_days}
co_by_day = {day: co_holidays.get(day) for day in unique_days}
chicago_df["US Holiday"] = [us_by_day[day] for day in crime_days]
chicago_df['CO Holiday'] = [co_by_day[day] for day in crime_days]
chicago_df.head()

In [None]:
## US Holidays: number of rows falling on each US holiday
chicago_df['US Holiday'].value_counts()

In [None]:
## CO Holidays: number of rows falling on each holiday in the 'CO Holiday' column
chicago_df['CO Holiday'].value_counts()

In [None]:
# Bar chart of how many rows (crimes) fall on each US holiday, most frequent first.
# The original referenced an undefined frame name and a 'Total_Incidents' column that no
# visible cell creates; counting rows per holiday gives the total directly.
holiday_order = chicago_df['US Holiday'].value_counts().index
ax = sns.countplot(data=chicago_df, x='US Holiday', order=holiday_order)
ax.set(xlabel='US Holiday', ylabel='Number of crimes')
ax.set_xticklabels(ax.get_xticklabels(),rotation=45, ha='right');

# Resampled Dataframe

In [None]:
## Creating a Total # of Crimes
# every distinct value found in the Description column
crime_list = pd.unique(chicago_df['Description'])
crime_list

In [None]:
# Daily counts for each crime description, keyed by that description
CRIMES = {}
for crime in crime_list:
    # rows whose Description equals the current crime
    is_this_crime = chicago_df['Description'] == crime
    temp = chicago_df.loc[is_this_crime].copy()
    # number of matching rows on each calendar day
    temp_res = temp.resample("D").size()
    # store a copy of the daily counts under the crime description
    CRIMES[crime] = temp_res.copy()
CRIMES.keys()

In [None]:
# One column per crime description, one row per day
resampled_df = pd.DataFrame.from_dict(CRIMES)
resampled_df

In [None]:
## write the daily crime counts to disk
counts_path = "Data/Chicago/Chicago_crime_counts.csv"
resampled_df.to_csv(counts_path)

In [None]:
# Row count for each Description value (first five groups shown)
chicago_df.groupby(by="Description").size().head(5)

In [None]:
# II. Resample the temp DataFrame as Daily data (crime counts)
# and keep ONLY the .size()
# NOTE: `temp` is whatever the loop over crime_list left behind on its final iteration,
# so this cell only works after that loop has run.
temp_res = temp.resample("D").size()
temp_res




In [None]:
## read the saved daily counts back in to check the file
counts_path = "Data/Chicago/Chicago_crime_counts.csv"
resampled_df = pd.read_csv(counts_path, index_col=0, parse_dates=['Datetime'])
resampled_df

In [None]:
# Re-express the frame at daily frequency; the first three rows are shown as a check
resampled_df = resampled_df.resample('D').asfreq()
resampled_df.head(3)


In [None]:
# First entry of the index, kept in ts0 for inspection
ts0 = resampled_df.index[0]
ts0

In [None]:
# checking the documentation for astimezone
# (the trailing `?` is IPython help syntax, so this line only runs inside IPython/Jupyter)
ts0.astimezone?

# Removing the Time Zone

In [None]:
## remove time zone from the dt index
# The original operated on `df`, which no visible cell defines; the frame being inspected in
# the surrounding cells is resampled_df. tz_convert raises on an index that carries no time
# zone, so the zone is only stripped when one is present.
if resampled_df.index.tz is not None:
    resampled_df = resampled_df.tz_convert(None)
resampled_df.head(3)

In [None]:
# Number of missing values in each crime column
resampled_df.isnull().sum()

In [None]:
## replace every remaining null with 0
resampled_df = resampled_df.fillna(value=0)
resampled_df



# Q1: Which district has the most crimes? Which has the least?


In [None]:
# Count the rows (crimes) recorded in each district. .size() counts rows per group, whereas
# .sum() adds up the values held in every column, which is not a crime count.
# Sorted from most to least, so the first row is the district with the most crimes and the
# last row is the district with the fewest.
df_ts = (
    chicago_df.groupby('District')
    .size()
    .sort_values(ascending=False)
    .to_frame('Total Crimes')
)
df_ts