In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticks
import seaborn as sns
import glob
import os


import missingno as miss
import holidays
import datetime as dt
from holidays import country_holidays
import statsmodels.tsa.api as tsa
SEED = 321
np.random.seed(SEED)
pd.set_option('display.max_columns', 100)
plt.style.use(('ggplot','fivethirtyeight'))
sns.set_context('notebook', font_scale=1.2)
plt.rcParams['figure.figsize'] = (12,4)
plt.rcParams['savefig.transparent'] = False
plt.rcParams['savefig.bbox'] = 'tight'
pd.set_option('display.float_format',lambda x: f"{x:,.2f}")

In [2]:
plt.rcParams['figure.facecolor']='white'
plt.rcParams['figure.figsize']=(12,4)
#import tick customization tools
import matplotlib.ticker as mticks
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
sns.set_context("talk", font_scale=0.9)

In [3]:
def format_xdates(ax, fig=None, xtick_fontweight='bold',
               title= None):
    """Formats x-axis with minor ticks every 3 months, and major 
    ticks every year. Includes different grid options for visibility"""
    # create the locator to place ticks every 3 months.
    loc_3months = mdates.MonthLocator(interval=3)
    fmt_months = mdates.DateFormatter("%b")
    ## for major year ticks
    loc_year = mdates.YearLocator()
    fmt_year = mdates.DateFormatter("%Y")
    
    ## Change the font of the major ticks to stand out
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(ax.get_xticklabels(), 
                       fontweight=xtick_fontweight) 
    ax.set_xticklabel_format(style = 'plain')
    
    ## customize minor ticks
    ax.xaxis.set_minor_locator(loc_3months)
    ax.xaxis.set_minor_formatter(fmt_months)
    ## customize major ticks
    ax.xaxis.set_major_locator(loc_year)
    ax.xaxis.set_major_formatter(fmt_year)
    ## Making major/minor gridlines visually distince
    ax.grid(which='minor',axis='x',ls=":")
    ax.grid(which='major',axis='x',color='k')
    if fig is None:
        fig = ax.get_figure()
    ## rotate the dates
    fig.autofmt_xdate(which='major',rotation=90,ha='center')
    return fig,ax

In [4]:
path = r"C:/Users/dell/Documents/Data Enrichment assignments/Project-Part-4/Data/Chicago"
all_files = glob.glob(os.path.join(path, "*.csv"))

df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)


In [5]:
holidays= pd.read_csv("C:/Users/dell/Downloads/Holidays-Chicago - Holidays-Chicago (1).csv", low_memory = False)
crime_df = df
crime_df.info()
crime_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7815543 entries, 0 to 7815542
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    int64  
 1   Date                  object 
 2   Primary Type          object 
 3   Description           object 
 4   Location Description  object 
 5   Arrest                bool   
 6   Domestic              bool   
 7   Beat                  int64  
 8   District              float64
 9   Ward                  float64
 10  Latitude              float64
 11  Longitude             float64
dtypes: bool(2), float64(4), int64(2), object(4)
memory usage: 611.2+ MB


Unnamed: 0,ID,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude
0,1324743,01/01/2001 01:00:00 PM,GAMBLING,ILLEGAL ILL LOTTERY,STREET,True,False,313,3.0,,41.78,-87.61
1,1319931,01/01/2001 01:00:00 PM,BATTERY,SIMPLE,RESIDENCE,False,False,825,8.0,,41.78,-87.68
2,6154338,01/01/2001 01:00:00 PM,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,False,False,831,8.0,15.0,41.77,-87.7
3,1318099,01/01/2001 01:00:00 AM,BATTERY,SIMPLE,RESIDENCE PORCH/HALLWAY,False,True,214,2.0,,41.82,-87.62
4,1311732,01/01/2001 01:00:00 PM,PUBLIC PEACE VIOLATION,ARSON THREAT,RESIDENCE,False,False,1433,14.0,,41.91,-87.67


# Stakeholder Questions

1. Crimes Across the Years:
 - Is the total number of crimes increasing or decreasing across the years?
 
 - Are there any individual crimes that are doing the opposite (e.g decreasing when overall crime is increasing or vice-versa)?
 
 
 2. Comparing AM vs. PM Rush Hour:
- Are crimes more common during AM rush hour or PM rush hour?
         - You can consider any crime that occurred between 7 AM - 10 AM as AM rush hour
         
       - You can consider any crime that occurred between 4 -7 PM as PM rush hour. 
        
 - What are the top 5 most common crimes during AM rush hour? What are the top 5 most common crimes during PM rush hour?
 
- Are Motor Vehicle Thefts more common during AM rush hour or PM Rush Hour?
        
3. Comparing Months:

- What months have the most crime? What months have the least?
- Are there any individual crimes that do not follow this pattern? If so, which crimes?

4. Comparing Holidays:

- Are there any holidays that show an increase in the # of crimes?
- Are there any holidays that show a decrease in the # of crimes?

In [6]:
#Question 1
crime_df['DateTime'] = pd.to_datetime(crime_df['Date'], errors = 'coerce')
crime_df= crime_df.set_index('Date')
crime_df= crime_df.sort_index()
crime_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7815543 entries, 01/01/2001 01:00:00 AM to 12/31/2022 12:59:00 PM
Data columns (total 12 columns):
 #   Column                Dtype         
---  ------                -----         
 0   ID                    int64         
 1   Primary Type          object        
 2   Description           object        
 3   Location Description  object        
 4   Arrest                bool          
 5   Domestic              bool          
 6   Beat                  int64         
 7   District              float64       
 8   Ward                  float64       
 9   Latitude              float64       
 10  Longitude             float64       
 11  DateTime              datetime64[ns]
dtypes: bool(2), datetime64[ns](1), float64(4), int64(2), object(3)
memory usage: 670.8+ MB


In [7]:
crime_df.head()

Unnamed: 0_level_0,ID,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude,DateTime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
01/01/2001 01:00:00 AM,1309918,THEFT,$500 AND UNDER,TAVERN/LIQUOR STORE,False,False,1924,19.0,,41.94,-87.65,2001-01-01 01:00:00
01/01/2001 01:00:00 AM,1310824,THEFT,$500 AND UNDER,BAR OR TAVERN,False,False,323,3.0,,41.77,-87.62,2001-01-01 01:00:00
01/01/2001 01:00:00 AM,3212105,OFFENSE INVOLVING CHILDREN,AGG SEX ASSLT OF CHILD FAM MBR,RESIDENCE,True,False,913,9.0,14.0,41.82,-87.7,2001-01-01 01:00:00
01/01/2001 01:00:00 AM,1422085,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,False,True,1023,10.0,,41.86,-87.7,2001-01-01 01:00:00
01/01/2001 01:00:00 AM,1312658,BATTERY,SIMPLE,STREET,False,False,2534,25.0,,41.92,-87.73,2001-01-01 01:00:00


In [8]:
crime_df.index

Index(['01/01/2001 01:00:00 AM', '01/01/2001 01:00:00 AM',
       '01/01/2001 01:00:00 AM', '01/01/2001 01:00:00 AM',
       '01/01/2001 01:00:00 AM', '01/01/2001 01:00:00 AM',
       '01/01/2001 01:00:00 AM', '01/01/2001 01:00:00 AM',
       '01/01/2001 01:00:00 AM', '01/01/2001 01:00:00 AM',
       ...
       '12/31/2022 12:41:00 PM', '12/31/2022 12:42:00 AM',
       '12/31/2022 12:44:00 AM', '12/31/2022 12:45:00 AM',
       '12/31/2022 12:45:00 PM', '12/31/2022 12:50:00 PM',
       '12/31/2022 12:50:00 PM', '12/31/2022 12:52:00 AM',
       '12/31/2022 12:52:00 PM', '12/31/2022 12:59:00 PM'],
      dtype='object', name='Date', length=7815543)

In [9]:
#crime_df['Date'] = pd.to_datetime(crime_df.strftime('%Y-%m-%d'))
#crime_df.head()

In [10]:
#crime_df['Year']= crime_df.index.year

AttributeError: 'Index' object has no attribute 'year'

In [11]:
year_counts = crime_df['Year'].value_counts().sort_index()
year_counts.plot(style='o-',grid=True,ylabel='# of Crimes', xlabel='Year');
#over the years the total amount of crimes seem to be decreasing vastly 

KeyError: 'Year'

In [None]:
#what are the types of crimes increasing over the years 
type_df = crime_df.groupby('Primary Type').resample('Y').sum()

In [None]:
#type_df= type_df.drop(columns=['Year'])
type_df.info()

In [None]:
type_df.head()

In [None]:
type_df.index.get_level_values(0)

In [None]:
type_df.index.get_level_values(1)