# Setup and Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Apply seaborn's default plot theme. The function has to be called: the bare
# attribute reference `sns.set` evaluates to the function object and does nothing.
sns.set()
import warnings
import re
from pandas.io import gbq
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from scipy import sparse

# Get the Data

In [3]:
# SQL sent to BigQuery: the incident columns used in this notebook, for 2011 onwards.
chicago_query = """
                    SELECT unique_key, date, primary_type, location_description, 
                            arrest, domestic, community_area,year, latitude, longitude,
                    FROM `gdac-327115.Chicago.chicago2`
                    WHERE year >= 2011
               """

# Run the query and load the result into a DataFrame.
chicago_data = gbq.read_gbq(chicago_query, project_id="gdac-327115")

In [4]:
# Read the Excel lookup table that maps each Chicago community area to its district
# (described as a one-to-one mapping; the file must sit next to the notebook).
chicago_districts = pd.read_excel("ChicagoCommunityAreas.xlsx")

In [5]:
# Cast the join key to the nullable "string" dtype in both frames so the
# community_area columns have the same dtype when they are merged.
chicago_districts["community_area"] = chicago_districts["community_area"].astype("string")
chicago_data["community_area"] = chicago_data["community_area"].astype("string")

In [6]:
# Outer join on the shared community_area key: rows from either frame are kept
# even when they have no match in the other.
chicago = pd.merge(chicago_data, chicago_districts, how="outer", on="community_area")

In [7]:
# Inspect the dtype of every column in the merged frame
chicago.dtypes

unique_key                            Int64
date                    datetime64[ns, UTC]
primary_type                         object
location_description                 object
arrest                              boolean
domestic                            boolean
community_area                       string
year                                  Int64
latitude                            float64
longitude                           float64
district_name                        object
community_name                       object
dtype: object

In [8]:
# Change the object data types to categorical.
# The conversion goes through astype() with an explicit column -> dtype mapping.
# Assigning through `chicago.loc[:, mask] = ...` writes the values into the existing
# object arrays on recent pandas versions, which silently leaves the dtype as object.
object_columns = chicago.select_dtypes(include="object").columns
chicago = chicago.astype({column: "category" for column in object_columns})

In [9]:
# community_name already identifies the area, so the community_area key is no longer needed.
chicago = chicago.drop(columns=["community_area"])

### Explore first few rows

In [10]:
# Preview the first rows of the merged frame
chicago.head()

Unnamed: 0,unique_key,date,primary_type,location_description,arrest,domestic,year,latitude,longitude,district_name,community_name
0,11228094,2017-11-20 15:00:00+00:00,DECEPTIVE PRACTICE,POOL ROOM,False,False,2017,,,Far North,NORTH PARK
1,12135292,2020-08-13 12:00:00+00:00,DECEPTIVE PRACTICE,BANK,False,False,2020,41.993402,-87.71228,Far North,NORTH PARK
2,8905294,2012-11-27 17:30:00+00:00,PUBLIC PEACE VIOLATION,RESTAURANT,False,False,2012,41.978265,-87.708742,Far North,NORTH PARK
3,8805120,2012-09-17 00:30:00+00:00,DECEPTIVE PRACTICE,TAXICAB,False,False,2012,41.985277,-87.713827,Far North,NORTH PARK
4,9030552,2013-02-21 00:00:00+00:00,CRIM SEXUAL ASSAULT,RESIDENCE,False,True,2013,41.994323,-87.720213,Far North,NORTH PARK


In [11]:
# Dimensions of the merged frame as (rows, columns)
chicago.shape

(2971810, 11)

# Split the Data into a Training Set and Test Set

In [12]:
# Hold out 2021 as the test set; every other year goes into the training set.
# .copy() makes each subset an independent frame: the subsets are cleaned in place
# later on, and mutating a bare slice of `chicago` triggers SettingWithCopyWarning
# and leaves it unclear whether `chicago` itself is modified.
TEST_YEAR = 2021
chicago_train = chicago.loc[chicago["year"] != TEST_YEAR].copy()
chicago_test = chicago.loc[chicago["year"] == TEST_YEAR].copy()

In [20]:
# Check if the test set contains data from the full year by counting incidents per month.
# The output columns are named explicitly with rename_axis()/reset_index(name=...).
# Renaming the default "index"/"date" columns produced by reset_index() is fragile:
# those default names differ between pandas versions, and when they do not match,
# the "Month" column is never created and the sort fails.
(
    chicago_test["date"]
    .dt.month
    .value_counts()
    .sort_index()
    .rename_axis("Month")
    .reset_index(name="Count")
)

Unnamed: 0,Month,Count
5,1,16008
8,2,12852
6,3,15709
7,4,15279
4,5,17493
2,6,18468
0,7,18898
3,8,18110
1,9,18632
9,10,5507


# Data Exploration and Cleaning

In [22]:
# Number of missing values per column in the training set
chicago_train.isna().sum()

unique_key                  0
date                        0
primary_type                0
location_description     7844
arrest                      0
domestic                    0
year                        0
latitude                26843
longitude               26843
district_name             218
community_name            218
dtype: int64

In [21]:
# Every incident must appear exactly once: unique_key may not contain duplicates.
assert chicago_train["unique_key"].is_unique

### Cleaning Primary Type Variable

In [23]:
# Number of missing values in primary_type, for quick reference
chicago_train["primary_type"].isna().sum()

0

In [24]:
# Frequency of each crime type in the training set, most common first
chicago_train["primary_type"].value_counts()

THEFT                                636007
BATTERY                              512318
CRIMINAL DAMAGE                      299943
NARCOTICS                            222406
ASSAULT                              189549
OTHER OFFENSE                        171368
DECEPTIVE PRACTICE                   167709
BURGLARY                             152532
MOTOR VEHICLE THEFT                  120013
ROBBERY                              108092
CRIMINAL TRESPASS                     69971
WEAPONS VIOLATION                     45871
OFFENSE INVOLVING CHILDREN            23359
PUBLIC PEACE VIOLATION                21829
PROSTITUTION                          12438
CRIM SEXUAL ASSAULT                   12334
INTERFERENCE WITH PUBLIC OFFICER      11793
SEX OFFENSE                           10771
HOMICIDE                               5669
ARSON                                  4482
GAMBLING                               3507
LIQUOR LAW VIOLATION                   3407
CRIMINAL SEXUAL ASSAULT         

It appears "CRIM SEXUAL ASSAULT" should be merged into "CRIMINAL SEXUAL ASSAULT". We'll also merge the three "NON-CRIMINAL" variants into a single "NON-CRIMINAL" category, and fold "OTHER NARCOTIC VIOLATION" into "NARCOTICS". Finally, we'll move "RITUALISM" into "OTHER OFFENSE" since it has only one occurrence. Let's collect all of these changes into one function.

In [25]:
def crime_type_cleaner(df):
    """
    Clean the primary_type column by merging duplicate or rare crime labels.

    The column is updated on the DataFrame that is passed in, and that same
    DataFrame is returned so the call can also be chained or reassigned.

    Labels merged:
      - "CRIM SEXUAL ASSAULT"               -> "CRIMINAL SEXUAL ASSAULT"
      - "OTHER NARCOTIC VIOLATION"          -> "NARCOTICS"
      - "NON - CRIMINAL"                    -> "NON-CRIMINAL"
      - "NON-CRIMINAL (SUBJECT SPECIFIED)"  -> "NON-CRIMINAL"
      - "RITUALISM"                         -> "OTHER OFFENSE"

    df: DataFrame with primary_type column

    returns: df (the same object, with primary_type cleaned)
    """
    label_map = {
        "CRIM SEXUAL ASSAULT": "CRIMINAL SEXUAL ASSAULT",
        "OTHER NARCOTIC VIOLATION": "NARCOTICS",
        "NON - CRIMINAL": "NON-CRIMINAL",
        "NON-CRIMINAL (SUBJECT SPECIFIED)": "NON-CRIMINAL",
        "RITUALISM": "OTHER OFFENSE",
    }

    crime_types = df["primary_type"]
    if isinstance(crime_types.dtype, pd.CategoricalDtype):
        # Relabel through object dtype and rebuild the categories afterwards, so
        # merged labels disappear from the category list and the result does not
        # depend on how replace() treats categorical data.
        cleaned = crime_types.astype(object).replace(label_map).astype("category")
    else:
        cleaned = crime_types.replace(label_map)

    # Assign the column back explicitly. Calling replace(..., inplace=True) on
    # df.loc[:, "primary_type"] modifies a temporary Series, which is not
    # guaranteed to write through to df.
    df["primary_type"] = cleaned
    return df
   

In [29]:
# Apply the label clean-up to the training set (the function updates chicago_train itself),
# then re-count the crime types to confirm the categories were merged.
crime_type_cleaner(chicago_train)
chicago_train["primary_type"].value_counts()

THEFT                                636007
BATTERY                              512318
CRIMINAL DAMAGE                      299943
NARCOTICS                            222467
ASSAULT                              189549
OTHER OFFENSE                        171369
DECEPTIVE PRACTICE                   167709
BURGLARY                             152532
MOTOR VEHICLE THEFT                  120013
ROBBERY                              108092
CRIMINAL TRESPASS                     69971
WEAPONS VIOLATION                     45871
OFFENSE INVOLVING CHILDREN            23359
PUBLIC PEACE VIOLATION                21829
CRIMINAL SEXUAL ASSAULT               14850
PROSTITUTION                          12438
INTERFERENCE WITH PUBLIC OFFICER      11793
SEX OFFENSE                           10771
HOMICIDE                               5669
ARSON                                  4482
GAMBLING                               3507
LIQUOR LAW VIOLATION                   3407
KIDNAPPING                      