# Setup and Imports

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
import seaborn as sns
sns.set
import warnings
import re
from pandas.io import gbq
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import xgboost
import pickle
from sklearn.model_selection import ParameterSampler
from scipy import sparse
#Custom Python Module with functions specifically for this project
import ChicagoDataCleaningFunctions as cd
#Custom Python Module to fetch the data
import FetchChicagoData as fc

import PrepareChicago as pc

# Get the Data

In [12]:
%%time
#Specify input values for fetching the data
query = """
            SELECT unique_key, date, primary_type, location_description, 
                    arrest, domestic, community_area, year
            FROM `gdac-327115.Chicago.chicago2`
            WHERE year >= 2011
        """
project_id = "gdac-327115"
excel_file = "ChicagoCommunityAreas.xlsx"

#Fetch the data
chicago = fc.fetch_chicago_data(query, project_id, excel_file, verbose=True)

Fetching Chicago Data Started...

Successfully queried Google BigQuery.
Sucessfully read in excel file.
Sucessfully joined Chicago districts to main data.
Successfully dropped duplicate column

Succcessfully fetched Chicago Data
Wall time: 3min 20s


# Clean the Data

In [99]:
%%capture --no-stdout
#Clean the full data set
cd.chicago_data_cleaner(chicago, verbose = True)

Cleaning Started...

Successfully Cleaned Primary Type
Successfully Imputed Location
Successfully Cleaned Location
Successfully Added Month Column
Successfully Added Hour Column
Successfully Cleaned Community

Data Set Successfully Cleaned!


# Load in Production Model

In [4]:
best_model = pickle.load(open("best_model.sav", 'rb'))
best_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.45195454681591674,
              enable_categorical=False, gamma=0.546708263364187, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.38768070515882624, max_delta_step=0, max_depth=7,
              min_child_weight=25, missing=nan, monotone_constraints='()',
              n_estimators=195, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=0.6338249886045665,
              scale_pos_weight=1, subsample=0.7838501639099957,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [13]:
chicago.columns

Index(['unique_key', 'date', 'primary_type', 'location_description', 'arrest',
       'domestic', 'year', 'district_name', 'community_name', 'Month', 'Hour'],
      dtype='object')

In [136]:
chicago.dtypes

unique_key                            Int64
date                    datetime64[ns, UTC]
primary_type                         object
location_description                 object
arrest                              boolean
domestic                            boolean
year                                  Int64
district_name                        object
community_name                       object
Month                                 int64
Hour                                  int64
dtype: object

In [15]:
chicago_districts

Unnamed: 0,community_area,district_name,community_name
0,1,Far North,ROGERS PARK
1,2,Far North,WEST RIDGE
2,3,Far North,UPTOWN
3,4,Far North,LINCOLN SQUARE
4,5,North,NORTH CENTER
...,...,...,...
72,73,Far Southwest,WASHINGTON HEIGHTS
73,74,Far Southwest,MOUNT GREENWOOD
74,75,Far Southwest,MORGAN PARK
75,76,Far North,OHARE


In [17]:
chicago.head()

Unnamed: 0,unique_key,date,primary_type,location_description,arrest,domestic,year,district_name,community_name,Month,Hour
0,10225218,2015-09-05 23:00:00+00:00,ROBBERY,CTA,False,False,2015,Far Southeast,RIVERDALE,9,23
1,10341077,2015-12-10 05:15:00+00:00,BURGLARY,RESIDENCE,False,False,2015,Far Southeast,RIVERDALE,12,5
2,11861007,2019-10-14 22:47:00+00:00,OTHER OFFENSE,STREET,True,False,2019,Far Southeast,RIVERDALE,10,22
3,10599541,2016-07-15 22:30:00+00:00,INTERFERENCE WITH PUBLIC OFFICER,PARKING,True,False,2016,Far Southeast,RIVERDALE,7,22
4,10707134,2016-10-04 17:00:00+00:00,DECEPTIVE PRACTICE,STREET,False,False,2016,Far Southeast,RIVERDALE,10,17


In [27]:
def get_district(community):
    district = dict(zip(chicago["community_name"], chicago["district_name"]))
    return district[community]

In [14]:
def str_to_date(date_str, time_str):
    crime_time = date_str + " " + time_str
    return datetime.strptime(crime_time, "%m/%d/%Y %H:%M:%S")

test_date = "2/13/2022"
test_hour = "2:30:15"
str_to_date(test_date, test_hour)

datetime.datetime(2022, 2, 13, 2, 30, 15)

In [28]:
def user_input():
    community_name = str.upper(input("Enter the Community Name: "))
    district = get_district(community_name)
    
    primary_type = str.upper(input("Enter the crime committed: "))
    
    location_description = str.upper(input("Enter the crime's location (street, residence, etc.): "))
    
    domestic = (str.upper(input("Was the crime domestic? (Yes/No): ")) == "YES")
    
    date = input("Date of Crime (Ex. 01/01/2022): ")
    
    hour = input("Time of Crime: ")
    
    date_time = str_to_date(date, hour)
    
    crime_instance = pd.DataFrame({"date": date_time, "primary_type": primary_type, "location_description": location_description,
                                  "domestic": domestic, "district_name": district, "community_name": community_name},
                                 index = [1])
    return(crime_instance)

In [29]:
def user_input():
    community_name = str.upper(input("Enter the Community Name: "))
    district = get_district(community_name)
    
    primary_type = str.upper(input("Enter the crime committed: "))
    
    location_description = str.upper(input("Enter the crime's location (street, residence, etc.): "))
    
    domestic = (str.upper(input("Was the crime domestic? (Yes/No): ")) == "YES")
    
    date = input("Date of Crime (Ex. 01/01/2022): ")
    
    hour = input("Time of Crime: ")
    
    date_time = str_to_date(date, hour)
    
    return [community_name, district, primary_type, location_description, domestic, date_time]

In [101]:
def unique_column_values(df):
    crime_values = list(np.sort(df["primary_type"].value_counts().index))
    location_values = list(np.sort(df["location_description"].value_counts().index))
    domestic_values = list(np.sort(df["domestic"].value_counts().index))
    district_values = list(np.sort(df["district_name"].value_counts().index))
    community_values = list(np.sort(df["community_name"].value_counts().index))
    Month_values = list(np.sort(df["Month"].value_counts().index))
    Hour_values = list(np.sort(df["Hour"].value_counts().index))

    column_values = [crime_values, location_values, domestic_values, district_values, community_values, Month_values, Hour_values]
    return column_values

In [240]:
crimetype = "THEFT"
crime_cats = pd.Categorical(chicago["primary_type"]).categories
crime = pd.Categorical(crimetype, categories=crime_cats)
crime
temp = pd.DataFrame({"primary_type": crime}, index = [1])

In [30]:
def input_to_category(df):
    crime_details = user_input()[::-1]
    
    communities = pd.Categorical(chicago["community_name"]).categories
    community_name = pd.Categorical(crime_details.pop(), categories=communities)
    
    districts = pd.Categorical(chicago["district_name"]).categories
    district = pd.Categorical(crime_details.pop(), categories=districts)
    
    crime_grps = pd.Categorical(chicago["primary_type"]).categories
    crime = pd.Categorical(crime_details.pop(), categories=crime_grps)
    
    location_grps = pd.Categorical(chicago["location_description"]).categories
    location = pd.Categorical(crime_details.pop(), categories=location_grps)
    
    domestic_grp = pd.Categorical(chicago["domestic"]).categories
    domestic = pd.Categorical(crime_details.pop(), categories=domestic_grp)
    
    crime_instance = pd.DataFrame({"date": crime_details.pop(), "primary_type": crime, 
                                   "location_description": location, "domestic": domestic,
                                   "district_name": district, "community_name": community_name},
                                 index = [1])
    return crime_instance

In [31]:
temp = input_to_category(chicago)

Enter the Community Name: rogers park
Enter the crime committed: theft
Enter the crime's location (street, residence, etc.): street
Was the crime domestic? (Yes/No): yes
Date of Crime (Ex. 01/01/2022): 02/16/2022
Time of Crime: 2:30:45


In [32]:
temp

Unnamed: 0,date,primary_type,location_description,domestic,district_name,community_name
1,2022-02-16 02:30:45,THEFT,STREET,True,Far North,ROGERS PARK


In [33]:
cd.chicago_data_cleaner(temp, verbose=True)

Cleaning Started...

Successfully Cleaned Primary Type
Successfully Imputed Location
Successfully Cleaned Location
Successfully Added Month Column
Successfully Added Hour Column
Successfully Cleaned Community

Data Set Successfully Cleaned!


In [34]:
temp

Unnamed: 0,date,primary_type,location_description,domestic,district_name,community_name,Month,Hour
0,2022-02-16 02:30:45,THEFT,STREET,True,Far North,ROGERS PARK,2,2


In [45]:
attribs = ["primary_type", "location_description", "domestic", "district_name", "community_name", "Month", "Hour"]
prep_temp = pc.prepare_chicago_features(temp, attribs)

In [37]:
list(temp.columns)

['date',
 'primary_type',
 'location_description',
 'domestic',
 'district_name',
 'community_name',
 'Month',
 'Hour']

In [102]:
unique_vals = unique_column_values(chicago)
cat_encoder = OneHotEncoder(categories=unique_vals)
X = cat_encoder.fit_transform(temp[attribs])

In [103]:
X.shape

(1, 183)

In [61]:
OneHotEncoder?

In [71]:
enc = OneHotEncoder(categories=[["Male", "Female", "Non-binary"], [1,2,3]], sparse = False)
X = [['Male', 1], ['Female', 3], ['Female', 2]]
X_prep = enc.fit_transform(X)

In [72]:
X_prep.shape

(3, 6)

In [73]:
X_prep

array([[1., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.]])

In [70]:
enc.categories_

[array(['Male', 'Female', 'Non-binary'], dtype=object),
 array([1, 2, 3], dtype=object)]

In [66]:
OneHotEncoder?

In [53]:
temp["Hour"].value_counts()

2     1
0     0
13    0
22    0
21    0
20    0
19    0
18    0
17    0
16    0
15    0
14    0
12    0
1     0
11    0
10    0
9     0
8     0
7     0
6     0
5     0
4     0
3     0
23    0
Name: Hour, dtype: int64

In [199]:
test_instance = user_input()

Enter the Community Name: rogers park
Enter the crime committed: theft
Enter the crime's location (street, residence, etc.): street
Was the crime domestic? (Yes/No): yes
Date of Crime (Ex. 01/01/2022): 02/14/2022
Time of Crime: 2:30:15


In [249]:
def transform_input(df):
    
    cd.chicago_data_cleaner(df)
    
transform_input(df = temp)
temp

Unnamed: 0,date,primary_type,location_description,domestic,district_name,community_name,Month,Hour
0,2022-02-14 02:30:45,THEFT,STREET,True,Far North,ROGERS PARK,2,2


In [256]:
chicago.head()

Unnamed: 0,unique_key,date,primary_type,location_description,arrest,domestic,year,district_name,community_name,Month,Hour
0,10225218,2015-09-05 23:00:00+00:00,ROBBERY,CTA,False,False,2015,Far Southeast,RIVERDALE,9,23
1,10341077,2015-12-10 05:15:00+00:00,BURGLARY,RESIDENCE,False,False,2015,Far Southeast,RIVERDALE,12,5
2,11861007,2019-10-14 22:47:00+00:00,OTHER OFFENSE,STREET,True,False,2019,Far Southeast,RIVERDALE,10,22
3,10599541,2016-07-15 22:30:00+00:00,INTERFERENCE WITH PUBLIC OFFICER,PARKING,True,False,2016,Far Southeast,RIVERDALE,7,22
4,10707134,2016-10-04 17:00:00+00:00,DECEPTIVE PRACTICE,STREET,False,False,2016,Far Southeast,RIVERDALE,10,17


In [274]:
chicago_sub = chicago.loc[chicago["date"].dt.month == 2, ["unique_key", "date", "primary_type", "location_description", "domestic", "district_name", "community_name"]].copy()

In [275]:
chicago_sub["Month"] = pd.Categorical(chicago_sub["date"].dt.month, categories=range(1,13))

In [276]:
chicago_sub.head()

Unnamed: 0,unique_key,date,primary_type,location_description,domestic,district_name,community_name,Month
14,9505238,2014-02-21 19:00:00+00:00,MOTOR VEHICLE THEFT,STREET,False,Far Southeast,RIVERDALE,2
41,10414085,2016-02-11 10:00:00+00:00,DECEPTIVE PRACTICE,APARTMENT,False,Far Southeast,RIVERDALE,2
60,10433256,2016-02-29 21:45:00+00:00,OFFENSE INVOLVING CHILDREN,RESIDENCE,False,Far Southeast,RIVERDALE,2
69,9017651,2013-02-19 20:00:00+00:00,OTHER OFFENSE,RESIDENCE,True,Far Southeast,RIVERDALE,2
101,10848541,2017-02-13 20:50:00+00:00,DECEPTIVE PRACTICE,VEHICLE,False,Far Southeast,RIVERDALE,2


In [277]:
chicago_sub["Month"].value_counts()

2     206735
1          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
Name: Month, dtype: int64

In [221]:
cat_attribs = ["primary_type", "location_description", "domestic", "district_name", "community_name", "Month", "Hour"]

def transform_input2(df, attribs):
    cat_encoder = OneHotEncoder()
    X = cat_encoder.fit_transform(df[attribs])
    return X

crime = transform_input2(test_instance, cat_attribs)

In [222]:
crime.shape

(1, 7)

In [232]:
test_cate = pd.Categorical(chicago["primary_type"]).categories
crime = pd.Categorical(test_instance["primary_type"])
crime.set_categories(test_cate)

['THEFT']
Categories (30, object): ['ARSON', 'ASSAULT', 'BATTERY', 'BURGLARY', ..., 'SEX OFFENSE', 'STALKING', 'THEFT', 'WEAPONS VIOLATION']

In [213]:
cp.prepare_chicago(test_instance, cat_attribs)

NameError: name 'OneHotEncoder' is not defined

In [209]:
dir(cp)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'prepare_chicago']

In [201]:
test_instance

Unnamed: 0,date,primary_type,location_description,domestic,district_name,community_name,Month,Hour
0,2022-02-14 02:30:15,THEFT,STREET,True,Far North,ROGERS PARK,2,2


In [190]:
"""
community_name = str.upper(input("Enter the Community Name: "))
primary_type = str.upper(input("Enter the crime committed: "))
location_description = str.upper(input("Enter the crime's location (street, residence, etc.): "))
domestic = (str.upper(input("Was the crime domestic? (Yes/No): ")) == "YES")
date = input("Date of Crime (Ex. 01/01/2022): ")
hour = input("Time of Crime:")
"""
print(district_name)
print(community_name)
print(primary_type)
print(location_description)
print(domestic)
print(date)

#user_input()


Far North
['ROGERS PARK' 'WEST RIDGE' 'UPTOWN' 'LINCOLN SQUARE' 'NORTH CENTER'
 'LAKE VIEW' 'LINCOLN PARK' 'NEAR NORTH SIDE' 'EDISON PARK' 'NORWOOD PARK'
 'JEFFERSON PARK' 'FOREST GLEN' 'NORTH PARK' 'ALBANY PARK' 'PORTAGE PARK'
 'IRVING PARK' 'DUNNING' 'MONTCLARE' 'BELMONT CRAGIN' 'HERMOSA' 'AVONDALE'
 'LOGAN SQUARE' 'HUMBOLDT PARK' 'WEST TOWN' 'AUSTIN' 'WEST GARFIELD PARK'
 'EAST GARFIELD PARK' 'NEAR WEST SIDE' 'NORTH LAWNDALE' 'SOUTH LAWNDALE'
 'LOWER WEST SIDE' 'LOOP' 'NEAR SOUTH SIDE' 'ARMOUR SQUARE' 'DOUGLAS'
 'OAKLAND' 'FULLER PARK' 'GRAND BOULEVARD' 'KENWOOD' 'WASHINGTON PARK'
 'HYDE PARK' 'WOODLAWN' 'SOUTH SHORE' 'CHATHAM' 'AVALON PARK'
 'SOUTH CHICAGO' 'BURNSIDE' 'CALUMET HEIGHTS' 'ROSELAND' 'PULLMAN'
 'SOUTH DEERING' 'EAST SIDE' 'WEST PULLMAN' 'RIVERDALE' 'HEGEWISCH'
 'GARFIELD RIDGE' 'ARCHER HEIGHTS' 'BRIGHTON PARK' 'MCKINLEY PARK'
 'BRIDGEPORT' 'NEW CITY' 'WEST ELSDON' 'GAGE PARK' 'CLEARING' 'WEST LAWN'
 'CHICAGO LAWN' 'WEST ENGLEWOOD' 'ENGLEWOOD' 'GREATER GRAND CROSSING'
 

In [196]:
pd.DataFrame({"test1": 1, "test2": 2}, index = [1])

Unnamed: 0,test1,test2
1,1,2


In [166]:
community_name = np.array(chicago_districts["community_name"])
district = dict(keys = chicago_districts["community_name"], values = chicago_districts["district_name"])

In [176]:
district = dict(zip(chicago_districts["community_name"], chicago_districts["district_name"]))
print(district["ROGERS PARK"])
get_district("ROGERS PARK")

Far North


'Far North'

In [164]:
district.keys()

dict_keys(['keys', 'values'])

In [151]:
chicago_districts

Unnamed: 0,community_area,district_name,community_name
0,1,Far North,ROGERS PARK
1,2,Far North,WEST RIDGE
2,3,Far North,UPTOWN
3,4,Far North,LINCOLN SQUARE
4,5,North,NORTH CENTER
...,...,...,...
72,73,Far Southwest,WASHINGTON HEIGHTS
73,74,Far Southwest,MOUNT GREENWOOD
74,75,Far Southwest,MORGAN PARK
75,76,Far North,OHARE


In [36]:
crime_date = input("Date (Ex: 01/01/2022): ")
print(date(crime_date))

Date (Ex: 01/01/2022): 01/01/2022


TypeError: an integer is required (got type str)

In [127]:
def str_to_date(date_str, time_str):
    #month, day, year = str.split(string, "/")
    date_lst = str.split(string, "/")[::-1]
    
    month = int(date_lst.pop())
    if month not in np.arange(1,13): 
        raise ValueError("Month has to be between 1 and 12")
    
    day = int(date_lst.pop())
    if day not in np.arange(1,32):
        raise ValueError("Day has to be between 1 and 31")
        
    year = int(date_lst.pop())
    return date()

test = "11/31/2022"
str_to_date(test)

(11, 31, 2022)

In [145]:
test_date = "01/02/2022"
test_hour = "00:30:15"
crime_time = test_date + " " + test_hour
print(crime_time)
test = datetime.strptime(crime_time, "%d/%m/%Y %H:%M:%S")
print(test)

01/02/2022 00:30:15
2022-02-01 00:30:15


In [119]:
test = str.split("01/02/2022", "/")
print(test[::-1])
month = int(test.pop())
day = int(test.pop())
year = int(test.pop())
print(month)
print(day)
print(year)

['2022', '02', '01']
2022
2
1


In [115]:
test.pop?

In [90]:
pattern = "-"
test = "test/test-test/"
re.split(pattern, test)

['test/test', 'test/']

In [95]:
str.replace("test-test-test", "", "/")

'test-test-test'

In [35]:
from datetime import date
date?

In [18]:
str.upper("test")

'TEST'

In [29]:
domestic = (str.upper(input("Was the crime domestic? (Yes/No): ")) == "YES")
domestic
print(domestic)

Was the crime domestic? (Yes/No): yes
True


In [21]:
astyp

Object `astype` not found.
