In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the cleaned Chicago crime export into a DataFrame.
csv_path = '~/Downloads/cleaned_chicago_crime_data.csv'
crimes = pd.read_csv(csv_path)

In [3]:
# Distinct values of the 'District' column, in order of first appearance.
district_column = crimes['District']
districts = district_column.unique().tolist()
districts

[11.0,
 7.0,
 2.0,
 9.0,
 5.0,
 6.0,
 4.0,
 12.0,
 15.0,
 14.0,
 1.0,
 20.0,
 10.0,
 22.0,
 8.0,
 17.0,
 3.0,
 18.0,
 16.0,
 25.0,
 24.0,
 19.0,
 21.0]

In [4]:
# Distinct values of the 'Primary Type' column, in order of first appearance.
crime_types = pd.unique(crimes['Primary Type']).tolist()
crime_types

['BATTERY',
 'OTHER OFFENSE',
 'ROBBERY',
 'NARCOTICS',
 'CRIMINAL DAMAGE',
 'WEAPONS VIOLATION',
 'THEFT',
 'BURGLARY',
 'MOTOR VEHICLE THEFT',
 'PUBLIC PEACE VIOLATION',
 'DECEPTIVE PRACTICE',
 'ASSAULT',
 'CRIMINAL TRESPASS',
 'CRIM SEXUAL ASSAULT',
 'INTERFERENCE WITH PUBLIC OFFICER',
 'ARSON',
 'LIQUOR LAW VIOLATION',
 'KIDNAPPING',
 'SEX OFFENSE',
 'OFFENSE INVOLVING CHILDREN',
 'PROSTITUTION',
 'HOMICIDE',
 'GAMBLING',
 'INTIMIDATION',
 'STALKING',
 'OBSCENITY',
 'PUBLIC INDECENCY',
 'HUMAN TRAFFICKING',
 'CONCEALED CARRY LICENSE VIOLATION',
 'OTHER NARCOTIC VIOLATION',
 'NON - CRIMINAL',
 'NON-CRIMINAL',
 'RITUALISM',
 'NON-CRIMINAL (SUBJECT SPECIFIED)',
 'DOMESTIC VIOLENCE']

In [5]:
# Raw (upper-case) 'Primary Type' labels grouped by category.
# Labels treated as violent crimes.
violent_crimes = [
    'BATTERY',
    'ASSAULT',
    'CRIM SEXUAL ASSAULT',
    'ARSON',
    'HOMICIDE',
]
# Labels treated as felonies.
felonies = [
    'HUMAN TRAFFICKING',
    'HOMICIDE',
    'ARSON',
    'KIDNAPPING',
    'BURGLARY',
    'ROBBERY',
    'BATTERY',
    'STALKING',
    'SEX OFFENSE',
]

In [6]:
# One DataFrame per violent crime label, in the same order as violent_crimes.
violents = [
    crimes.loc[crimes['Primary Type'] == crime]
    for crime in violent_crimes
]

In [7]:
# Stack the per-crime-type frames into a single DataFrame of violent crimes.
# The previous loop called DataFrame.append, which returns a new frame instead
# of modifying the caller (and was removed in pandas 2.0); the result was
# discarded, so df_violent only ever held the first frame. pd.concat keeps
# every frame and preserves the original row index.
df_violent = pd.concat(violents)

In [8]:
# Evaluate df_violent; the trailing semicolon suppresses the cell's output.
df_violent;

In [9]:
import pandas as pd

In [10]:
# Re-read the cleaned crime export from disk, replacing any earlier `crimes` frame.
csv_path = '~/Downloads/cleaned_chicago_crime_data.csv'
crimes = pd.read_csv(csv_path)

In [11]:
# Preview the first five rows of the freshly loaded data.
crimes.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,0,10000092,HY189866,03/18/2015 07:44:00 PM,047XX W OHIO ST,041A,BATTERY,AGGRAVATED: HANDGUN,STREET,False,...,28.0,25.0,04B,1144606.0,1903566.0,2015,02/10/2018 03:50:01 PM,41.891399,-87.744385,"(41.891398861, -87.744384567)"
1,1,10000094,HY190059,03/18/2015 11:00:00 PM,066XX S MARSHFIELD AVE,4625,OTHER OFFENSE,PAROLE VIOLATION,STREET,True,...,15.0,67.0,26,1166468.0,1860715.0,2015,02/10/2018 03:50:01 PM,41.773372,-87.665319,"(41.773371528, -87.665319468)"
2,2,10000095,HY190052,03/18/2015 10:45:00 PM,044XX S LAKE PARK AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,...,4.0,39.0,08B,1185075.0,1875622.0,2015,02/10/2018 03:50:01 PM,41.813861,-87.596643,"(41.81386068, -87.596642837)"
3,3,10000096,HY190054,03/18/2015 10:30:00 PM,051XX S MICHIGAN AVE,0460,BATTERY,SIMPLE,APARTMENT,False,...,3.0,40.0,08B,1178033.0,1870804.0,2015,02/10/2018 03:50:01 PM,41.800802,-87.622619,"(41.800802415, -87.622619343)"
4,4,10000097,HY189976,03/18/2015 09:00:00 PM,047XX W ADAMS ST,031A,ROBBERY,ARMED: HANDGUN,SIDEWALK,False,...,28.0,25.0,03,1144920.0,1898709.0,2015,02/10/2018 03:50:01 PM,41.878065,-87.743354,"(41.878064761, -87.743354013)"


In [12]:
# Index the rows by the 'ID' column (note: this is the record ID, not the
# 'Case Number' column, which is dropped below), then discard the columns
# that are not needed for the analysis.
unneeded_columns = [
    'Beat', 'FBI Code', 'Updated On', 'Case Number', 'Block', 'IUCR',
    'Unnamed: 0', 'Description', 'Ward', 'X Coordinate', 'Y Coordinate',
    'Latitude', 'Longitude',
]
crimes = crimes.set_index('ID').drop(columns=unneeded_columns)

In [13]:
# Recompute the distinct 'Primary Type' labels from the reloaded frame.
crime_types = crimes.loc[:, 'Primary Type'].unique().tolist()

### Begin cleaning data

1. Remove NaN values
2. Standardize the crime type: lowercase the labels and merge the non-criminal variants into a single label
3. Standardize the location description

There are actually no NaN values that need to be removed! Government data is reliable, I guess. The only NaN values are in the location column, but we still know the district for those rows, so this is fine as long as we don't end up needing the location later on.

In [14]:
# Collect every row that has a missing value in at least one column.
has_missing = crimes.isna().any(axis='columns')
to_drop = crimes[has_missing]
to_drop;

In [15]:
def standardize_primary_type(string):
    """Map a raw 'Primary Type' label to its cleaned label.

    Parameters
    ----------
    string : object
        The raw label. Non-string values (e.g. NaN for a missing label) are
        returned unchanged instead of raising, mirroring standardize_location.

    Returns
    -------
    object
        'non-criminal' for any of the three non-criminal spellings, 'other'
        for 'OTHER OFFENSE', 'sexual assault' for 'CRIM SEXUAL ASSAULT', the
        lowercased label for any other string, or the input itself when it is
        not a string.
    """
    # Without this guard a missing value (float NaN) would fail on .lower().
    if not isinstance(string, str):
        return string

    renames = {
        # compile all non-criminal offenses into one label
        'NON-CRIMINAL (SUBJECT SPECIFIED)': 'non-criminal',
        'NON - CRIMINAL': 'non-criminal',
        'NON-CRIMINAL': 'non-criminal',
        'OTHER OFFENSE': 'other',
        # shorter and easier to read than "crim sexual assault"
        'CRIM SEXUAL ASSAULT': 'sexual assault',
    }
    # Every other label is simply lowercased.
    return renames.get(string, string.lower())
        

In [16]:
def standardize_location(string):
    """Clean a raw location description.

    Values whose type is exactly ``str`` are lowercased, and any description
    containing 'airport' collapses to the single label 'airport'. Anything
    else (e.g. NaN) is returned unchanged.
    """
    if type(string) is not str:
        return string
    lowered = string.lower()
    return 'airport' if 'airport' in lowered else lowered

In [17]:
# Run both cleaners element-wise, then store the results: the crime labels are
# overwritten in place and the cleaned locations go into a new 'location' column.
cleaned_types = crimes['Primary Type'].apply(standardize_primary_type)
cleaned_locations = crimes['Location Description'].apply(standardize_location)
crimes['Primary Type'] = cleaned_types
crimes['location'] = cleaned_locations

In [18]:
# Drop the raw columns that are no longer needed. errors='ignore' makes this
# cell safe to re-run: once the columns are gone, a second run is a no-op
# instead of raising KeyError.
crimes = crimes.drop(columns=['Location Description', 'Community Area'], errors='ignore')

In [19]:
# Rename the 'Location' column to 'Coordinates'.
column_renames = {'Location': 'Coordinates'}
crimes = crimes.rename(column_renames, axis='columns')

In [20]:
# Raw (upper-case) labels of the violent crime types.
# NOTE: by this point 'Primary Type' has been passed through
# standardize_primary_type, so the column holds lowercase labels
# (e.g. 'sexual assault' rather than 'CRIM SEXUAL ASSAULT').
violents = [
    'BATTERY',
    'ASSAULT',
    'CRIM SEXUAL ASSAULT',
    'ARSON',
    'HOMICIDE',
]

In [21]:
# Keep only the rows whose cleaned (lowercase) 'Primary Type' is one of the
# violent crime categories.
violent_labels = ['battery', 'assault', 'sexual assault', 'arson', 'homicide']
violent_crimes = crimes[crimes['Primary Type'].isin(violent_labels)]