In [1]:
import pandas as pd
import os
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.axes as axs

sns.set()


import scipy.stats as stats


# %load_ext nb_black
%load_ext lab_black

### gen purpose functions

In [2]:
def mis_set_num(set_):
    """obtain missing numeric values in set

    Args:
        set_ (set or list like object): set of numbers

    Returns:
        set: returns set of missing values in passed container
    """
    return sorted(set(range(set_[0], set_[-1])) - set(set_))


def convert_test_int(set_):
    """convert elemetnt to list

    Args:
        set_ (set or list like object): set of numbers

    Returns:
        set: returns warning if integer conversion fails
    """
    try:
        return [int(x) for x in set_]
    except:
        print("nan at position")


def date_to_datetime_input(df_):
    """generate SQL compliant date string input

    Args:
        df_ (DataFrame): input DataFrame
        datecol (series): datetime compliant pandas series

    Returns:
        series: return SQL compiant
    """
    return df_["date"].apply(
        lambda x: df_.datecol.str.replace("/", "-").str.split()[:2]
    )

## Import the data using the relvant specifications

In [3]:
# REad the data
df = pd.read_csv(
    "{}/Data/crimes.csv".format("/".join(os.getcwd().split("/")[:-1])),
    delimiter="\t",
    dtype={
        "CrimeID": str,
        "beat": str,
        "iucr": str,
        "district": str,
        "casenumber": str,
        "date": str,
        "block": str,
        "location": str,
    },
)
None

  df = pd.read_csv(


### Missing values processing 

In [4]:
df.head()

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location
0,1,JA118920,01/01/2017 01:00:00 AM,023XX N WAYNE AVE,1156,DECEPTIVE PRACTICE,ATTEMPT - FINANCIAL IDENTITY THEFT,RESIDENCE,False,1811,18,41.924198,-87.662071,"(41.924196311, -87.662069166)"
1,2,JA100052,01/01/2017 01:00:00 AM,090XX S BLACKSTONE AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,413,4,41.730709,-87.587997,"(41.730707788, -87.587998843)"
2,3,JA100367,01/01/2017 01:00:00 AM,049XX N WHIPPLE ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,1713,17,41.970776,-87.704872,"(41.97077465, -87.704873143)"
3,4,JA100092,01/01/2017 01:00:00 AM,023XX S CALIFORNIA AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,False,1034,10,41.848907,-87.695366,"(41.848905706, -87.695364781)"
4,5,JA494707,01/01/2017 01:00:00 AM,032XX N MILWAUKEE AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,APARTMENT,False,1732,17,41.940193,-87.725647,"(41.940192486, -87.725649965)"


### Initial misisng values check

In [5]:
# simple missing values check
df.isna().sum()

CrimeID                    0
casenumber                 3
date                       2
block                      0
iucr                       3
primarytype                3
description                3
locationdescription     3434
arrest                     3
beat                       3
district                   3
latitude               11583
longitude              11581
location               11581
dtype: int64

remove all instances that do not have a casenumber - resulting in 3 instances removed from the database


In [6]:
# those with missing casenumbers are unworkable
isna_index = df[df.casenumber.isna()].index
df.drop(df[df.casenumber.isna()].index, axis=0, inplace=True)

In [7]:
df.isna().sum()

CrimeID                    0
casenumber                 0
date                       0
block                      0
iucr                       0
primarytype                0
description                0
locationdescription     3431
arrest                     0
beat                       0
district                   0
latitude               11580
longitude              11580
location               11580
dtype: int64

In [8]:
df.loc[df.arrest == "true", "arrest"] = True
df.loc[df.arrest == "false", "arrest"] = False

### search for hidden NA or uncommen NaN values not detected by python
Simple Trick: Search for dataobjects by column which do not correspond with the correct data object in ths first place 

In [9]:
# find search set space for missing values
print(mis_set_num(df.CrimeID.astype(int).values))

missing_values = mis_set_num(df.CrimeID.astype(int).values)

[30000, 98000, 444444]


In [10]:
isna_index

Int64Index([30038, 98038, 444482], dtype='int64')

In [11]:
# find non vonvertables in CrimeId
convert_test_int(df.CrimeID.values)
None

the dropped indexes correspond with the removed observarions for missing case numbers

In [12]:
df[df.index.isin(missing_values)]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location
30000,29962,JB124174,01/16/2018 09:00:00 AM,007XX N SPAULDING AVE,820,THEFT,$500 AND UNDER,RESIDENCE,False,1121,11,41.894493,-87.708969,"(41.894493874, -87.708967553)"
98000,97962,JA162833,02/22/2017 03:52:00 AM,027XX W 42ND ST,1020,ARSON,BY FIRE,VEHICLE NON-COMMERCIAL,False,921,9,41.817398,-87.692429,"(41.817398864, -87.692427281)"
444444,444406,JB389583,08/11/2018 11:20:00 PM,011XX N LARAMIE AVE,1365,CRIMINAL TRESPASS,TO RESIDENCE,RESIDENCE,False,1524,15,41.900894,-87.755745,"(41.900894005, -87.755743288)"


In [None]:
df.CrimeID = df.CrimeID.astype("str", copy=True, errors="raise")

<IPython.core.display.Javascript object>

### Duplicates

In [13]:
df.drop_duplicates(keep="first", inplace=True, ignore_index=True)

dropping duplicates repoves 149 instances

In [14]:
# doublechecking confirms
df.drop_duplicates(["casenumber"]).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730748 entries, 0 to 730747
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   CrimeID              730748 non-null  object 
 1   casenumber           730748 non-null  object 
 2   date                 730748 non-null  object 
 3   block                730748 non-null  object 
 4   iucr                 730748 non-null  object 
 5   primarytype          730748 non-null  object 
 6   description          730748 non-null  object 
 7   locationdescription  727321 non-null  object 
 8   arrest               730748 non-null  object 
 9   beat                 730748 non-null  object 
 10  district             730748 non-null  object 
 11  latitude             719169 non-null  float64
 12  longitude            719169 non-null  float64
 13  location             719169 non-null  object 
dtypes: float64(2), object(12)
memory usage: 83.6+ MB


In [15]:
df.drop_duplicates(["CrimeID"]).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730748 entries, 0 to 730747
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   CrimeID              730748 non-null  object 
 1   casenumber           730748 non-null  object 
 2   date                 730748 non-null  object 
 3   block                730748 non-null  object 
 4   iucr                 730748 non-null  object 
 5   primarytype          730748 non-null  object 
 6   description          730748 non-null  object 
 7   locationdescription  727321 non-null  object 
 8   arrest               730748 non-null  object 
 9   beat                 730748 non-null  object 
 10  district             730748 non-null  object 
 11  latitude             719169 non-null  float64
 12  longitude            719169 non-null  float64
 13  location             719169 non-null  object 
dtypes: float64(2), object(12)
memory usage: 83.6+ MB


Conclusion: no duplicates in CrimeId and Casenumber

### Beat & District analysis

See whether beat and district have the right amount of entities (beat should have length 4; district length 2)
Adjust for "0" preceeding



In [16]:
# start with districts
# df[df.district.str.len() < 2]

In [17]:
# Continue with districts
# df[df.beat.str.len() < 4]

In [18]:
# alternatively:
# 1. insert 0s in the front of each beat and district where length is smaller 4 or smaller 2 respectively
df["beat_"] = df["beat"].apply(lambda x: "{0:0>4}".format(x))
df["district_"] = df["district"].apply(lambda x: "{0:0>2}".format(x))

In [19]:
# 2) now check where the first two digits of beat_ does NOT comply with the two digits of district_
df[df.beat_.str[:2] != df.district_]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,beat_,district_
5067,5068,JA308088,01/03/2017 12:01:00 AM,021XX W GIDDINGS ST,1582,OFFENSE INVOLVING CHILDREN,CHILD PORNOGRAPHY,RESIDENCE,False,1911,1,41.885700,-87.642029,"(41.885702079, -87.642031644)",1911,01
6264,6265,JD102780,01/03/2020 11:10:00 AM,097XX W KENNEDY EXPY OB,0486,BATTERY,DOMESTIC BATTERY SIMPLE,HIGHWAY/EXPRESSWAY,True,1654,31,41.982494,-87.875580,"(41.98249319, -87.875577905)",1654,31
33045,33047,JE116040,01/17/2021 12:00:00 AM,005XX W OHARE ST,0530,ASSAULT,AGGRAVATED - OTHER DANGEROUS WEAPON,PARKING LOT / GARAGE (NON RESIDENTIAL),False,1653,31,41.965057,-87.879951,"(41.965057367, -87.879953326)",1653,31
59593,59595,JC288579,02/01/2017 12:00:00 AM,038XX S COTTAGE GROVE AVE,1754,OFFENSE INVOLVING CHILDREN,AGGRAVATED SEXUAL ASSAULT OF CHILD BY FAMILY M...,APARTMENT,False,212,3,41.783707,-87.594719,"(41.783707907, -87.594718494)",0212,03
61004,61006,JD429703,02/01/2020 12:00:00 AM,079XX S CHAMPLAIN AVE,1752,OFFENSE INVOLVING CHILDREN,AGGRAVATED CRIMINAL SEXUAL ABUSE BY FAMILY MEMBER,APARTMENT,False,624,11,41.870483,-87.719666,"(41.870481746, -87.719665179)",0624,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702686,702690,JB558892,12/16/2018 07:00:00 PM,026XX S DRAKE AVE,1754,OFFENSE INVOLVING CHILDREN,AGGRAVATED SEXUAL ASSAULT OF CHILD BY FAMILY M...,RESIDENCE,False,1032,31,41.968014,-87.818794,"(41.968013038, -87.818796103)",1032,31
707240,707244,JD464205,12/18/2020 05:00:00 AM,014XX W 72ND ST,0266,CRIMINAL SEXUAL ASSAULT,PREDATORY,RESIDENCE,True,734,6,41.754051,-87.659424,"(41.754052908, -87.659425103)",0734,06
721168,721172,JC152102,12/26/2018 12:13:00 PM,015XX E 87TH ST,1110,DECEPTIVE PRACTICE,BOGUS CHECK,SMALL RETAIL STORE,False,412,22,41.735855,-87.658737,"(41.73585412, -87.658739723)",0412,22
724281,724285,JB570460,12/28/2018 04:56:00 PM,049XX S KEDZIE AVE,0860,THEFT,RETAIL THEFT,SMALL RETAIL STORE,False,821,9,41.804962,-87.703888,"(41.804962024, -87.703884743)",0821,09


This returns another 79 porblematic instances

In [20]:
# those columns that contain problems will from now on be marked
df["marked"] = False
df.loc[df.index.isin(df[df.beat_.str[:2] != df.district_].index), "marked"] = True

In [21]:
# replace beat with beat_
df.beat = df.beat_
df.district = df.district_

df.drop(["beat_", "district_"], axis=1, inplace=True)

### Check date 

DATETIME - format: YYYY-MM-DD HH:MI:SS.
TIMESTAMP - format: YYYY-MM-DD HH:MI:SS.

Case number has a mandatory length of 8; 

See whether beat and district have the right amount of entities (beat should have length 4; district length 2)
Adjust for "0" preceeding

string is na: '', ' ',"N/A"

In [22]:
df.date

0         01/01/2017 01:00:00 AM
1         01/01/2017 01:00:00 AM
2         01/01/2017 01:00:00 AM
3         01/01/2017 01:00:00 AM
4         01/01/2017 01:00:00 AM
                   ...          
730743    12/31/2021 12:38:00 AM
730744    12/31/2021 12:41:00 AM
730745    12/31/2021 12:41:00 PM
730746    12/31/2021 12:49:00 AM
730747    12/31/2021 12:55:00 AM
Name: date, Length: 730748, dtype: object

In [23]:
# simple conversion
# now convert date
df["date_time"] = pd.to_datetime(df.date.astype(str), errors="coerce")

In [24]:
# date_to_datetime_input(df)

In [25]:
df[df["date_time"].isna()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 22 to 624028
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   CrimeID              15 non-null     object        
 1   casenumber           15 non-null     object        
 2   date                 15 non-null     object        
 3   block                15 non-null     object        
 4   iucr                 15 non-null     object        
 5   primarytype          15 non-null     object        
 6   description          15 non-null     object        
 7   locationdescription  14 non-null     object        
 8   arrest               15 non-null     object        
 9   beat                 15 non-null     object        
 10  district             15 non-null     object        
 11  latitude             14 non-null     float64       
 12  longitude            14 non-null     float64       
 13  location             14 non-null

an additional 16 entries do not contain dates; this is pivotal so they have to be removed

In [26]:
df.date_time_str = df.date_time.astype(str)

  df.date_time_str = df.date_time.astype(str)


### Outliers in locatioN!


Here z values will suffice 

In [27]:
# df["longitude"].apply(lambda x: stats.zscore(x, nan_policy="omit"))

In [28]:
df.describe()

Unnamed: 0,latitude,longitude
count,719169.0,719169.0
mean,41.843298,-87.670142
std,0.086724,0.059218
min,36.619446,-91.686569
25%,41.768051,-87.712669
50%,41.862217,-87.663658
75%,41.905334,-87.627686
max,42.022671,-87.524529


removing all observations which do not fall into the general range of 40 to 42.5 a

In [29]:
df.longitude_z = (df.longitude - df.longitude.mean()) / df.longitude.std()
df.latitude_z = (df.latitude - df.latitude.mean()) / df.latitude.std()

  df.longitude_z = (df.longitude - df.longitude.mean()) / df.longitude.std()
  df.latitude_z = (df.latitude - df.latitude.mean()) / df.latitude.std()


after some testing only one observation with the following information seemed to be wrong
in this case, the decision was made to make this isntance of lat lon not known.
36.619446	-91.686569

In [30]:
df.loc[df.latitude == 36.619446, "latitude"] = np.nan
df.loc[df.longitude == -91.686569, "longitude"] = np.nan

### Misc tests

In [31]:
df[~pd.notnull(df)]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time
0,,,,,,,,,,,,,,,,NaT
1,,,,,,,,,,,,,,,,NaT
2,,,,,,,,,,,,,,,,NaT
3,,,,,,,,,,,,,,,,NaT
4,,,,,,,,,,,,,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730743,,,,,,,,,,,,,,,,NaT
730744,,,,,,,,,,,,,,,,NaT
730745,,,,,,,,,,,,,,,,NaT
730746,,,,,,,,,,,,,,,,NaT


In [32]:
df.dropna(how="all")

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time
0,1,JA118920,01/01/2017 01:00:00 AM,023XX N WAYNE AVE,1156,DECEPTIVE PRACTICE,ATTEMPT - FINANCIAL IDENTITY THEFT,RESIDENCE,False,1811,18,41.924198,-87.662071,"(41.924196311, -87.662069166)",False,2017-01-01 01:00:00
1,2,JA100052,01/01/2017 01:00:00 AM,090XX S BLACKSTONE AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,0413,04,41.730709,-87.587997,"(41.730707788, -87.587998843)",False,2017-01-01 01:00:00
2,3,JA100367,01/01/2017 01:00:00 AM,049XX N WHIPPLE ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,1713,17,41.970776,-87.704872,"(41.97077465, -87.704873143)",False,2017-01-01 01:00:00
3,4,JA100092,01/01/2017 01:00:00 AM,023XX S CALIFORNIA AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,False,1034,10,41.848907,-87.695366,"(41.848905706, -87.695364781)",False,2017-01-01 01:00:00
4,5,JA494707,01/01/2017 01:00:00 AM,032XX N MILWAUKEE AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,APARTMENT,False,1732,17,41.940193,-87.725647,"(41.940192486, -87.725649965)",False,2017-01-01 01:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730743,730747,JE493505,12/31/2021 12:38:00 AM,005XX E 32ND ST,031A,ROBBERY,ARMED - HANDGUN,RESIDENCE - YARD (FRONT / BACK),False,0211,02,41.836071,-87.612816,"(41.83607107, -87.612813156)",False,2021-12-31 00:38:00
730744,730748,JE493511,12/31/2021 12:41:00 AM,038XX W 68TH PL,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,0833,08,41.768700,-87.718941,"(41.76870088, -87.718942599)",False,2021-12-31 00:41:00
730745,730749,JF100517,12/31/2021 12:41:00 PM,038XX W 83RD PL,0820,THEFT,$500 AND UNDER,RESIDENCE - PORCH / HALLWAY,False,0834,08,41.741333,-87.717537,"(41.741332605, -87.717536356)",False,2021-12-31 12:41:00
730746,730750,JF100660,12/31/2021 12:49:00 AM,075XX S SEELEY AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,RESIDENCE,False,0611,06,41.756420,-87.674522,"(41.756418241, -87.674522275)",False,2021-12-31 00:49:00


### length of each instance 

Case number has a mandatory length of 8; 

See whether beat and district have the right amount of entities (beat should have length 4; district length 2)
Adjust for "0" preceeding

string is na: '', ' ',"N/A"


relevant columns to incestigate:
casenumber - 8
iucr - 4
beat - 4
district - 2
date_time - 19

In [33]:
df[df.casenumber.str.len() != 8]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time
77887,77889,G647504,02/11/2017 06:32:00 PM,011XX W 51ST ST,110,HOMICIDE,FIRST DEGREE MURDER,STREET,False,933,9,41.80146,-87.654411,"(41.801458486, -87.654410134)",False,2017-02-11 18:32:00
115637,115640,F780368,03/03/2018 01:25:00 PM,035XX S RHODES AVE,110,HOMICIDE,FIRST DEGREE MURDER,APARTMENT,True,212,2,41.829651,-87.614449,"(41.829651528, -87.614451899)",False,2018-03-03 13:25:00
303817,303820,.JB299184,06/08/2018 10:00:00 PM,060XX N SHERIDAN RD,890,THEFT,FROM BUILDING,APARTMENT,False,2433,24,41.991562,-87.655434,"(41.991560549, -87.655432099)",False,2018-06-08 22:00:00
434689,434692,J382356,08/07/2017 01:00:00 PM,018XX S THROOP ST,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,1233,12,41.857353,-87.658859,"(41.857352518, -87.658859262)",False,2017-08-07 13:00:00
560953,560957,464266,10/05/2018 10:45:00 PM,118XX S PERRY AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,522,5,41.679008,-87.625145,"(41.679009135, -87.625147078)",False,2018-10-05 22:45:00
570996,571000,D439410,10/10/2018 07:00:00 AM,042XX W AUGUSTA BLVD,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,1111,11,41.898926,-87.731758,"(41.898926722, -87.731756818)",False,2018-10-10 07:00:00
642845,642849,G284899,11/15/2018 09:01:00 PM,014XX N CALIFORNIA AVE,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,1423,14,41.90731,-87.696991,"(41.907312308, -87.696987788)",False,2018-11-15 21:01:00


In [34]:
df[df.iucr.str.len() != 4]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time


In [35]:
# these were the same instances recored before
len(df[df.date_time.astype(str).str.len() != 19])

15

In [36]:
df.loc[
    df.index.isin(df[df.date_time.astype(str).str.len() != 19].index), "marked"
] = True
df[df["marked"] == True]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time
22,23,JB235763,,016XX W PRATT BLVD,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,APARTMENT,,2432,24,,,,True,NaT
122,123,JA100380,,078XX S MORGAN ST,0810,THEFT,OVER $500,RESIDENCE,,0612,06,41.751896,-87.648926,"(41.751896943, -87.648926093)",True,NaT
321,322,JA469768,,076XX S VERNON AVE,1122,DECEPTIVE PRACTICE,COUNTERFEIT CHECK,RESIDENCE,,0624,06,41.755615,-87.613823,"(41.755615044, -87.613820045)",True,NaT
1122,1123,JB346519,,080XX S KEDVALE AVE,0553,ASSAULT,AGGRAVATED PO: OTHER DANG WEAP,STREET,,0834,08,41.747280,-87.725410,"(41.747280312, -87.725413197)",True,NaT
1321,1322,JD348240,,078XX S WINCHESTER AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,,0611,06,41.751614,-87.671959,"(41.751611954, -87.671958451)",True,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702686,702690,JB558892,12/16/2018 07:00:00 PM,026XX S DRAKE AVE,1754,OFFENSE INVOLVING CHILDREN,AGGRAVATED SEXUAL ASSAULT OF CHILD BY FAMILY M...,RESIDENCE,False,1032,31,41.968014,-87.818794,"(41.968013038, -87.818796103)",True,2018-12-16 19:00:00
707240,707244,JD464205,12/18/2020 05:00:00 AM,014XX W 72ND ST,0266,CRIMINAL SEXUAL ASSAULT,PREDATORY,RESIDENCE,True,0734,06,41.754051,-87.659424,"(41.754052908, -87.659425103)",True,2020-12-18 05:00:00
721168,721172,JC152102,12/26/2018 12:13:00 PM,015XX E 87TH ST,1110,DECEPTIVE PRACTICE,BOGUS CHECK,SMALL RETAIL STORE,False,0412,22,41.735855,-87.658737,"(41.73585412, -87.658739723)",True,2018-12-26 12:13:00
724281,724285,JB570460,12/28/2018 04:56:00 PM,049XX S KEDZIE AVE,0860,THEFT,RETAIL THEFT,SMALL RETAIL STORE,False,0821,09,41.804962,-87.703888,"(41.804962024, -87.703884743)",True,2018-12-28 16:56:00


In [37]:
df[df.beat.str.len() != 4]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time


In [38]:
df[df.district.str.len() != 2]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time


#### arrests

    

In [47]:
df[~((df.arrest == True) | (df.arrest == False))].head()

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time
22,23,JB235763,,016XX W PRATT BLVD,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,APARTMENT,,2432,24,,,,True,NaT
122,123,JA100380,,078XX S MORGAN ST,810,THEFT,OVER $500,RESIDENCE,,612,6,41.751896,-87.648926,"(41.751896943, -87.648926093)",True,NaT
321,322,JA469768,,076XX S VERNON AVE,1122,DECEPTIVE PRACTICE,COUNTERFEIT CHECK,RESIDENCE,,624,6,41.755615,-87.613823,"(41.755615044, -87.613820045)",True,NaT
1122,1123,JB346519,,080XX S KEDVALE AVE,553,ASSAULT,AGGRAVATED PO: OTHER DANG WEAP,STREET,,834,8,41.74728,-87.72541,"(41.747280312, -87.725413197)",True,NaT
1321,1322,JD348240,,078XX S WINCHESTER AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,,611,6,41.751614,-87.671959,"(41.751611954, -87.671958451)",True,NaT


In [None]:
df.loc[
    df.index.isin(df[~((df.arrest == True) | (df.arrest == False))].index), "marked"
] = True

### Reset the index & Select df

In [55]:
df.date = df.date_time
df.drop("date_time", axis=0)

KeyError: "['date_time'] not found in axis"

In [57]:
df_final = df.loc[
    :,
    [
        "CrimeID",
        "date",
        "iucr",
        "primarytype",
        "description",
        "locationdescription",
        "arrest",
        "latitude",
        "longitude",
        "location",
        "marked",
    ],
]

In [59]:
df_final.reset_index(inplace=True)

In [60]:
# export this file and you are done