In [1]:
import pandas as pd
import os
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.axes as axs

sns.set()


import scipy.stats as stats


# %load_ext nb_black
%load_ext lab_black

### gen purpose functions

In [2]:
def mis_set_num(set_):
    """obtain missing numeric values in set

    Args:
        set_ (set or list like object): set of numbers

    Returns:
        set: returns set of missing values in passed container
    """
    return sorted(set(range(set_[0], set_[-1])) - set(set_))


def convert_test_int(set_):
    """convert elemetnt to list

    Args:
        set_ (set or list like object): set of numbers

    Returns:
        set: returns warning if integer conversion fails
    """
    try:
        return [int(x) for x in set_]
    except:
        print("nan at position")


def date_to_datetime_input(df_):
    """generate SQL compliant date string input

    Args:
        df_ (DataFrame): input DataFrame
        datecol (series): datetime compliant pandas series

    Returns:
        series: return SQL compiant
    """
    return df_["date"].apply(
        lambda x: df_.datecol.str.replace("/", "-").str.split()[:2]
    )

## Import the data using the relvant specifications

In [3]:
# REad the data
df = pd.read_csv(
    "{}/Data/crimes.csv".format("/".join(os.getcwd().split("/")[:-1])),
    delimiter="\t",
    dtype={
        "CrimeID": str,
        "beat": str,
        "iucr": str,
        "district": str,
        "casenumber": str,
        "date": str,
        "block": str,
        "location": str,
    },
)
None

  df = pd.read_csv(


### Missing values processing 

In [4]:
df.head()

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location
0,1,JA118920,01/01/2017 01:00:00 AM,023XX N WAYNE AVE,1156,DECEPTIVE PRACTICE,ATTEMPT - FINANCIAL IDENTITY THEFT,RESIDENCE,False,1811,18,41.924198,-87.662071,"(41.924196311, -87.662069166)"
1,2,JA100052,01/01/2017 01:00:00 AM,090XX S BLACKSTONE AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,413,4,41.730709,-87.587997,"(41.730707788, -87.587998843)"
2,3,JA100367,01/01/2017 01:00:00 AM,049XX N WHIPPLE ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,1713,17,41.970776,-87.704872,"(41.97077465, -87.704873143)"
3,4,JA100092,01/01/2017 01:00:00 AM,023XX S CALIFORNIA AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,False,1034,10,41.848907,-87.695366,"(41.848905706, -87.695364781)"
4,5,JA494707,01/01/2017 01:00:00 AM,032XX N MILWAUKEE AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,APARTMENT,False,1732,17,41.940193,-87.725647,"(41.940192486, -87.725649965)"


### Duplicates

In [5]:
# dropping duplicates repoves 149 instances

df.drop_duplicates(keep="first", inplace=True, ignore_index=True)

In [6]:
# doublechecking confirms
df.drop_duplicates(["casenumber"]).info()


df.drop_duplicates(["CrimeID"]).info()

# Conclusion: no duplicates in CrimeId and Casenumber

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730749 entries, 0 to 730750
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   CrimeID              730749 non-null  object 
 1   casenumber           730748 non-null  object 
 2   date                 730749 non-null  object 
 3   block                730749 non-null  object 
 4   iucr                 730748 non-null  object 
 5   primarytype          730748 non-null  object 
 6   description          730748 non-null  object 
 7   locationdescription  727321 non-null  object 
 8   arrest               730748 non-null  object 
 9   beat                 730748 non-null  object 
 10  district             730748 non-null  object 
 11  latitude             719169 non-null  float64
 12  longitude            719169 non-null  float64
 13  location             719169 non-null  object 
dtypes: float64(2), object(12)
memory usage: 83.6+ MB
<class 'pandas.core

### Initial misisng values check

In [7]:
# simple missing values check
df.isna().sum()

CrimeID                    0
casenumber                 3
date                       2
block                      0
iucr                       3
primarytype                3
description                3
locationdescription     3430
arrest                     3
beat                       3
district                   3
latitude               11582
longitude              11580
location               11580
dtype: int64

remove all instances that do not have a casenumber - resulting in 3 instances removed from the database


In [8]:
# those with missing casenumbers are unworkable
isna_index = df[df.casenumber.isna()].index
df.drop(df[df.casenumber.isna()].index, axis=0, inplace=True)

In [9]:
df.isna().sum()

CrimeID                    0
casenumber                 0
date                       0
block                      0
iucr                       0
primarytype                0
description                0
locationdescription     3427
arrest                     0
beat                       0
district                   0
latitude               11579
longitude              11579
location               11579
dtype: int64

In [10]:
df.loc[df.arrest == "true", "arrest"] = True
df.loc[df.arrest == "false", "arrest"] = False

### search for hidden NA or uncommen NaN values not detected by python
Simple Trick: Search for dataobjects by column which do not correspond with the correct data object in ths first place 

In [11]:
# find search set space for missing values
print(mis_set_num(df.CrimeID.astype(int).values))

missing_values = mis_set_num(df.CrimeID.astype(int).values)

[30000, 98000, 444444]


In [12]:
isna_index

Int64Index([29999, 97999, 444443], dtype='int64')

In [13]:
# find non vonvertables in CrimeId
convert_test_int(df.CrimeID.values)
None

the dropped indexes correspond with the removed observarions for missing case numbers

In [14]:
df[df.index.isin(missing_values)]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location
30000,30001,JB117766,01/16/2018 10:15:00 AM,048XX W LEXINGTON ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,1533,15,41.871147,-87.746452,"(41.871148644, -87.746450344)"
98000,98001,JA163702,02/22/2017 05:00:00 PM,095XX S STATE ST,530,ASSAULT,AGGRAVATED: OTHER DANG WEAPON,STREET,False,511,5,41.72118,-87.623718,"(41.721180629, -87.623717187)"
444444,444445,JB420709,08/11/2018 12:00:00 PM,031XX N ORIOLE AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,RESIDENCE,False,2511,25,41.936714,-87.816475,"(41.936715753, -87.816471337)"


In [15]:
df.CrimeID = df.CrimeID.astype("str", copy=True, errors="raise")

### Beat & District analysis

See whether beat and district have the right amount of entities (beat should have length 4; district length 2)
Adjust for "0" preceeding



In [16]:
# start with districts
# df[df.district.str.len() < 2]

In [17]:
# Continue with districts
# df[df.beat.str.len() < 4]

In [18]:
# alternatively:
# 1. insert 0s in the front of each beat and district where length is smaller 4 or smaller 2 respectively
df["beat_"] = df["beat"].apply(lambda x: "{0:0>4}".format(x))
df["district_"] = df["district"].apply(lambda x: "{0:0>2}".format(x))

In [19]:
# 2) now check where the first two digits of beat_ does NOT comply with the two digits of district_
df[df.beat_.str[:2] != df.district_]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,beat_,district_
5067,5068,JA308088,01/03/2017 12:01:00 AM,021XX W GIDDINGS ST,1582,OFFENSE INVOLVING CHILDREN,CHILD PORNOGRAPHY,RESIDENCE,False,1911,1,41.885700,-87.642029,"(41.885702079, -87.642031644)",1911,01
6264,6265,JD102780,01/03/2020 11:10:00 AM,097XX W KENNEDY EXPY OB,0486,BATTERY,DOMESTIC BATTERY SIMPLE,HIGHWAY/EXPRESSWAY,True,1654,31,41.982494,-87.875580,"(41.98249319, -87.875577905)",1654,31
33046,33047,JE116040,01/17/2021 12:00:00 AM,005XX W OHARE ST,0530,ASSAULT,AGGRAVATED - OTHER DANGEROUS WEAPON,PARKING LOT / GARAGE (NON RESIDENTIAL),False,1653,31,41.965057,-87.879951,"(41.965057367, -87.879953326)",1653,31
59594,59595,JC288579,02/01/2017 12:00:00 AM,038XX S COTTAGE GROVE AVE,1754,OFFENSE INVOLVING CHILDREN,AGGRAVATED SEXUAL ASSAULT OF CHILD BY FAMILY M...,APARTMENT,False,212,3,41.783707,-87.594719,"(41.783707907, -87.594718494)",0212,03
61005,61006,JD429703,02/01/2020 12:00:00 AM,079XX S CHAMPLAIN AVE,1752,OFFENSE INVOLVING CHILDREN,AGGRAVATED CRIMINAL SEXUAL ABUSE BY FAMILY MEMBER,APARTMENT,False,624,11,41.870483,-87.719666,"(41.870481746, -87.719665179)",0624,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702689,702690,JB558892,12/16/2018 07:00:00 PM,026XX S DRAKE AVE,1754,OFFENSE INVOLVING CHILDREN,AGGRAVATED SEXUAL ASSAULT OF CHILD BY FAMILY M...,RESIDENCE,False,1032,31,41.968014,-87.818794,"(41.968013038, -87.818796103)",1032,31
707243,707244,JD464205,12/18/2020 05:00:00 AM,014XX W 72ND ST,0266,CRIMINAL SEXUAL ASSAULT,PREDATORY,RESIDENCE,True,734,6,41.754051,-87.659424,"(41.754052908, -87.659425103)",0734,06
721171,721172,JC152102,12/26/2018 12:13:00 PM,015XX E 87TH ST,1110,DECEPTIVE PRACTICE,BOGUS CHECK,SMALL RETAIL STORE,False,412,22,41.735855,-87.658737,"(41.73585412, -87.658739723)",0412,22
724284,724285,JB570460,12/28/2018 04:56:00 PM,049XX S KEDZIE AVE,0860,THEFT,RETAIL THEFT,SMALL RETAIL STORE,False,821,9,41.804962,-87.703888,"(41.804962024, -87.703884743)",0821,09


This returns another 79 porblematic instances

In [20]:
# those columns that contain problems will from now on be marked
df["marked"] = False
df.loc[df.index.isin(df[df.beat_.str[:2] != df.district_].index), "marked"] = True

In [21]:
# replace beat with beat_
df.beat = df.beat_
df.district = df.district_

df.drop(["beat_", "district_"], axis=1, inplace=True)

### Check date 

DATETIME - format: YYYY-MM-DD HH:MI:SS.
TIMESTAMP - format: YYYY-MM-DD HH:MI:SS.

Case number has a mandatory length of 8; 

See whether beat and district have the right amount of entities (beat should have length 4; district length 2)
Adjust for "0" preceeding

string is na: '', ' ',"N/A"

In [22]:
# simple conversion
# now convert date
df["date_time"] = pd.to_datetime(df.date.astype(str), errors="coerce")

In [23]:
# date_to_datetime_input(df)

In [24]:
df[df["date_time"].isna()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 22 to 624031
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   CrimeID              15 non-null     object        
 1   casenumber           15 non-null     object        
 2   date                 15 non-null     object        
 3   block                15 non-null     object        
 4   iucr                 15 non-null     object        
 5   primarytype          15 non-null     object        
 6   description          15 non-null     object        
 7   locationdescription  14 non-null     object        
 8   arrest               15 non-null     object        
 9   beat                 15 non-null     object        
 10  district             15 non-null     object        
 11  latitude             14 non-null     float64       
 12  longitude            14 non-null     float64       
 13  location             14 non-null

an additional 16 entries do not contain dates; this is pivotal so they have to be removed

In [25]:
df["date_time_str"] = df.date_time.astype(str)

In [26]:
df.loc[df.index.isin(df[df.date_time.isnull()].index), "marked"] = True

### Outliers in locatioN!


Here z values will suffice 

In [27]:
# df["longitude"].apply(lambda x: stats.zscore(x, nan_policy="omit"))

In [28]:
df.describe()

Unnamed: 0,latitude,longitude
count,719169.0,719169.0
mean,41.843298,-87.670142
std,0.086724,0.059218
min,36.619446,-91.686569
25%,41.768051,-87.712669
50%,41.862217,-87.663658
75%,41.905334,-87.627686
max,42.022671,-87.524529


removing all observations which do not fall into the general range of 40 to 42.5 a

In [29]:
df.longitude_z = (df.longitude - df.longitude.mean()) / df.longitude.std()
df.latitude_z = (df.latitude - df.latitude.mean()) / df.latitude.std()

  df.longitude_z = (df.longitude - df.longitude.mean()) / df.longitude.std()
  df.latitude_z = (df.latitude - df.latitude.mean()) / df.latitude.std()


after some testing only one observation with the following information seemed to be wrong
in this case, the decision was made to make this isntance of lat lon not known.
36.619446	-91.686569

In [30]:
# marked_unimportant
df["marked_u"] = False
df.loc[df.index.isin(df[df.latitude == 36.619446].index), "marked_u"] = True

In [31]:
df.loc[df.latitude == 36.619446, "latitude"] = np.nan
df.loc[df.longitude == -91.686569, "longitude"] = np.nan

### Misc tests

In [32]:
df.dropna(how="all")

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time,date_time_str,marked_u
0,1,JA118920,01/01/2017 01:00:00 AM,023XX N WAYNE AVE,1156,DECEPTIVE PRACTICE,ATTEMPT - FINANCIAL IDENTITY THEFT,RESIDENCE,False,1811,18,41.924198,-87.662071,"(41.924196311, -87.662069166)",False,2017-01-01 01:00:00,2017-01-01 01:00:00,False
1,2,JA100052,01/01/2017 01:00:00 AM,090XX S BLACKSTONE AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,0413,04,41.730709,-87.587997,"(41.730707788, -87.587998843)",False,2017-01-01 01:00:00,2017-01-01 01:00:00,False
2,3,JA100367,01/01/2017 01:00:00 AM,049XX N WHIPPLE ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,1713,17,41.970776,-87.704872,"(41.97077465, -87.704873143)",False,2017-01-01 01:00:00,2017-01-01 01:00:00,False
3,4,JA100092,01/01/2017 01:00:00 AM,023XX S CALIFORNIA AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,False,1034,10,41.848907,-87.695366,"(41.848905706, -87.695364781)",False,2017-01-01 01:00:00,2017-01-01 01:00:00,False
4,5,JA494707,01/01/2017 01:00:00 AM,032XX N MILWAUKEE AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,APARTMENT,False,1732,17,41.940193,-87.725647,"(41.940192486, -87.725649965)",False,2017-01-01 01:00:00,2017-01-01 01:00:00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730746,730747,JE493505,12/31/2021 12:38:00 AM,005XX E 32ND ST,031A,ROBBERY,ARMED - HANDGUN,RESIDENCE - YARD (FRONT / BACK),False,0211,02,41.836071,-87.612816,"(41.83607107, -87.612813156)",False,2021-12-31 00:38:00,2021-12-31 00:38:00,False
730747,730748,JE493511,12/31/2021 12:41:00 AM,038XX W 68TH PL,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,0833,08,41.768700,-87.718941,"(41.76870088, -87.718942599)",False,2021-12-31 00:41:00,2021-12-31 00:41:00,False
730748,730749,JF100517,12/31/2021 12:41:00 PM,038XX W 83RD PL,0820,THEFT,$500 AND UNDER,RESIDENCE - PORCH / HALLWAY,False,0834,08,41.741333,-87.717537,"(41.741332605, -87.717536356)",False,2021-12-31 12:41:00,2021-12-31 12:41:00,False
730749,730750,JF100660,12/31/2021 12:49:00 AM,075XX S SEELEY AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,RESIDENCE,False,0611,06,41.756420,-87.674522,"(41.756418241, -87.674522275)",False,2021-12-31 00:49:00,2021-12-31 00:49:00,False


### length of each instance 

Case number has a mandatory length of 8; 

See whether beat and district have the right amount of entities (beat should have length 4; district length 2)
Adjust for "0" preceeding

string is na: '', ' ',"N/A"


relevant columns to incestigate:
casenumber - 8
iucr - 4
beat - 4
district - 2
date_time - 19

In [33]:
df[df.casenumber.str.len() != 8]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time,date_time_str,marked_u
77888,77889,G647504,02/11/2017 06:32:00 PM,011XX W 51ST ST,110,HOMICIDE,FIRST DEGREE MURDER,STREET,False,933,9,41.80146,-87.654411,"(41.801458486, -87.654410134)",False,2017-02-11 18:32:00,2017-02-11 18:32:00,False
115639,115640,F780368,03/03/2018 01:25:00 PM,035XX S RHODES AVE,110,HOMICIDE,FIRST DEGREE MURDER,APARTMENT,True,212,2,41.829651,-87.614449,"(41.829651528, -87.614451899)",False,2018-03-03 13:25:00,2018-03-03 13:25:00,False
303819,303820,.JB299184,06/08/2018 10:00:00 PM,060XX N SHERIDAN RD,890,THEFT,FROM BUILDING,APARTMENT,False,2433,24,41.991562,-87.655434,"(41.991560549, -87.655432099)",False,2018-06-08 22:00:00,2018-06-08 22:00:00,False
434691,434692,J382356,08/07/2017 01:00:00 PM,018XX S THROOP ST,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,1233,12,41.857353,-87.658859,"(41.857352518, -87.658859262)",False,2017-08-07 13:00:00,2017-08-07 13:00:00,False
560956,560957,464266,10/05/2018 10:45:00 PM,118XX S PERRY AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,522,5,41.679008,-87.625145,"(41.679009135, -87.625147078)",False,2018-10-05 22:45:00,2018-10-05 22:45:00,False
570999,571000,D439410,10/10/2018 07:00:00 AM,042XX W AUGUSTA BLVD,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,1111,11,41.898926,-87.731758,"(41.898926722, -87.731756818)",False,2018-10-10 07:00:00,2018-10-10 07:00:00,False
642848,642849,G284899,11/15/2018 09:01:00 PM,014XX N CALIFORNIA AVE,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,1423,14,41.90731,-87.696991,"(41.907312308, -87.696987788)",False,2018-11-15 21:01:00,2018-11-15 21:01:00,False


In [34]:
df.loc[df.index.isin(df[df.casenumber.str.len() != 8].index), "marked_u"] = True

In [35]:
df[df.iucr.str.len() != 4]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time,date_time_str,marked_u


In [36]:
# these were the same instances recored before
len(df[df.date_time.astype(str).str.len() != 19])

15

In [37]:
# df.loc[
#     df.index.isin(df[df.date_time.astype(str).str.len() != 19].index), "marked"
# ] = True
# df[df["marked"] == True]

In [38]:
df[df.beat.str.len() != 4]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time,date_time_str,marked_u


In [39]:
df[df.district.str.len() != 2]

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time,date_time_str,marked_u


#### arrests

    

In [40]:
df[~((df.arrest == True) | (df.arrest == False))].head()

Unnamed: 0,CrimeID,casenumber,date,block,iucr,primarytype,description,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time,date_time_str,marked_u
22,23,JB235763,,016XX W PRATT BLVD,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,APARTMENT,,2432,24,,,,True,NaT,NaT,False
122,123,JA100380,,078XX S MORGAN ST,810,THEFT,OVER $500,RESIDENCE,,612,6,41.751896,-87.648926,"(41.751896943, -87.648926093)",True,NaT,NaT,False
321,322,JA469768,,076XX S VERNON AVE,1122,DECEPTIVE PRACTICE,COUNTERFEIT CHECK,RESIDENCE,,624,6,41.755615,-87.613823,"(41.755615044, -87.613820045)",True,NaT,NaT,False
1122,1123,JB346519,,080XX S KEDVALE AVE,553,ASSAULT,AGGRAVATED PO: OTHER DANG WEAP,STREET,,834,8,41.74728,-87.72541,"(41.747280312, -87.725413197)",True,NaT,NaT,False
1321,1322,JD348240,,078XX S WINCHESTER AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,,611,6,41.751614,-87.671959,"(41.751611954, -87.671958451)",True,NaT,NaT,False


In [41]:
df.loc[
    df.index.isin(df[~((df.arrest == True) | (df.arrest == False))].index), "marked"
] = True

### repair crimetype overall    in terms of duplciation

In [150]:
# get all necessary iucr
df_crimetype = df.loc[:, ["iucr", "primarytype", "description"]].drop_duplicates()


# fixing the IUCR
# all instances in the following DF have to be unionized,
df_crimetype[
    df_crimetype.index.isin(df_crimetype.drop_duplicates(subset=["iucr"]).index)
    == False
]

# as such: we drop duplicates for each IUCR;
# and then replace all values of the IUCRS with the correct FIRST primary description
# do it via merging (right join)
df_iucr = df_crimetype.drop_duplicates(subset=["iucr"], keep="first")

In [151]:
# drop the primary and secondary type
df.drop(["primarytype", "description"], axis=1, inplace=True)

In [155]:
# right merge on IUCR
df = pd.merge(left=df, right=df_iucr, how="right", on="iucr")

In [156]:
# it works
df.drop_duplicates(subset=["iucr"])

Unnamed: 0,CrimeID,casenumber,date,block,iucr,locationdescription,arrest,beat,district,latitude,longitude,location,marked,date_time,date_time_str,marked_u,primarytype,description
0,1,JA118920,01/01/2017 01:00:00 AM,023XX N WAYNE AVE,1156,RESIDENCE,False,1811,18,41.924198,-87.662071,"(41.924196311, -87.662069166)",False,2017-01-01 01:00:00,2017-01-01 01:00:00,False,DECEPTIVE PRACTICE,ATTEMPT - FINANCIAL IDENTITY THEFT
1786,2,JA100052,01/01/2017 01:00:00 AM,090XX S BLACKSTONE AVE,0486,RESIDENCE,True,0413,04,41.730709,-87.587997,"(41.730707788, -87.587998843)",False,2017-01-01 01:00:00,2017-01-01 01:00:00,False,BATTERY,DOMESTIC BATTERY SIMPLE
68915,5,JA494707,01/01/2017 01:00:00 AM,032XX N MILWAUKEE AVE,1153,APARTMENT,False,1732,17,41.940193,-87.725647,"(41.940192486, -87.725649965)",False,2017-01-01 01:00:00,2017-01-01 01:00:00,False,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300
82397,6,JA102612,01/01/2017 01:00:00 AM,047XX S VINCENNES AVE,1320,STREET,False,0223,02,41.808472,-87.613853,"(41.80847026, -87.613856126)",False,2017-01-01 01:00:00,2017-01-01 01:00:00,False,CRIMINAL DAMAGE,TO VEHICLE
119193,7,JC456595,01/01/2017 01:00:00 AM,015XX W 19TH ST,0281,APARTMENT,False,1235,12,,,,False,2017-01-01 01:00:00,2017-01-01 01:00:00,False,CRIM SEXUAL ASSAULT,NON-AGGRAVATED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730743,541301,JE387420,09/25/2021 10:31:00 PM,019XX W OGDEN AVE,0480,HOSPITAL BUILDING / GROUNDS,False,1231,12,41.873219,-87.675865,"(41.873219626, -87.675868666)",False,2021-09-25 22:31:00,2021-09-25 22:31:00,False,BATTERY,AGGRAVATED PROTECTED EMPLOYEE - HANDGUN
730744,646493,JB519507,11/17/2017 12:01:00 AM,064XX S WINCHESTER AVE,1531,RESIDENCE,True,0726,07,,,,False,2017-11-17 00:01:00,2017-11-17 00:01:00,False,PROSTITUTION,JUVENILE PIMPING
730745,647606,JD434451,11/17/2020 11:52:00 PM,032XX W LAKE ST,0510,VEHICLE NON-COMMERCIAL,False,1123,11,41.884388,-87.706863,"(41.884389103, -87.706866213)",False,2020-11-17 23:52:00,2020-11-17 23:52:00,False,RITUALISM,"AGG. RITUAL MUTILATION - HANDS, FISTS, FEET, S..."
730746,682445,JE480451,12/05/2021 11:00:00 AM,044XX N MAGNOLIA AVE,4810,APARTMENT,False,1913,19,41.962231,-87.660973,"(41.962230219, -87.660975183)",False,2021-12-05 11:00:00,2021-12-05 11:00:00,False,OTHER OFFENSE,COMPOUNDING A CRIME


### Reset the index & Select df

In [158]:
df["date"] = df.date_time

In [159]:
df.drop(["date_time", "date_time_str"], axis=1, inplace=True)

In [160]:
df_final = df.loc[
    :,
    [
        "CrimeID",
        "date",
        "iucr",
        "primarytype",
        "description",
        "beat",
        "district",
        "locationdescription",
        "arrest",
        "latitude",
        "longitude",
        "location",
        "marked",
    ],
]

In [161]:
# remove all marked
df_final.drop(df_final[df_final.marked == True].index, axis=0, inplace=True)

In [162]:
df_final.reset_index(inplace=True, drop=True)

In [163]:
# export this file and you are done

df_final.drop_duplicates()

Unnamed: 0,CrimeID,date,iucr,primarytype,description,beat,district,locationdescription,arrest,latitude,longitude,location,marked
0,1,2017-01-01 01:00:00,1156,DECEPTIVE PRACTICE,ATTEMPT - FINANCIAL IDENTITY THEFT,1811,18,RESIDENCE,False,41.924198,-87.662071,"(41.924196311, -87.662069166)",False
1,301,2017-01-01 08:00:00,1156,DECEPTIVE PRACTICE,ATTEMPT - FINANCIAL IDENTITY THEFT,1923,19,RESIDENCE,False,,,,False
2,610,2017-01-01 00:01:00,1156,DECEPTIVE PRACTICE,ATTEMPT - FINANCIAL IDENTITY THEFT,1911,19,RESIDENCE,False,41.965687,-87.695702,"(41.965685734, -87.69570491)",False
3,1833,2019-01-01 00:00:00,1156,DECEPTIVE PRACTICE,ATTEMPT - FINANCIAL IDENTITY THEFT,0422,04,APARTMENT,False,,,,False
4,2430,2020-01-01 12:00:00,1156,DECEPTIVE PRACTICE,ATTEMPT - FINANCIAL IDENTITY THEFT,0735,07,RESIDENCE,False,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
730649,541301,2021-09-25 22:31:00,0480,BATTERY,AGGRAVATED PROTECTED EMPLOYEE - HANDGUN,1231,12,HOSPITAL BUILDING / GROUNDS,False,41.873219,-87.675865,"(41.873219626, -87.675868666)",False
730650,646493,2017-11-17 00:01:00,1531,PROSTITUTION,JUVENILE PIMPING,0726,07,RESIDENCE,True,,,,False
730651,647606,2020-11-17 23:52:00,0510,RITUALISM,"AGG. RITUAL MUTILATION - HANDS, FISTS, FEET, S...",1123,11,VEHICLE NON-COMMERCIAL,False,41.884388,-87.706863,"(41.884389103, -87.706866213)",False
730652,682445,2021-12-05 11:00:00,4810,OTHER OFFENSE,COMPOUNDING A CRIME,1913,19,APARTMENT,False,41.962231,-87.660973,"(41.962230219, -87.660975183)",False


In [165]:
df_final.to_csv(
    "/home/angelo/Documents/Uni/Courses/Data Managment & Ethics/Integrated Assignment/data_crime_transformed.csv",
    sep="\t",
)