In [1]:
import pandas as pd
import numpy as np

# Read the raw arrest CSV export into a DataFrame.
RAW_ARRESTS_CSV = 'Arrest_Data_from_2010_to_Present.csv'
df = pd.read_csv(RAW_ARRESTS_CSV)

In [2]:
# Remove the raw columns that are not relevant to the analysis (modifies df in place).
df.drop(
    columns=['Report ID', 'Area ID', 'Charge Group Code', 'Location'],
    inplace=True,
)

In [3]:
# The 'Time' field stores the time of day as a number (e.g. 645 instead of 06:45).
# Rebuild it as a real time value in a new 'NewTime' column.

# NOTE: this binds a second name to the same DataFrame (no copy is made), so every
# change made through df_cleansed is also visible through df.
df_cleansed = df

# Render each value as text and keep only the part before any decimal point
# ("645.0" -> "645").
time_text = df_cleansed['Time'].astype(str).str.split('.', n=1).str[0]

time_text = time_text.replace({
    'nan': '0000',   # missing value that astype(str) rendered as the text "nan"
    '0': '0000',     # 0 is treated as missing
    '2400': '0001',  # 2400 is not a valid time; 0001 keeps it distinct from missing
})

# Depending on the pandas version, astype(str) may leave NaN as a missing value
# instead of producing the text "nan"; map those to "0000" as well.
time_text = time_text.fillna('0000')
df_cleansed['Time'] = time_text

# Left-pad to four digits ("645" -> "0645", "5" -> "0005") so the hour and minute
# digits always sit at fixed positions. Values are expected to have at most 4 digits.
padded_time = time_text.str.zfill(4)
df_cleansed['Hour'] = padded_time.str[:2]
df_cleansed['Minute'] = padded_time.str[2:4]

# Put hour and minute back together and parse them into datetime.time objects.
# Out-of-range values (e.g. minute 60) raise a ValueError here.
df_cleansed['NewTime'] = pd.to_datetime(
    df_cleansed['Hour'] + ':' + df_cleansed['Minute'] + ':00',
    format='%H:%M:%S',
).dt.time

In [4]:
# Clean up the free-text 'Address' and 'Cross Street' fields.


def _split_leading_number(text):
    """Separate a leading all-digit word from each string in ``text``.

    Parameters
    ----------
    text : pandas.Series
        String values; missing entries are allowed.

    Returns
    -------
    tuple
        ``(first_word, remainder)``. ``first_word`` is a Series with the first
        whitespace-delimited word of each value. ``remainder`` is an array with
        the value minus that first word when the word is all digits, and the
        unchanged value otherwise (NaN when nothing follows the number).
    """
    parts = text.str.split(n=1)
    first_word = parts.str[0]
    # Comparing with True makes missing values count as "not a number".
    starts_with_number = first_word.str.isdigit() == True
    return first_word, np.where(starts_with_number, parts.str[1], text)


# Collapse every run of whitespace into a single space. The pattern is a raw string
# so that "\s" reaches the regex engine without an invalid-escape warning.
df_cleansed['Cross Street'] = df_cleansed['Cross Street'].replace(r'\s+', ' ', regex=True)
df_cleansed['Address'] = df_cleansed['Address'].replace(r'\s+', ' ', regex=True)

address = df_cleansed['Address']
cross_street = df_cleansed['Cross Street']

# Values made up entirely of digits are replaced with NaN.
df_cleansed['Address New'] = np.where(address.str.isdigit() == True, np.nan, address)
df_cleansed['Cross Street New'] = np.where(cross_street.str.isdigit() == True, np.nan, cross_street)

# Strip a leading number from each value, keeping the first word in its own column.
df_cleansed['Address_first_word'], df_cleansed['Street'] = _split_leading_number(address)
df_cleansed['Cross_street_first_word'], df_cleansed['CrossStreet'] = _split_leading_number(cross_street)

In [5]:
# Drop the source fields and intermediate helper columns that are not needed
# for the analysis (modifies df_cleansed in place).
columns_to_drop = [
    'Time',
    'Hour',
    'Minute',
    'Address',
    'Cross Street',
    'Address New',
    'Cross Street New',
    'Address_first_word',
    'Cross_street_first_word',
]
df_cleansed.drop(columns=columns_to_drop, inplace=True)

# Parse the arrest date and derive the calendar year from it.
arrest_dates = pd.to_datetime(df_cleansed['Arrest Date'])
df_cleansed['Date'] = arrest_dates
df_cleansed['Year'] = arrest_dates.dt.year

In [6]:
# Save the cleansed frame as CSV (to_csv also writes the row index by default).
df_cleansed.to_csv('Cleansed.csv')