In [81]:
from typing import List

import pandas as pd
import re
import os

In [None]:
# Mount Google Drive at /content/drive; the paths used by the following cells live under it.
from google.colab import drive
drive.mount('/content/drive')

In [83]:
# Fail fast when Drive is not mounted, instead of hitting confusing path errors later on.
if not os.path.exists('/content/drive/MyDrive/'):
    # FileNotFoundError names the actual problem and is still caught by `except Exception`
    raise FileNotFoundError("Error: Mount Google Drive before continuing!")

In [84]:
# Root of the project data on Google Drive
BASE_DIR = '/content/drive/MyDrive/Data Science for Social Good - Spring 2022/data/'

# Input folder that is walked for <DATE>/<DATE>_Jackson.csv files.
# NOTE: despite the names, DATA_DIR points at 'CLEAN/' and CLEAN_DIR at 'merged_data/'.
DATA_DIR = os.path.join(BASE_DIR, 'CLEAN/')

# Output folders; the Jackson results get their own sub-folder
CLEAN_DIR = os.path.join(BASE_DIR, 'merged_data/')
JACKSON_DIR = os.path.join(CLEAN_DIR, 'jackson/')

# Create the output folder up front (no error if it already exists)
os.makedirs(JACKSON_DIR, exist_ok=True)

In [85]:
# Collect every <DATE>/<DATE>_Jackson.csv found under DATA_DIR into one dataframe
df_list = []
for sub_dir, dirs, files in os.walk(DATA_DIR):
    # os.walk yields sub-folders in arbitrary order. Sorting `dirs` in place makes
    # the (top-down) walk visit them in name order, so the load order - and the
    # duplicate handling that depends on it - is the same on every run.
    dirs.sort()

    # Skip the first iteration because it starts at DATA_DIR/
    # We want DATA_DIR/<DATE>/
    cur_dir = sub_dir.split('/')[-1]
    if cur_dir == '':
        continue

    jackson_filename = f'{sub_dir}/{cur_dir}_Jackson.csv'
    if os.path.exists(jackson_filename):
        df_list.append(pd.read_csv(jackson_filename))

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# report what was actually searched for instead (same exception type).
if not df_list:
    raise ValueError(f"Error: No '<DATE>_Jackson.csv' files found under {DATA_DIR}")

df = pd.concat(df_list, axis=0)

In [86]:
# Inspect the dtypes and non-null counts of the concatenated raw data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76573 entries, 0 to 362
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Book_Number     76573 non-null  object 
 1   Name_Last       76573 non-null  object 
 2   Name_Middle     74181 non-null  object 
 3   Name_First_MI   76573 non-null  object 
 4   BookDate        76573 non-null  object 
 5   ArrestDate      76573 non-null  object 
 6   Arrest_Agency   76573 non-null  object 
 7   ID_Number       76573 non-null  int64  
 8   Total Bond($)   76181 non-null  float64
 9   Bondable?       76181 non-null  object 
 10  inmate_info     76181 non-null  object 
 11  inmate_offense  76181 non-null  object 
dtypes: float64(1), int64(1), object(10)
memory usage: 7.6+ MB


# **Merge Data**

In [87]:
# Process different date formats: BookDate is written either with '/' or with '-'
# na=False treats a missing BookDate as "no match" instead of breaking the boolean mask
has_slash = df['BookDate'].str.contains('/', na=False)
has_dash = df['BookDate'].str.contains('-', na=False)
df1 = df[has_slash].copy()
df2 = df[has_dash].copy()

# Rows matching neither format are left out of the merge; make that visible
unmatched = ~(has_slash | has_dash)
if unmatched.any():
    print(f"Warning: {unmatched.sum()} rows have an unrecognized BookDate format and were skipped")

# Convert to a single date format (each group is parsed on its own so that
# one consistent format can be inferred per group)
df1['BookDate'] = pd.to_datetime(df1['BookDate'], infer_datetime_format=True)
df2['BookDate'] = pd.to_datetime(df2['BookDate'], infer_datetime_format=True)

# Merge the two dataframes since they now have the same date format, then
# remove duplicate bookings and rows without inmate info.
# The sort must be stable: the default quicksort may reorder rows that share a
# BookDate, which made the row kept by keep='last' arbitrary. With mergesort the
# surviving row is the one that comes last in the concatenated input.
# NOTE(review): that is only the most recent entry if the files were loaded in
# chronological order - confirm against the loading cell.
df_new = (
    pd.concat([df1, df2], axis=0)
    .sort_values('BookDate', kind='mergesort')
    .drop_duplicates('Book_Number', keep='last')
    .dropna(subset=['inmate_info'])
    .reset_index(drop=True)
)

In [88]:
# Inspect the merged, de-duplicated data
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3008 entries, 0 to 3007
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Book_Number     3008 non-null   object        
 1   Name_Last       3008 non-null   object        
 2   Name_Middle     2903 non-null   object        
 3   Name_First_MI   3008 non-null   object        
 4   BookDate        3008 non-null   datetime64[ns]
 5   ArrestDate      3008 non-null   object        
 6   Arrest_Agency   3008 non-null   object        
 7   ID_Number       3008 non-null   int64         
 8   Total Bond($)   3008 non-null   float64       
 9   Bondable?       3008 non-null   object        
 10  inmate_info     3008 non-null   object        
 11  inmate_offense  3008 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(9)
memory usage: 282.1+ KB


# **Clean Data**

In [89]:
# Based on the way the data is collected, the inmate_info column should hold 7 info points per inmate.
# clean_jackson drops every row whose parsed inmate_info has a different length.
NUM_INFO_EXPECTED = 7

In [90]:
def get_as_list(row: str) -> List[str]:
    """Extract every quoted item from the string form of a list.

    E.g. "['AGG.ASAULT ON POLICE,FIRE,SCHOOL', 'AGG.ASAULT']" -> ['AGG.ASAULT ON POLICE,FIRE,SCHOOL', 'AGG.ASAULT']

    Items that contain an apostrophe are wrapped in double quotes in the raw text,
    e.g. "INTERFERING WITH DRIVER'S VIEW". Such items are returned unchanged, with
    the apostrophe intact (it is no longer rewritten to a double quote).

    Args:
        row: Raw cell text such as "['A', 'B']".

    Returns:
        The items in order of appearance, without their surrounding quotes.
    """
    # One left-to-right scan: whichever quote character opens an item must also
    # close it, so the other quote character inside the item is ordinary text.
    items = []
    for match in re.finditer('"([^"]*)"|\'([^\']*)\'', row):
        double_quoted, single_quoted = match.groups()
        # Exactly one group took part in the match; an empty item is '' (not None)
        items.append(double_quoted if double_quoted is not None else single_quoted)

    return items

In [91]:
def create_temp_df(data):
    """Split each inmate's parsed ``inmate_info`` list into separate columns.

    Args:
        data: Dataframe with an ``inmate_info`` column whose values are lists of
            strings ordered as race/sex, weight, height, eye color, age,
            booking number and bondable flag.

    Returns:
        Dataframe with the string columns ``Race``, ``Sex``, ``Weight (Pounds)``
        and ``Age``. It keeps the index labels of the rows it was built from, so
        it lines up with ``data`` in an ``axis=1`` concat. Rows whose info list
        does not have exactly one entry per field, or that contain missing
        values, are left out.
    """
    col = ['race_sex', 'weight', 'height', 'eye_color', 'age', 'booking_num', 'is_bondable']
    info = data['inmate_info']

    # A list with too many entries makes the DataFrame constructor raise, and one
    # with too few is only padded with missing values, so keep complete lists only.
    is_complete = info.map(lambda x: isinstance(x, (list, tuple)) and len(x) == len(col)).astype(bool)
    complete_info = info[is_complete]

    info_split = pd.DataFrame(complete_info.to_list(), columns=col, index=complete_info.index)
    info_split = info_split.dropna()

    # Race: first word of e.g. 'White Male'; a leading 'Not' is recorded as 'N/A'
    info_split["Race"] = info_split["race_sex"].map(lambda x: x.split(" ")[0])
    info_split["Race"] = info_split["Race"].replace("Not", "N/A")

    # Sex: last word of the same field
    info_split["Sex"] = info_split['race_sex'].map(lambda x: x.split(" ")[-1])

    # Weight: leading token of the weight field
    info_split["Weight (Pounds)"] = info_split['weight'].map(lambda x: x.split(" ")[0])

    # Age: leading token of the age field
    info_split["Age"] = info_split['age'].map(lambda x: x.split(" ")[0])

    # Drop unused columns
    info_split = info_split.drop(col, axis=1)

    return info_split

In [92]:
def clean_jackson(data):
    """Parse the raw list columns and attach the split-out inmate info.

    Side effect: the ``inmate_info`` and ``inmate_offense`` columns of ``data``
    are replaced in place by their parsed lists. The offense-table cell reads
    the parsed ``inmate_offense`` lists from the same dataframe, so this is kept.

    Args:
        data: Raw dataframe with string (or already parsed) ``inmate_info`` and
            ``inmate_offense`` columns.

    Returns:
        A new dataframe holding the columns of ``data`` plus those produced by
        ``create_temp_df``, without ``inmate_info``. Rows whose inmate info does
        not have exactly ``NUM_INFO_EXPECTED`` entries are removed.
    """
    def _parse(value):
        # Values are written back into `data`, so on a re-run they are already
        # lists; only raw strings still need parsing.
        return get_as_list(value) if isinstance(value, str) else value

    data['inmate_info'] = data['inmate_info'].map(_parse)

    data['inmate_offense'] = data['inmate_offense'].map(_parse)

    info_split = create_temp_df(data)

    data_cleaned = pd.concat([data, info_split], axis=1)

    # Keep only inmates with the full set of info points (non-list values never qualify)
    has_full_info = data_cleaned['inmate_info'].map(
        lambda info: isinstance(info, list) and len(info) == NUM_INFO_EXPECTED
    ).astype(bool)
    data_cleaned = data_cleaned[has_full_info].drop(['inmate_info'], axis=1)

    return data_cleaned

In [93]:
# NOTE: clean_jackson also replaces df_new's inmate_info / inmate_offense columns with
# parsed lists in place; the offense-table loop further down reads them from df_new.
data_cleaned = clean_jackson(df_new)

In [94]:
# Preview the first rows of the cleaned table.
# NOTE(review): the rendered preview shows inmate names - consider clearing the output before sharing.
data_cleaned.head()

Unnamed: 0,Book_Number,Name_Last,Name_Middle,Name_First_MI,BookDate,ArrestDate,Arrest_Agency,ID_Number,Total Bond($),Bondable?,inmate_offense,Race,Sex,Weight (Pounds),Age
0,NJCADC0000008926,STAFFORD,ELLIOT,THOMAS,2017-04-21,04/21/2017,PTS,200999332,0.0,No,"[MURDER, Felony - Bond: $0.00]",White,Male,160,44
1,NJCADC0000010774,PRICE,LAKENA,ANDREA,2017-07-08,07/22/2017,MPPD,200959047,2500.0,No,"[BOND REVOKED, Felony - Bond: $0.00, AGGRAVATE...",Black,Female,260,38
2,NJCADC0000010963,SKALLA,WILLIAM,CORY,2017-07-29,07/31/2017,GPD,201016096,510000.0,Yes,"[SEXUALLY ORIENTED MATERIALS-18, Felony - Bond...",White,Male,165,43
3,NJCADC0000014888,HARRIS,DONOVAN,MICHAEL,2018-02-16,02/16/2018,JCSO,201019477,0.0,No,"[MURDER, Felony - Bond: $0.00, CHILD ABUSE - S...",White,Male,285,30
4,NJCADC0000017010,WILLIAMS,BERNARD,SHERMAN,2018-05-29,05/29/2018,JCSO,200995356,90000.0,No,"[RETURN PER COURT ORDER, Misdemeanor - Bond: $...",Black,Male,180,24


# **Create Offense Table**

In [95]:
def create_row(id_Number, offense, category, bond):
    """Build one record for the offense dataframe.

    Returns a dictionary keyed by the offense table's column names
    ('ID_Number', 'Offense', 'Category', 'Bond') holding the given values.
    """
    return dict(ID_Number=id_Number, Offense=offense, Category=category, Bond=bond)

In [96]:
# Pass through each inmate's offense list and collect every offense as a separate row.
# The rows are gathered in a plain list and turned into a dataframe once at the end:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
offense_rows = []
for i, (id_Number, offense_list) in enumerate(zip(df_new['ID_Number'], df_new['inmate_offense'])):
    # The list alternates [offense, details, offense, details, ...]
    if len(offense_list) % 2 != 0:
        print(i)
        print("Error: The offense list is not an even length!")
        print(offense_list)

    # zip stops at the shorter slice, so an unpaired trailing entry is ignored
    for offense, details in zip(offense_list[0::2], offense_list[1::2]):
        # details looks like 'Felony - Bond: $0.00':
        # the first word is the category, the last one the bond amount
        detail_parts = details.split(" ")
        category = detail_parts[0]
        bond = detail_parts[-1]

        offense_rows.append(create_row(id_Number, offense, category, bond))

# Passing the columns explicitly keeps the expected schema even when there are no rows
offense_data = pd.DataFrame(offense_rows, columns=['ID_Number', 'Offense', 'Category', 'Bond'])

# **Save to Drive**

In [97]:
# Write both result tables into the Jackson output folder
for table, filename in (
    (data_cleaned, 'jackson_data_cleaned.csv'),
    (offense_data, 'jackson_offense_data_merged.csv'),
):
    table.to_csv(JACKSON_DIR + filename)