In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

In [None]:
""" 
Round 1 Objectives: 
    - Compile Offense Data files into total offense .csv
    - Compile Property Data files into total property .csv

Processing needs:
    <> Ensure uniform formatting
    - Create unique key per event that can be used to group event specific reports
    - Order by date so that a time series stays oriented across all 36 months,
    rather than having each January from the 3 different years mashed together

"""

In [62]:
# Folder that holds the Crime & COVID export files.
cc_pathway = '../Data/Crime & COVID (2018-2020)/'

# File names are built as "<month pair> <year> <file type>".
month_pairs = ['Jan-Feb', 'Mar-Apr', 'May-Jun', 'Jul-Aug', 'Sep-Oct', 'Nov-Dec']

years = ['2018', '2019', '2020']

# Exploratory read: Jan-Feb 2018 Offense Data (first 5 lines of the file are skipped).
jf_path = f'{cc_pathway}{month_pairs[0]} {years[0]} Offense Data.csv'
jf_2018_df = pd.read_csv(jf_path, skiprows=5, encoding='ISO-8859-1')

# Exploratory read: Jan-Feb 2018 Property Data (first 6 lines of the file are skipped).
jfp_path = f'{cc_pathway}{month_pairs[0]} {years[0]} Property Data with Category and Value by Loss Type.csv'
jfp_2018_df = pd.read_csv(jfp_path, skiprows=6, encoding='ISO-8859-1')

Unnamed: 0,ORI,Incident Number,Incident Date,Report Date Indicator,Type of Property Loss,Property Description,Property Value,Agency Name
0,TN0110000,180106-0154,1/6/2018,,Destroyed/Damaged/Vandalized,Household Goods,150,Cheatham County Sheriff's Office
1,TN0110000,180106-0154,1/6/2018,,Stolen,Chemicals,50,Cheatham County Sheriff's Office
2,TN0010000,1801010019,1/1/2018,,Stolen,Video Game Consoles/Games,200,Anderson County Sheriff's Office
3,TN0010000,1801020057,1/2/2018,,Stolen,Trucks,4000,Anderson County Sheriff's Office
4,TN0010000,1801020057,1/2/2018,,Stolen,Logging Equipment,1400,Anderson County Sheriff's Office


In [83]:
# creating unique key identifier
def make_key(df):
    """Return a copy of ``df`` with an ``ID_Key`` column placed first.

    The key is ``"<ORI><><Incident Number><><Incident Date>"``, built from the
    values exactly as they appear in ``df`` (dates are expected to still be
    text, not datetime objects).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ORI', 'Incident Number' and 'Incident Date'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with 'ID_Key' as the first column, followed by the columns
        of ``df`` in their original order. If ``df`` already had an 'ID_Key'
        column it is rebuilt rather than duplicated.
    """
    # str() every non-null value so a column that read_csv parsed as numeric
    # (e.g. all-digit incident numbers) can still be concatenated; nulls are
    # left as-is so a missing component yields a missing key.
    ori, incident_number, incident_date = (
        df[col].map(str, na_action='ignore')
        for col in ('ORI', 'Incident Number', 'Incident Date')
    )
    id_key = ori + '<>' + incident_number + '<>' + incident_date

    # Place the key by name rather than by position, so re-keying a frame that
    # already carries ID_Key does not shuffle an unrelated column to the front.
    other_cols = [col for col in df.columns if col != 'ID_Key']
    keyed_df = df[other_cols].copy()
    keyed_df.insert(0, 'ID_Key', id_key)
    return keyed_df

def prep_date_to_sort(date_str):
    """Re-format a month/day/year date string as zero-padded MM/DD/YYYY.

    A four-digit year is tried first; if that does not parse, a two-digit
    year is tried. A string matching neither raises ValueError.
    """
    four_digit_year = "%m/%d/%Y"
    two_digit_year = "%m/%d/%y"
    try:
        parsed = datetime.strptime(date_str, four_digit_year)
    except ValueError:
        parsed = datetime.strptime(date_str, two_digit_year)
    return parsed.strftime(four_digit_year)

def sort_by_date(df):
    """Normalise 'Incident Date' to MM/DD/YYYY text and sort ``df`` by it, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Incident Date' column of date strings accepted by
        ``prep_date_to_sort``. The frame is modified in place: the column is
        rewritten and the rows are reordered.

    Returns
    -------
    None
    """
    df['Incident Date'] = df['Incident Date'].apply(prep_date_to_sort)
    # Sorting the MM/DD/YYYY text directly would order by month first and
    # interleave the years; sort on the parsed dates instead while leaving the
    # stored values as text.
    df.sort_values(
        'Incident Date',
        key=lambda dates: pd.to_datetime(dates, format='%m/%d/%Y'),
        kind='mergesort',  # stable: rows sharing a date keep their existing order
        inplace=True,
    )
    return

def compile_C_C_files(file_paths_dict, cc_pathway = '../Data/Crime & COVID (2018-2020)/'):
    """Read, clean and concatenate a set of export files, then write one CSV.

    Each file is read, its 'Incident Date' column is normalised and sorted
    with ``sort_by_date``, and an 'ID_Key' column is added with ``make_key``.
    The cleaned frames are concatenated in the order given and written to
    ``<cc_pathway>/Compiled/<title>``.

    Parameters
    ----------
    file_paths_dict : dict
        Must provide:
        'suffixes' - list of input file names, relative to ``cc_pathway``;
        'skip'     - value passed to ``pd.read_csv`` as ``skiprows``;
        'title'    - file name of the compiled CSV.
    cc_pathway : str
        Folder holding the input files. The 'Compiled' sub-folder is created
        if it does not exist.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    ValueError
        If 'suffixes' is empty.
    """
    suffixes = file_paths_dict['suffixes']
    if not suffixes:
        raise ValueError('file_paths_dict["suffixes"] is empty; nothing to compile.')

    # Collect the cleaned frames and concatenate once at the end, instead of
    # re-copying the growing frame on every iteration.
    cleaned_frames = []
    for i, suffix in enumerate(suffixes, start=1):
        df = pd.read_csv(cc_pathway + suffix, encoding='ISO-8859-1', skiprows=file_paths_dict['skip'])
        # Normalise dates before building the key so the key always embeds the
        # MM/DD/YYYY form, whichever date format the source file used.
        sort_by_date(df)
        cleaned_frames.append(make_key(df))
        print(f'{i} of {len(suffixes)}: {suffix[:20]}')

    compiled_df = pd.concat(cleaned_frames, ignore_index=True)

    # finished concatenating... write to new file
    compiled_dir = os.path.join(cc_pathway, 'Compiled')
    os.makedirs(compiled_dir, exist_ok=True)
    compiled_df.to_csv(os.path.join(compiled_dir, file_paths_dict['title']))
    print(f'{file_paths_dict["title"]} DONE.')
    return 0

In [84]:
# Compile specs: 0 = Offense Data, 1 = Property Data.
# Each spec holds 'title' (compiled file name), 'type' (ending of each input
# file name), 'skip' (rows to skip when reading) and 'suffixes' (input file
# names, filled in below).
paths_dict = {
    0: dict(
        title='2018-2020_Compiled_Offense_Data.csv',
        type='Offense Data.csv',
        skip=5,
        suffixes=[],
    ),
    1: dict(
        title='2018-2020_Compiled_Property_Data.csv',
        type='Property Data with Category and Value by Loss Type.csv',
        skip=6,
        suffixes=[],
    ),
}

# One input file name per (year, month pair): years outermost, month pairs innermost.
for spec in paths_dict.values():
    spec['suffixes'] = [
        f'{months} {year} {spec["type"]}'
        for year in years
        for months in month_pairs
    ]

# Compile and write one CSV per spec.
for spec in paths_dict.values():
    compile_C_C_files(spec)
        