In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

In [None]:
""" 
Round 1 Objectives: 
    - Compile Offense Data files into total offense .csv
    - Compile Property Data files into total property .csv

Processing needs:
    <> Ensure uniform formatting
    - Create unique key per event that can be used to group event specific reports
    - Order by date in such a way that a time series can still be oriented over 36 months,
    rather than have each January from 3 different years mashed together

"""

In [62]:
# Folder holding the raw bi-monthly exports, plus the pieces that make up each file name:
# "<month pair> <year> <file type>.csv"
cc_pathway = '../Data/Crime & COVID (2018-2020)/'
month_pairs = ['Jan-Feb', 'Mar-Apr', 'May-Jun', 'Jul-Aug', 'Sep-Oct', 'Nov-Dec']
years = ['2018', '2019', '2020']

# Exploratory read #1: Jan-Feb 2018 Offense Data.
# The first 5 lines of the file are skipped so the header row is parsed as columns.
jf_path = '{}{} {} {}'.format(cc_pathway, month_pairs[0], years[0], 'Offense Data.csv')
jf_2018_df = pd.read_csv(jf_path, skiprows=5, encoding='ISO-8859-1')

# Exploratory read #2: Jan-Feb 2018 Property Data.
# This file type needs 6 skipped lines rather than 5.
jfp_path = '{}{} {} {}'.format(
    cc_pathway,
    month_pairs[0],
    years[0],
    'Property Data with Category and Value by Loss Type.csv',
)
jfp_2018_df = pd.read_csv(jfp_path, skiprows=6, encoding='ISO-8859-1')

Unnamed: 0,ORI,Incident Number,Incident Date,Report Date Indicator,Type of Property Loss,Property Description,Property Value,Agency Name
0,TN0110000,180106-0154,1/6/2018,,Destroyed/Damaged/Vandalized,Household Goods,150,Cheatham County Sheriff's Office
1,TN0110000,180106-0154,1/6/2018,,Stolen,Chemicals,50,Cheatham County Sheriff's Office
2,TN0010000,1801010019,1/1/2018,,Stolen,Video Game Consoles/Games,200,Anderson County Sheriff's Office
3,TN0010000,1801020057,1/2/2018,,Stolen,Trucks,4000,Anderson County Sheriff's Office
4,TN0010000,1801020057,1/2/2018,,Stolen,Logging Equipment,1400,Anderson County Sheriff's Office


In [66]:
# File-type registry, keyed by 0 (Offense Data) and 1 (Property Data).
# 'path' is the file-name ending for that type; 'skip' is the number of leading
# lines to skip when reading it.
paths_dict = {
    0: dict(path='Offense Data.csv', skip=5),
    1: dict(path='Property Data with Category and Value by Loss Type.csv', skip=6),
}

# Same keys as paths_dict; each list receives one file name per (year, month pair),
# ordered year by year and, within a year, in month_pairs order.
path_suffixes = {0: [], 1: []}
for i in [0, 1]:
    file_ending = paths_dict[i]["path"]
    for year in years:
        for months in month_pairs:
            suffix = ' '.join((months, year, file_ending))
            path_suffixes[i].append(suffix)

# Compile every file of each type into a single DataFrame per type.
# compiled_dfs[0] -> all Offense Data files, compiled_dfs[1] -> all Property Data files.
# NOTE(review): the original loop had no body (a syntax error); this implementation
# follows the Round 1 objectives stated at the top of the notebook. Confirm that it
# matches the intended output before writing the results to .csv.
compiled_dfs = {}
# iterating through types of files to compile
for i in [0,1]:
    # one frame per file; path_suffixes[i] is ordered by year, then month pair,
    # so the concatenated rows follow the files' chronological order
    frames = [
        pd.read_csv(
            cc_pathway + suffix,
            encoding='ISO-8859-1',
            skiprows=paths_dict[i]['skip'],
        )
        for suffix in path_suffixes[i]
    ]
    # concatenate once rather than growing a frame inside the loop
    compiled_dfs[i] = pd.concat(frames, ignore_index=True)

In [64]:
# Sanity check: confirm that skipping 6 lines yields the same column layout for
# every Property Data file as it does for the Jan-Feb 2018 sample.
# A matching file prints its running count; a mismatching file prints its path.
# (To check Offense Data instead, swap in the 'Offense Data.csv' file name
# with skiprows=5.)
cols = list(jfp_2018_df.columns)
i = 1
for year in years:
    for months in month_pairs:
        file_name = '{} {} {}'.format(
            months, year, 'Property Data with Category and Value by Loss Type.csv'
        )
        path = cc_pathway + file_name
        df = pd.read_csv(path, skiprows=6, encoding='ISO-8859-1')
        if list(df.columns) != cols:
            print(path)
        else:
            print(f'{i} of 18 Check.')
        i += 1

1 of 18 Check.
2 of 18 Check.
3 of 18 Check.
4 of 18 Check.
5 of 18 Check.
6 of 18 Check.
7 of 18 Check.
8 of 18 Check.
9 of 18 Check.
10 of 18 Check.
11 of 18 Check.
12 of 18 Check.
13 of 18 Check.
14 of 18 Check.
15 of 18 Check.
16 of 18 Check.
17 of 18 Check.
18 of 18 Check.


In [58]:
# creating unique key identifier
def make_key(df):
    """Add a per-event 'ID_Key' column and return the frame with it placed first.

    The key is '<ORI><><Incident Number><><Incident Date>', built from the
    columns of the same names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'ORI', 'Incident Number' and 'Incident Date'. The date is
        expected to be a plain string (not a datetime object).

    Returns
    -------
    pandas.DataFrame
        The same rows with 'ID_Key' as the first column and every other column
        in its existing order.

    Notes
    -----
    As before, the 'ID_Key' column is also written onto the ``df`` passed in.
    Calling the function again on a frame that already has 'ID_Key' (in any
    position) simply refreshes the key; it no longer moves an unrelated column
    to the front.
    """
    def _as_text(col):
        # Cast to str so numeric values (e.g. all-digit incident numbers parsed
        # as ints) can be concatenated, while missing values stay missing and
        # therefore still produce a missing key.
        return col.astype(str).where(col.notna())

    df['ID_Key'] = (
        _as_text(df['ORI'])
        + '<>' + _as_text(df['Incident Number'])
        + '<>' + _as_text(df['Incident Date'])
    )
    # Place the key first by name rather than by position, so the result does
    # not depend on where 'ID_Key' currently sits in the frame.
    cols = ['ID_Key'] + [c for c in df.columns if c != 'ID_Key']
    return df[cols]

def prep_date_to_sort(date_str):
    """Normalise a month/day/year date string to zero-padded 'MM/DD/YYYY'.

    The string is parsed with the '%m/%d/%Y' pattern and written back out with
    the same pattern, so '1/6/2018' becomes '01/06/2018'. A string that does
    not match the pattern raises ValueError from the parser.
    """
    date_pattern = "%m/%d/%Y"
    parsed = datetime.strptime(date_str, date_pattern)
    return parsed.strftime(date_pattern)

def sort_by_date(df):
    """Normalise 'Incident Date' and sort ``df`` chronologically, in place.

    The column is rewritten as zero-padded 'MM/DD/YYYY' strings (the same
    format prep_date_to_sort produces), and the rows are then ordered by the
    actual calendar date.

    Sorting the 'MM/DD/YYYY' strings directly would order by month before
    year, interleaving e.g. January 2018, January 2019 and January 2020. The
    sort therefore compares parsed dates, while the stored values stay strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Incident Date' column of month/day/year strings.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    date_pattern = '%m/%d/%Y'
    parsed = pd.to_datetime(df['Incident Date'], format=date_pattern)
    df['Incident Date'] = parsed.dt.strftime(date_pattern)
    # Sort on the parsed dates, not the strings, to get true chronological order.
    df.sort_values(
        'Incident Date',
        key=lambda col: pd.to_datetime(col, format=date_pattern),
        inplace=True,
    )
