In [None]:
import pandas as pd
import numpy as np
from datetime import datetime 
import os

In [None]:
""" 
NOTE: Assuming no repeats, as informed
For the time series, we want the frequency of each crime type aggregated by date. For example:
row 1: 1/1/2018 Theft: 12, Destruction/...: 22, etc. 
As many features as the 24 offense types (codes 13A, 13B, 13C, 13D are recoded as 13, etc.)
"""

### Functions for Creating Bi-Monthly DF

In [None]:
# recoding offense codes into the 24 options
def new_offense_code(prior):
    """Collapse a raw offense code into its parent code.

    Purely numeric codes (e.g. '100', '220') are returned unchanged. Any other
    code is truncated to its first two characters, so sub-codes such as
    '13A', '13B', '13C' and '13D' all become '13'.

    Parameters
    ----------
    prior : str, int, float or missing
        One value from the 'Offense Code' column.

    Returns
    -------
    str
        The recoded offense code. Missing input (None / NaN) is returned
        unchanged so it stays missing downstream.
    """
    # Empty cells arrive from pandas as NaN/None; calling str methods on them
    # would raise AttributeError. Keep them missing (pd.get_dummies skips NaN
    # by default) instead of crashing the whole recode.
    if pd.isna(prior):
        return prior

    # pandas may have parsed all-digit codes as numbers. A float such as 100.0
    # must become '100', not '100.0' (which would then be truncated to '10').
    if isinstance(prior, float) and prior.is_integer():
        prior = int(prior)

    # Tolerate stray padding around the code.
    code = str(prior).strip()

    if code.isnumeric():
        return code
    return code[:2]

# gets 'Incident Date' feature in correct order
def reorder_dates(bimonthly_df):
    """Return the frame's rows sorted by 'Incident Date' in ascending order.

    Dates are strings in either ``m/d/YYYY`` or ``m/d/YY`` form; each value is
    parsed on its own, so a column mixing both forms still sorts correctly.

    Parameters
    ----------
    bimonthly_df : pandas.DataFrame
        Must contain an 'Incident Date' column of date strings. Not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per distinct 'Incident Date' string (the
        first occurrence is kept), ordered chronologically, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If a date string matches neither supported format.
    """
    def _parse_incident_date(date):
        # Try the four-digit-year layout first, then the two-digit one.
        for fmt in ("%m/%d/%Y", "%m/%d/%y"):
            try:
                return datetime.strptime(date, fmt)
            except ValueError:
                continue
        raise ValueError(f"Unrecognized 'Incident Date' format: {date!r}")

    # Only the first row of each distinct date string is kept; callers are
    # expected to pass data already aggregated to one row per date.
    first_rows = bimonthly_df.drop_duplicates(subset='Incident Date', keep='first')

    # Parse once per row, then sort row positions by the parsed date. Working
    # with positions (not index labels) avoids a per-date scan of the frame and
    # is indifferent to what the incoming index looks like. sorted() is stable,
    # so rows whose dates parse equal keep their original relative order.
    parsed = [_parse_incident_date(date) for date in first_rows['Incident Date']]
    positions = sorted(range(len(parsed)), key=parsed.__getitem__)

    return first_rows.iloc[positions].reset_index(drop=True)

# create the aggregated by code count df
def create_bimonthyl_code_df(df):
    """Build the per-date offense-code count table for one bi-monthly extract.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw extract. Must contain the columns 'Offense Code', 'Incident Month',
        'ORI', 'Incident Number' and 'Incident Date'. It is not modified.

    Returns
    -------
    months : str
        The distinct non-missing 'Incident Month' values joined with '_',
        in order of first appearance.
    bimonthly_df : pandas.DataFrame
        The groupby-sum of the frame by 'Incident Date', passed through
        ``reorder_dates``, with one 'Code_<code>' count column for every
        recoded offense code present in ``df``. A code that never occurs in
        the period gets no column.
    """
    # Recode into a local Series instead of writing a 'Code' column onto the
    # caller's frame, so the raw extract is left exactly as it was read.
    codes = df['Offense Code'].apply(new_offense_code)

    # Label for the period covered by this extract. Missing month cells are
    # skipped: str.join would otherwise raise TypeError on a NaN.
    months = '_'.join(str(month) for month in df['Incident Month'].dropna().unique())

    # Identifying columns kept alongside the indicator columns.
    dfx = df[['ORI', 'Incident Number', 'Incident Date']]

    # One indicator column per recoded offense code.
    X = pd.get_dummies(codes, prefix='Code')
    dfy = pd.concat([dfx, X], axis=1)

    # Summing the indicators per date gives the count of each code per date.
    # NOTE(review): 'ORI' and 'Incident Number' go through the same sum; what
    # that yields depends on their dtype and the pandas version - confirm they
    # are not needed downstream (the visible caller discards them).
    bimonthly_df = dfy.groupby(['Incident Date']).sum().reset_index(drop=False)

    return months, reorder_dates(bimonthly_df=bimonthly_df)

     


### Working over files in Directory

In [None]:
# setting directory
d = '../Data/TBI2018-20/'

# Month abbreviation -> position in the year, used to put the bi-monthly
# files in chronological order.
month_order = {
    abbr: position
    for position, abbr in enumerate(
        ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    )
}


def _bimonthly_sort_key(path):
    """Sort key that orders bi-monthly files by the month starting their name.

    Assumes file names begin with a month name or abbreviation, as in
    'May-Jun 2019 Offense Data.csv' - TODO confirm for every file in the
    directory. Names that do not start with a month sort last, alphabetically.
    """
    name = os.path.basename(path)
    return (month_order.get(name[:3].lower(), len(month_order)), name)


# prepping dictionary for file names
yl = {
    '2018': [],
    '2019': [],
    '2020': [],
}
# getting files to run by year
for year in yl:
    for file in os.listdir(d):
        if year in file:
            yl[year].append(d + file)

# ordering files chronologically
# os.listdir returns entries in arbitrary order, so fixed index positions are
# not reproducible across machines; sort on the month in the file name instead.
# Every file matching the year is kept, however many there are.
for year in yl:
    yl[year].sort(key=_bimonthly_sort_key)

# Fixed column layout shared by every cleaned frame so they can be stacked
# consistently: the date column first, then one count column per recoded
# offense code (24 codes in total).
ordered_cols = ['Incident Date'] + [
    f'Code_{code}'
    for code in (
        '09', '11', '13', '23', '26', '35', '36', '39', '40', '64',
        '100', '120', '200', '210', '220', '240', '250', '270', '280', '290',
        '370', '510', '520', '720',
    )
]

# Writes one cleaned CSV per bi-monthly extract and one compiled CSV per year.
# The master file is assembled from the yearly CSVs in the next cell.

# Output locations, created on demand so the cell runs on a fresh checkout.
bimonthly_out_dir = '../Data/TBI2018-20_cleaned/Bi-monthly/'
yearly_out_dir = '../Data/TBI2018-20_cleaned/Yearly/'
os.makedirs(bimonthly_out_dir, exist_ok=True)
os.makedirs(yearly_out_dir, exist_ok=True)

for year in list(yl.keys()):
    # cleaned bi-monthly frames for this year, in file order
    yearly_frames = []

    for file in yl[year]:
        # read in bimonthly csv (the first 5 lines of each export are skipped)
        bimonthly_df = pd.read_csv(file, encoding='ISO-8859-1', skiprows=5)

        # get the months label for the new file name and the prepped bimonthly df
        months, cleaned_monthly_df = create_bimonthyl_code_df(bimonthly_df)

        # Force the shared column layout. reindex (rather than plain column
        # selection) adds a zero-filled column for any offense code that did
        # not occur in this period instead of raising KeyError.
        cleaned_monthly_df = cleaned_monthly_df.reindex(columns=ordered_cols, fill_value=0)

        # export bimonthly to csv
        cleaned_monthly_df.to_csv(f'{bimonthly_out_dir}{months}_{year}_cleaned.csv')
        yearly_frames.append(cleaned_monthly_df)

    # Stack the year's frames in one go with a fresh 0..n-1 index.
    # pd.concat rejects an empty list, so a year without files yields an
    # empty frame that still carries the expected columns.
    if yearly_frames:
        yearly_df = pd.concat(yearly_frames, axis=0, ignore_index=True)
    else:
        yearly_df = pd.DataFrame(columns=ordered_cols)

    # export yearly file
    yearly_df.to_csv(f'{yearly_out_dir}year_{year}_compiled.csv')
    

In [None]:
# creating master file
# The yearly CSVs were written with their row index as the first (unnamed)
# column. Reading it back as the index keeps it from leaking into the master
# file as a stray 'Unnamed: 0' data column.
X, Y, Z = (
    pd.read_csv(f'../Data/TBI2018-20_cleaned/Yearly/year_{year}_compiled.csv', index_col=0)
    for year in ('2018', '2019', '2020')
)

# Stack the three years in chronological order with a fresh 0..n-1 index.
all_df = pd.concat([X, Y, Z], axis=0, ignore_index=True)
all_df.to_csv('../Data/TBI2018-20_cleaned/master_2018-2020.csv')

### Troubleshooting Issue with delinquent files

In [None]:
# Load a single bi-monthly extract on its own to inspect what prep it needs.
df = pd.read_csv(
    '../Data/TBI2018-20/May-Jun 2019 Offense Data.csv',
    skiprows=5,
    encoding='ISO-8859-1',
)
# df2 = pd.read_csv('../Data/TBI2018-20/Nov-Dec 2019 Offense Data.csv', encoding='ISO-8859-1',skiprows=5)

# Background on the delinquent files (since fixed): in each of them an address
# had been mis-entered so that it spilled across several cells, leaving those
# rows with entries in up to 4 extra columns. The symptom looked like this:
# df['Incident Month'].unique().tolist() # ['December', 'November', '37055"', '37036"']

In [None]:
# distinct incident dates in the loaded extract, in order of first appearance
l = pd.unique(df['Incident Date']).tolist()


In [None]:
# m, ndf = create_bimonthyl_code_df(df)
# # m2,ndf2 = create_bimonthyl_code_df(df2)
# print(m)
# ndf