This jupyter notebook helps partition data. This partitioned data will then be used to train a seperate CTGANs. 

In [1]:
import pandas as pd
df = pd.read_feather("raw_incident_data_v2.feather")

In [2]:
df['service_code_description'].value_counts()
df_filtered = df.copy()

In [3]:
# Which service_code to filter on
# df_filtered = df[df['service_code_description'] == "DOMESTIC INCIDENT"]
# len(df_filtered)

### Preprocessing

In [4]:
df_filtered.head()

Unnamed: 0,incident_count,date_created,hour_created,max_final_priority,inc_resourced,scrs_crime_inc,datazone,dzone_code,service_code_description,mm_ward_code,multi_member_ward
0,2,2023-05-24,15,STANDARD,False,True,Northfield and Piershill - 01,S01008743,THEFT,S13002932,Craigentinny/Duddingston
1,2,2023-08-11,7,PROMPT,False,False,Carntyne West and Haghill - 03,S01010244,ROAD TRAFFIC COLLISION,S13002975,Calton
2,2,2023-02-16,20,PROMPT,True,False,Charleston - 04,S01007848,ASSIST MEMBER OF THE PUBLIC,S13002546,Lochee
3,1,2023-02-16,21,PROMPT,True,False,Fort William North - 04,S01010513,ASSIST MEMBER OF THE PUBLIC,S13003000,Caol and Mallaig
4,1,2023-02-16,21,IMMEDIATE,True,False,City Centre - 05,S01007705,ASSIST MEMBER OF THE PUBLIC,S13002549,Maryfield


In [5]:
df_filtered.dtypes

incident_count               int32
date_created                object
hour_created                 int32
max_final_priority          object
inc_resourced                 bool
scrs_crime_inc                bool
datazone                    object
dzone_code                  object
service_code_description    object
mm_ward_code                object
multi_member_ward           object
dtype: object

In [6]:
# covert bool vars to cat
bool_vars = ['scrs_crime_inc', 'inc_resourced']
df_filtered.loc[:, bool_vars] = df_filtered[bool_vars].astype('category')

Length: 1340347
Categories (2, bool): [False, True]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  df_filtered.loc[:, bool_vars] = df_filtered[bool_vars].astype('category')
Length: 1340347
Categories (2, bool): [False, True]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  df_filtered.loc[:, bool_vars] = df_filtered[bool_vars].astype('category')


In [8]:
# converting date variable to a number in [startdate, enddate]
import pandas as pd

# make sure date_created is datetime
df_filtered['date_created'] = pd.to_datetime(df_filtered['date_created'])

# compute day offset from min date, then +1
min_date = df_filtered['date_created'].min()  # 2022-04-01 00:00:00
df_filtered['date_seq'] = (df_filtered['date_created'] - min_date).dt.days + 1

# compute min/max of each
min_date = df_filtered['date_created'].min()
max_date = df_filtered['date_created'].max()
min_seq  = df_filtered['date_seq'].min()
max_seq  = df_filtered['date_seq'].max()

# grab the corresponding dates from the mapped column
mapped_min_date = df_filtered.loc[df_filtered['date_seq'] == min_seq, 'date_created'].iloc[0]
mapped_max_date = df_filtered.loc[df_filtered['date_seq'] == max_seq, 'date_created'].iloc[0]

# sanity‐check
print(f"earliest actual date: {min_date}, mapped date at seq {min_seq}: {mapped_min_date}")
print(f" latest actual date: {max_date}, mapped date at seq {max_seq}: {mapped_max_date}")

earliest actual date: 2022-04-01 00:00:00, mapped date at seq 1: 2022-04-01 00:00:00
 latest actual date: 2024-06-28 00:00:00, mapped date at seq 820: 2024-06-28 00:00:00


In [11]:
import unicodedata

# Function to normalize multi_member_ward values
def normalize_ward_name(name):
    if pd.isnull(name):
        return name
    name = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("utf-8")
    name = name.replace("&", "and").replace("/", ",").replace("'", "").strip()
    name = ' '.join(name.split())  # remove extra internal spaces
    return name.lower()  # optional: lowercase for consistency

# Apply to real dataset before training
df_filtered["multi_member_ward"] = df_filtered["multi_member_ward"].map(normalize_ward_name)
lookup["MMWard_Name"] = lookup["MMWard_Name"].map(normalize_ward_name)

NameError: name 'lookup' is not defined

In [13]:
# Checking difference between the lookup and df multi_ward_member names
df_set = set(df_filtered["multi_member_ward"])
lookup_set = set(lookup['MMWard_Name'])

print(lookup_set-df_set) # this might be a location where a theft was not recorded in the recent years

NameError: name 'lookup' is not defined

In [15]:
# Removing unnecessary columns
df_filtered = df_filtered.drop(['incident_count','datazone', 'dzone_code'], axis=1)

In [16]:
df_filtered = df_filtered.drop('mm_ward_code', axis=1)

In [17]:
from sdv.metadata import Metadata

cat_cols  = ["max_final_priority", "service_code_description", 'scrs_crime_inc', 'inc_resourced', 'multi_member_ward']
df_filtered[cat_cols] = df_filtered[cat_cols].astype('object')

## Saving the filtered data into a pickle file

In [18]:
df_filtered.to_pickle("filtered_data/df_v2_filtered.pkl")