Importing libraries

In [21]:
import pandas as pd
import numpy as np

Read in the original DataSet for comparsion:

In [22]:
original = pd.read_csv("../data/lrpd.csv")
print(f"Successfully read in Original Little Rock Crime Data. DataFrame Size: {original.shape[0]} rows x {original.shape[1]} cols")

Successfully read in Original Little Rock Crime Data. DataFrame Size: 92771 rows x 14 cols


### Request relevant columns from dataset:

In [23]:
requested_columns = ["INCIDENT_DATE", "INCIDENT_NUMBER", "LOCATION_DISTRICT", "OFFENSE_DESCRIPTION", "WEAPON_TYPE", "ZIP", "LATITUDE", "LONGITUDE"]

lrpd = pd.read_csv("lrpd.csv", usecols=requested_columns, index_col="INCIDENT_NUMBER")

print(f"Successfully read in Little Rock Crime Data. DataFrame Size: {lrpd.shape[0]} rows x {lrpd.shape[1]} cols")

Successfully read in Little Rock Crime Data. DataFrame Size: 92771 rows x 7 cols


### Memory Reduction
Following code reduces the memory of the dataset that we are working with. It replaces large int/float datatypes into smaller ones to use less memory. 

Our data isnt too big but on much larger data, this code will make a big difference in the speed of feature engineering and processing of data.

In [24]:
def reduce_mem_usage(df, category=False):
  start_mem = df.memory_usage().sum() / 1024 **2
  print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

  for col in df.columns:
    col_type = df[col].dtype

    if col_type != object:
      c_min = df[col].min()
      c_max = df[col].max()
      if str(col_type)[:3] == 'int':
        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
          df[col] = df[col].astype(np.int8)
        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
          df[col] = df[col].astype(np.int16)
        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
          df[col] = df[col].astype(np.int32)
        elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
          df[col] = df[col].astype(np.int64)
      else:
        if c_min  > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
          df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
          df[col] = df[col].astype(np.float32)
        else:
          df[col] = df[col].astype(np.float64)
    else:
      if category:
        df[col] = df[col].astype('category')

  end_mem = df.memory_usage().sum() / 1024 ** 2
  print('Memory usage after optimizations: {:.2f} MB'.format(end_mem))
  print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
  return df

In [25]:
lrpd = reduce_mem_usage(lrpd, category=True)

Memory usage of dataframe is 5.66 MB
Memory usage after optimizations: 4.73 MB
Decreased by 16.5%


### Drop duplicate rows based on index values:

In [26]:
prev_shape = lrpd.shape
lrpd = lrpd.drop_duplicates(keep='first')
new_shape = lrpd.shape

print(f'Dropped {(prev_shape[0] - new_shape[0])} duplicates!')
print(f'Little Rock Crime Data DataFrame Size: {lrpd.shape[0]} rows x {lrpd.shape[1]} cols')

Dropped 5927 duplicates!
Little Rock Crime Data DataFrame Size: 86844 rows x 7 cols


### Fill missing values with Number 0:

In [27]:
lrpd.isna().sum()

INCIDENT_DATE              0
LOCATION_DISTRICT         89
OFFENSE_DESCRIPTION        0
WEAPON_TYPE            71665
ZIP                     1195
LATITUDE                3140
LONGITUDE               3140
dtype: int64

In [28]:
lrpd.fillna(value={'LATITUDE': 0}, inplace=True)
lrpd.fillna(value={'LONGITUDE': 0}, inplace=True)

In [29]:
lrpd['WEAPON_TYPE'] = lrpd['WEAPON_TYPE'].cat.add_categories('NO WEAPON')
lrpd.fillna(value={'WEAPON_TYPE': 'NO WEAPON'}, inplace=True)

  lrpd.fillna(value={'WEAPON_TYPE': 'NO WEAPON'}, inplace=True)


### Parse Number 0 and String 1 to weapon type:

In [30]:
lrpd["WEAPON_TYPE"] = lrpd["WEAPON_TYPE"].replace("1", "UNKNOWN")

### Convert columns to correct types:

In [31]:
lrpd["INCIDENT_DATE"] = pd.to_datetime(lrpd["INCIDENT_DATE"])
lrpd["LOCATION_DISTRICT"] = pd.to_numeric(lrpd["LOCATION_DISTRICT"])
lrpd["OFFENSE_DESCRIPTION"] = lrpd["OFFENSE_DESCRIPTION"] # todo: ensure this is parsed as a string
lrpd["WEAPON_TYPE"] = lrpd["WEAPON_TYPE"] # todo: ensure this is parsed as a string
lrpd["ZIP"] = pd.to_numeric(lrpd["ZIP"])
lrpd["LATITUDE"] = pd.to_numeric(lrpd["LATITUDE"])
lrpd["LONGITUDE"] = pd.to_numeric(lrpd["LONGITUDE"])

lrpd.head().T

INCIDENT_NUMBER,2022-036059,2017-029450,2017-114829,2018-046714,2020-042339
INCIDENT_DATE,2022-03-30 17:38:00,2017-03-16 18:30:00,2017-09-15 03:14:00,2018-04-20 16:18:00,2020-04-18 18:15:00
LOCATION_DISTRICT,42.0,82.0,64.0,82.0,61.0
OFFENSE_DESCRIPTION,THEFT FROM MOTOR VEHICLE,THEFT OF MOTOR VEHICLE PARTS,RAPE,RAPE,RAPE
WEAPON_TYPE,NO WEAPON,NO WEAPON,NO WEAPON,UNKNOWN,HANDGUN
ZIP,72202.0,72206.0,,,
LATITUDE,0.0,34.65625,0.0,0.0,0.0
LONGITUDE,0.0,-92.3125,0.0,0.0,0.0


### Appending new columns to DataFrame:

In [32]:
lrpd["WEEK_OF_MONTH"] = lrpd["INCIDENT_DATE"].dt.day // 7
lrpd["YEAR"] = lrpd["INCIDENT_DATE"].dt.year
lrpd["DAY"] = lrpd["INCIDENT_DATE"].dt.day
lrpd["DAY_OF_YEAR"] = lrpd["INCIDENT_DATE"].dt.dayofyear
lrpd["MONTH"] = lrpd["INCIDENT_DATE"].dt.month

### Visualizing current modifications:

In [33]:
lrpd.head().T

INCIDENT_NUMBER,2022-036059,2017-029450,2017-114829,2018-046714,2020-042339
INCIDENT_DATE,2022-03-30 17:38:00,2017-03-16 18:30:00,2017-09-15 03:14:00,2018-04-20 16:18:00,2020-04-18 18:15:00
LOCATION_DISTRICT,42.0,82.0,64.0,82.0,61.0
OFFENSE_DESCRIPTION,THEFT FROM MOTOR VEHICLE,THEFT OF MOTOR VEHICLE PARTS,RAPE,RAPE,RAPE
WEAPON_TYPE,NO WEAPON,NO WEAPON,NO WEAPON,UNKNOWN,HANDGUN
ZIP,72202.0,72206.0,,,
LATITUDE,0.0,34.65625,0.0,0.0,0.0
LONGITUDE,0.0,-92.3125,0.0,0.0,0.0
WEEK_OF_MONTH,4,2,2,2,2
YEAR,2022,2017,2017,2018,2020
DAY,30,16,15,20,18


In [34]:
lrpd.tail().T

INCIDENT_NUMBER,2022-303291,2022-303160,2022-302870,2022-302451,2022-301998
INCIDENT_DATE,2022-12-08 00:00:00,2022-10-29 09:00:00,2022-10-23 14:30:00,2022-09-07 14:00:00,2022-07-21 08:00:00
LOCATION_DISTRICT,73.0,62.0,72.0,92.0,72.0
OFFENSE_DESCRIPTION,THEFT FROM MOTOR VEHICLE,ALL OTHER LARCENY,THEFT FROM MOTOR VEHICLE,SHOPLIFTING,THEFT FROM MOTOR VEHICLE
WEAPON_TYPE,NO WEAPON,NO WEAPON,NO WEAPON,NO WEAPON,NO WEAPON
ZIP,72223.0,72227.0,72211.0,72210.0,72211.0
LATITUDE,0.0,0.0,0.0,0.0,0.0
LONGITUDE,0.0,0.0,0.0,0.0,0.0
WEEK_OF_MONTH,1,4,3,1,3
YEAR,2022,2022,2022,2022,2022
DAY,8,29,23,7,21


### Determining crime risk and violence level:

In [35]:
all_crimes = lrpd["OFFENSE_DESCRIPTION"].unique()
all_weapons = lrpd["WEAPON_TYPE"].unique()

Pre-defining which crimes are considered violent and nonviolent:

In [36]:
def assessCrime(lrpd):
    # Makes a dictionary of crime types and their type (0 = non-violent, 1 = violent)
    crime_types = {'THEFT FROM MOTOR VEHICLE': 0,
                    'THEFT OF MOTOR VEHICLE PARTS': 0,
                    'RAPE': 1,
                    'ALL OTHER LARCENY': 0,
                    'SHOPLIFTING': 0,
                    'BURGLARY/B&E': 1,
                    'MOTOR VEHICLE THEFT': 0,
                    'AGGRAVATED ASSAULT': 1,
                    'THEFT FROM BUILDING': 0,
                    'ROBBERY': 1,
                    'MURDER & NONNEGLIGENT MANSLAUGHTER': 1,
                    'PURSE-SNATCHING': 0,
                    'POCKET-PICKING': 0,
                    'THEFT FROM COIN-OPERATED MACHINE': 0}
    vCrimes = []
    nvCrimes = []
    
    # Append crime into vCrimes and nvCrimes list
    for crime, severity in crime_types.items():
        if severity == 1:
            vCrimes.append(crime)
        else:
            nvCrimes.append(crime)
    
    lrpd['CRIME_TYPE'] = None
    lrpd['RISK_TYPE'] = None

    # Goes through and classifies each crime and it's risk type in both TEXT and BC in the DF
    lrpd.loc[lrpd['OFFENSE_DESCRIPTION'].isin(vCrimes), 'CRIME_TYPE'] = "VIOLENT"
    lrpd.loc[lrpd['OFFENSE_DESCRIPTION'].isin(nvCrimes), 'CRIME_TYPE'] = "NONVIOLENT"
    lrpd.loc[lrpd['OFFENSE_DESCRIPTION'].isin(vCrimes), 'RISK_TYPE'] = "HIGH"
    lrpd.loc[(lrpd['OFFENSE_DESCRIPTION'].isin(nvCrimes)) & (lrpd['WEAPON_TYPE'] != "NO WEAPON"), 'RISK_TYPE'] = "HIGH"
    lrpd.loc[(lrpd['OFFENSE_DESCRIPTION'].isin(nvCrimes)) & (lrpd['WEAPON_TYPE'] == "NO WEAPON"), 'RISK_TYPE'] = "LOW"
    lrpd["RISK_TYPE_BC"] = lrpd["RISK_TYPE"].map({'HIGH': 1, 'LOW': 0})

    return lrpd

lrpd = assessCrime(lrpd)


### Visualizing current modifications:

In [37]:
lrpd.tail()

Unnamed: 0_level_0,INCIDENT_DATE,LOCATION_DISTRICT,OFFENSE_DESCRIPTION,WEAPON_TYPE,ZIP,LATITUDE,LONGITUDE,WEEK_OF_MONTH,YEAR,DAY,DAY_OF_YEAR,MONTH,CRIME_TYPE,RISK_TYPE,RISK_TYPE_BC
INCIDENT_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-303291,2022-12-08 00:00:00,73.0,THEFT FROM MOTOR VEHICLE,NO WEAPON,72223.0,0.0,0.0,1,2022,8,342,12,NONVIOLENT,LOW,0
2022-303160,2022-10-29 09:00:00,62.0,ALL OTHER LARCENY,NO WEAPON,72227.0,0.0,0.0,4,2022,29,302,10,NONVIOLENT,LOW,0
2022-302870,2022-10-23 14:30:00,72.0,THEFT FROM MOTOR VEHICLE,NO WEAPON,72211.0,0.0,0.0,3,2022,23,296,10,NONVIOLENT,LOW,0
2022-302451,2022-09-07 14:00:00,92.0,SHOPLIFTING,NO WEAPON,72210.0,0.0,0.0,1,2022,7,250,9,NONVIOLENT,LOW,0
2022-301998,2022-07-21 08:00:00,72.0,THEFT FROM MOTOR VEHICLE,NO WEAPON,72211.0,0.0,0.0,3,2022,21,202,7,NONVIOLENT,LOW,0


### Saving the modified DataSet to a new CSV:

In [38]:
# lrpd.to_csv("lrpd-clean.csv")

In [39]:
lrpd.shape

(86844, 15)

In [40]:
lrpd["INCIDENT_DATE"] = pd.to_datetime(lrpd["INCIDENT_DATE"])
df = lrpd.groupby(pd.Grouper(key='INCIDENT_DATE', freq='D')).size().reset_index(name='INCIDENT_COUNT')
df.columns = ['ds', 'y']

In [41]:
weather = pd.read_csv('../data/weather.csv')
best_data = weather[weather['NAME'] == 'LITTLE ROCK AIRPORT ADAMS FIELD, AR US']
columns_to_keep = ['DATE', 'AWND', 'PRCP', 'SNWD', 'SNOW', 'TMAX', 'TMIN']
best_data = best_data[columns_to_keep]
best_data.fillna(0, inplace=True)
best_data = best_data[best_data['DATE'] < '2023-02-21']
best_data['DATE'] = pd.to_datetime(best_data['DATE'])
best_df = pd.merge(df, best_data, left_on='ds', right_on='DATE', how='inner')
best_df.drop(columns=['DATE'], inplace=True)

In [42]:
df = pd.read_csv('../data/XGB-lrpd.csv')
best_df = pd.read_csv('data-for-prophet.csv')
len(df), len(best_df)
df.drop(['date_offset'], inplace=True, axis=1)
best_df.drop(['ds'], inplace=True, axis=1)
best_df.drop(['y'], inplace=True, axis=1)
# merge the two dataframes 
df = pd.concat([df, best_df], axis=1)

In [44]:
# df.to_csv('final-lrpd-data.csv', index=False)
df

Unnamed: 0,ds,y,is_holiday,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,is_weekend,is_weekday,season,AWND,PRCP,SNWD,SNOW,TMAX,TMIN
0,2017-01-01,54,1,6,1,1,2017,1,1,52,1,0,4,6.93,0.01,0.0,0.0,46.0,43.0
1,2017-01-02,45,1,0,1,1,2017,2,2,1,0,1,4,5.37,0.20,0.0,0.0,56.0,45.0
2,2017-01-03,51,0,1,1,1,2017,3,3,1,0,1,4,7.61,0.00,0.0,0.0,57.0,40.0
3,2017-01-04,48,0,2,1,1,2017,4,4,1,0,1,4,9.62,0.00,0.0,0.0,41.0,26.0
4,2017-01-05,39,0,3,1,1,2017,5,5,1,0,1,4,8.50,0.00,0.0,0.0,37.0,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2237,2023-02-16,42,0,3,1,2,2023,47,16,7,0,1,4,16.11,0.51,0.0,0.0,72.0,36.0
2238,2023-02-17,32,0,4,1,2,2023,48,17,7,0,1,4,9.62,0.00,0.0,0.0,51.0,34.0
2239,2023-02-18,25,0,5,1,2,2023,49,18,7,1,0,4,5.14,0.00,0.0,0.0,56.0,27.0
2240,2023-02-19,36,0,6,1,2,2023,50,19,7,1,0,4,10.51,0.00,0.0,0.0,68.0,38.0
