# CST 8502 FINAL PROJECT

By:  
Charles-Antoine Campeau  
Joshua Ayyasamy  
Mubarak husain Shaikh  
Curtis Sloan 

Submitted to Dr. Anu Thomas in partial fulfillment of the requirements of CST 8502 

Algonquin College Artificial Intelligence Software Development

2023-11-26

In [None]:
import pandas as pd
import numpy as np
import math
import re
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import seaborn as sns


## DATA PREPARATION

In [None]:
# Read the bicycle-thefts CSV file into a DataFrame
theftCsvPath = "bicycle-thefts - 4326.csv"
bicycleTheftData = pd.read_csv(theftCsvPath)

In [None]:
# Attributes considered unnecessary for the analysis; dropped in one call
unusedColumns = [
    "_id", "EVENT_UNIQUE_ID", "OCC_DATE", "OCC_YEAR", "OCC_DAY", "OCC_DOY",
    "REPORT_DATE", "REPORT_YEAR", "REPORT_MONTH", "REPORT_DOW", "REPORT_DAY",
    "REPORT_DOY", "REPORT_HOUR", "LOCATION_TYPE", "BIKE_MODEL", "STATUS",
    "geometry",
]
bicycleTheftData = bicycleTheftData.drop(columns=unusedColumns)

## DATA CLEANING & CONSTRUCTION
The data cleaning and construction work was divided among all team members. Each member's contribution is indicated below.

### Charles

In [None]:
# Load the semicolon-separated BIKE_MAKE replacement table and index it by
# the NEW column (the good values)
makesData = pd.read_csv("MAKESREPLACEMENT.CSV", delimiter=";").set_index("NEW")

# Each OLD cell is a comma-separated string; split it into a list
makesData["OLD"] = makesData["OLD"].str.split(",")

# Flatten the lists into a Series with one entry per row, keeping the NEW
# value as the index label of each entry
makesSeries = makesData["OLD"].explode()

In [None]:
def GetMake(make, makes=None):
    """Correct wrongfully inputted bicycle makes

    Parameter:
    ------------
    make: The instance bicycle make
    makes: Optional Series of wrongly written makes whose index holds the
        properly written make; defaults to the module-level ``makesSeries``

    Return:
    -----------
    The properly written make (the index label of the first entry of
    ``makes`` that contains ``make`` as a substring), or ``make`` unchanged
    when it is not a string, is empty, or no entry contains it
    """

    # Missing values (NaN/None) and empty strings cannot be looked up: an
    # empty pattern would match every entry and return an arbitrary make
    if not isinstance(make, str) or not make:
        return make

    if makes is None:
        makes = makesSeries

    # Literal (non-regex) substring search; na=False keeps the mask strictly
    # boolean even when the lookup Series itself has missing entries
    mask = makes.str.contains(make, regex=False, na=False).to_numpy(dtype=bool)
    matches = makes.index[mask]

    return matches[0] if len(matches) else make

In [None]:
# Run every BIKE_MAKE value through GetMake and store the corrected column
correctedMakes = bicycleTheftData["BIKE_MAKE"].apply(GetMake)
bicycleTheftData["BIKE_MAKE"] = correctedMakes

### Joshua

In [None]:
# Fill the missing BIKE_SPEED values with the mean of the column
meanBikeSpeed = bicycleTheftData['BIKE_SPEED'].mean()
bicycleTheftData['BIKE_SPEED'] = bicycleTheftData['BIKE_SPEED'].fillna(meanBikeSpeed)

In [None]:
# Quartile boundaries of BIKE_COST
bikeCost = bicycleTheftData['BIKE_COST']
low, average, high = (bikeCost.quantile(q) for q in (.25, .5, .75))

# (condition, label) pairs; np.select takes the label of the first condition
# that holds for each row and falls back to the default otherwise
costBands = [
    (bikeCost.isna(), 'NK'),
    (bikeCost <= low, 'Low'),
    ((bikeCost > low) & (bikeCost <= average), 'Average'),
    ((bikeCost > average) & (bikeCost <= high), 'High'),
    (bikeCost > high, 'Luxury'),
]
bicycleTheftData['BIKE_COST_CATEGORY'] = np.select(
    [condition for condition, _ in costBands],
    [label for _, label in costBands],
    default='Unknown',
)

In [None]:
# The numeric cost column is removed now that BIKE_COST_CATEGORY exists
bicycleTheftData = bicycleTheftData.drop(columns=['BIKE_COST'])

### Mubarak

In [None]:
# Replace missing values in PRIMARY_OFFENCE with 'Unknown', then convert the
# entries to lowercase for uniformity.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place call
# works on an intermediate object and, with pandas Copy-on-Write enabled,
# leaves the DataFrame unchanged.
bicycleTheftData['PRIMARY_OFFENCE'] = (
    bicycleTheftData['PRIMARY_OFFENCE'].fillna('Unknown').str.lower()
)

In [None]:
# One instance has its colour recorded as '18'; treat that anomaly, as well as
# missing values, as 'Unknown'.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place call
# works on an intermediate object and, with pandas Copy-on-Write enabled,
# leaves the DataFrame unchanged.
bicycleTheftData['BIKE_COLOUR'] = (
    bicycleTheftData['BIKE_COLOUR'].replace('18', 'Unknown').fillna('Unknown')
)

In [None]:
# Colour categories and the substrings that identify them.  Built once instead
# of on every call; the order matters because the first category with a
# matching substring wins.
_COLOR_CATEGORIES = (
    ('black', ('black', 'blk', 'blac')),
    ('blue', ('blue', 'blu')),
    ('brown', ('brown', 'brn')),
    ('beige', ('bge', 'beige')),
    ('gold', ('gold', 'gld')),
    ('green', ('green', 'grn')),
    ('grey', ('grey', 'gray', 'gry')),
    ('orange', ('orange', 'ong')),
    ('pink', ('pink', 'pnk')),
    ('purple', ('purple', 'purp')),
    ('red', ('red', 'rd')),
    ('silver', ('silver', 'sil')),
    ('turquoise', ('turquoise', 'trq')),
    ('white', ('white', 'whi')),
    ('yellow', ('yellow', 'yel')),
)


# Function to categorize colors
def categorize_color(color):
    """Map a free-form colour value to one of the known colour categories.

    Parameter:
    ------------
    color: The recorded colour; matching is case-insensitive and by substring

    Return:
    -----------
    The name of the first category (in table order) one of whose substrings
    occurs in ``color``, or 'Unknown' when nothing matches or ``color`` is
    not a string (e.g. NaN or None)
    """
    # Non-string values (missing data) have no .lower(); report them as
    # 'Unknown' instead of raising
    if not isinstance(color, str):
        return 'Unknown'

    lowered = color.lower()  # Convert to lowercase for uniformity

    for category, tokens in _COLOR_CATEGORIES:
        if any(token in lowered for token in tokens):
            return category

    return 'Unknown'

In [None]:
# Replace each BIKE_COLOUR value with the category returned by categorize_color
colourCategories = bicycleTheftData['BIKE_COLOUR'].apply(categorize_color)
bicycleTheftData['BIKE_COLOUR'] = colourCategories

### Curtis

In [None]:
def month_to_number(data):
    """Return the 1-based position of an English month name.

    Only the exact capitalised names ("January" ... "December") are
    recognised; any other value yields None.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )

    for number, name in enumerate(month_names, start=1):
        if name == data:
            return number

    return None

# Convert the OCC_MONTH month names to their month numbers
monthNumbers = bicycleTheftData['OCC_MONTH'].apply(month_to_number)
bicycleTheftData['OCC_MONTH'] = monthNumbers

In [None]:
def day_to_number(data):
    """Return the 1-based position of an English weekday name, Monday first.

    Only the exact capitalised names ("Monday" ... "Sunday") are recognised;
    any other value yields None.
    """
    day_names = (
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    )

    for number, name in enumerate(day_names, start=1):
        if name == data:
            return number

    return None

# Convert the OCC_DOW weekday names to their day numbers
dayNumbers = bicycleTheftData["OCC_DOW"].apply(day_to_number)
bicycleTheftData["OCC_DOW"] = dayNumbers

In [None]:
# One-hot encode PREMISES_TYPE and BIKE_TYPE and append the indicator columns.
# Column labels are read from the fitted encoder (ohe.categories_) instead of
# a hand-written list: OneHotEncoder emits its columns in sorted category
# order, so a hand-written list in any other order (or of another length)
# attaches labels to the wrong columns or fails.
# NOTE(review): the labels are now the raw category values found in the data;
# confirm any later cell that selects these columns by name.
ohe = OneHotEncoder()

premisesMatrix = ohe.fit_transform(bicycleTheftData[['PREMISES_TYPE']]).toarray()
encoded = pd.DataFrame(
    premisesMatrix,
    columns=[str(category) for category in ohe.categories_[0]],
    # Reuse the source index so join() pairs each row with its own encoding
    index=bicycleTheftData.index,
)
bicycleTheftData = bicycleTheftData.join(encoded)


bikeTypeMatrix = ohe.fit_transform(bicycleTheftData[['BIKE_TYPE']]).toarray()
encoded = pd.DataFrame(
    bikeTypeMatrix,
    columns=[str(category) for category in ohe.categories_[0]],
    index=bicycleTheftData.index,
)
bicycleTheftData = bicycleTheftData.join(encoded)

In [None]:
# Remove the two categorical columns that were one-hot encoded
bicycleTheftData = bicycleTheftData.drop(columns=['PREMISES_TYPE', 'BIKE_TYPE'])

In [None]:
def time_of_day(data):
    """Return the time-of-day label for an hour value.

    Missing values give "Unknown".  Otherwise the label is that of the first
    band whose upper bound is >= ``data``: up to 4 "Night", up to 8 "Dawn",
    up to 11 "Morning", up to 16 "Afternoon", up to 21 "Evening"; anything
    above 21 is "Night" again.
    """
    if pd.isnull(data):
        return "Unknown"

    # (inclusive upper bound, label), in ascending order
    bands = (
        (4, "Night"),
        (8, "Dawn"),
        (11, "Morning"),
        (16, "Afternoon"),
        (21, "Evening"),
    )
    for upper_bound, label in bands:
        if data <= upper_bound:
            return label

    return "Night"
    
# Derive the OCC_TOD label from OCC_HOUR, then drop the hour column
timeOfDayLabels = bicycleTheftData["OCC_HOUR"].apply(time_of_day)
bicycleTheftData["OCC_TOD"] = timeOfDayLabels
bicycleTheftData = bicycleTheftData.drop(columns=['OCC_HOUR'])