In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import pytz

In [2]:
# Loading the dataset

filename = "311_Service_Requests_2yrs.csv"
df = pd.read_csv(filename)
display(df.head(5))

Unnamed: 0,service_request_id,requested_date,updated_date,closed_date,status_description,source,service_name,agency_responsible,address,comm_code,comm_name,location_type,longitude,latitude,point
0,23-00000797,2023/01/02 12:00:00 AM,2023/01/10 12:00:00 AM,2023/01/10 12:00:00 AM,Closed,Other,Finance - ONLINE TIPP Agreement Request,CFOD - Finance,,,,,,,
1,23-00001045,2023/01/02 12:00:00 AM,2024/01/11 12:00:00 AM,2024/01/11 12:00:00 AM,Closed,Other,Active Living Program Application,CS - Recreation and Social Programs,,,,,,,
2,23-00001163,2023/01/02 12:00:00 AM,2023/01/06 12:00:00 AM,2023/01/06 12:00:00 AM,Closed,Phone,CN - Registered Social Worker Letter,CS - Calgary Neighbourhoods,,,,,,,
3,23-00001191,2023/01/02 12:00:00 AM,2024/05/19 12:00:00 AM,2023/01/10 12:00:00 AM,Closed,Other,CT - Lost Property,OS - Calgary Transit,,,,,,,
4,23-00001584,2023/01/02 12:00:00 AM,2023/01/04 12:00:00 AM,2023/01/04 12:00:00 AM,Closed,Other,Recreation - Arena Booking Application,CS - Calgary Recreation,,,,,,,


In [35]:
# Cleaning: Drop rows

df = df.iloc[:1062842]

In [4]:
#Shape

df.shape

(1062842, 15)

In [5]:
#Columns

df.columns

Index(['service_request_id', 'requested_date', 'updated_date', 'closed_date',
       'status_description', 'source', 'service_name', 'agency_responsible',
       'address', 'comm_code', 'comm_name', 'location_type', 'longitude',
       'latitude', 'point'],
      dtype='object')

In [6]:
#Datatypes

df.dtypes

service_request_id     object
requested_date         object
updated_date           object
closed_date            object
status_description     object
source                 object
service_name           object
agency_responsible     object
address               float64
comm_code              object
comm_name              object
location_type          object
longitude             float64
latitude              float64
point                  object
dtype: object

In [7]:
# Identifying number of missing values 

missing_values = df.isna().sum()
missing_values

service_request_id          0
requested_date              0
updated_date                0
closed_date             32093
status_description          0
source                      0
service_name                0
agency_responsible        158
address               1062842
comm_code               73834
comm_name               73833
location_type           73626
longitude               73845
latitude                73845
point                   73845
dtype: int64

In [8]:
# Dropping address column

df = df.drop('address',axis = 1)
df.shape

(1062842, 14)

In [9]:
# Deriving new columns from requested date

df['requested_date'] = pd.to_datetime(df['requested_date'], format = '%Y/%m/%d %I:%M:%S %p')
print(f"Data type of 'requested_date': {df['requested_date'].dtype}")

df['request_year'] = df['requested_date'].dt.year
df['request_month'] = df['requested_date'].dt.month
df['request_day'] = df['requested_date'].dt.day


Data type of 'requested_date': datetime64[ns]


In [10]:
df['updated_date'] = pd.to_datetime(df['updated_date'], format = '%Y/%m/%d %I:%M:%S %p')

df['update_year'] = df['updated_date'].dt.year
df['update_month'] = df['updated_date'].dt.month
df['update_day'] = df['updated_date'].dt.day


In [36]:
# Deriving new columns from requested date

df['closed_date'] = pd.to_datetime(df['closed_date'], format = '%Y/%m/%d %I:%M:%S %p')
print("Dataype:", df['closed_date'].dtype)

# Converting null values to NaT
df['closed_date'] = df['closed_date'].fillna(pd.NaT)


df['closed_year'] = df['closed_date'].dt.year
df['closed_month'] = df['closed_date'].dt.month
df['closed_day'] = df['closed_date'].dt.day

# Replacing null values in derived columns with 0 and converting the column values to int type

df.loc[df['closed_date'].isna(), ['closed_year', 'closed_month', 'closed_day']] = 0
df[['closed_year', 'closed_month', 'closed_day']] = df[['closed_year', 'closed_month', 'closed_day']].astype('Int32')

#display(df.head(5))

Dataype: datetime64[ns]


In [12]:
# Checking datatypes ofconverted columns

df.dtypes

service_request_id            object
requested_date        datetime64[ns]
updated_date          datetime64[ns]
closed_date           datetime64[ns]
status_description            object
source                        object
service_name                  object
agency_responsible            object
comm_code                     object
comm_name                     object
location_type                 object
longitude                    float64
latitude                     float64
point                         object
request_year                   int32
request_month                  int32
request_day                    int32
update_year                    int32
update_month                   int32
update_day                     int32
closed_year                    Int32
closed_month                   Int32
closed_day                     Int32
dtype: object

In [13]:
# Calculating closing delay and creating new inttype column for closing delay

df['closing_delay'] = df['closed_date'] - df['requested_date']
print("1",df['closing_delay'].dtype)
df['closing_delay'] = df['closing_delay'].dt.days
print(df['closing_delay'].dtype)
df['closing_delay'] = df['closing_delay'].astype('Int64')

print(df['closing_delay'].dtype)

#Check
#df_subset = df.iloc[1050:1116]  # Python slicing includes 150 but excludes 166
#display(df_subset)

1 timedelta64[ns]
float64
Int64


Unnamed: 0,service_request_id,requested_date,updated_date,closed_date,status_description,source,service_name,agency_responsible,comm_code,comm_name,...,request_year,request_month,request_day,update_year,update_month,update_day,closed_year,closed_month,closed_day,closing_delay
1050,23-00002677,2023-01-02,2023-01-09,2023-01-09,Closed,Other,CBS Inspection - Residential Improvement Proje...,PD - Calgary Building Services,CPF,COPPERFIELD,...,2023,1,2,2023,1,9,2023,1,9,7
1051,23-00002779,2023-01-02,2024-05-18,2023-01-09,Duplicate (Closed),App,Bylaw - Snow and Ice on Sidewalk,CS - Emergency Management and Community Safety,CIT,CITADEL,...,2023,1,2,2024,5,18,2023,1,9,7
1052,23-00002558,2023-01-02,2023-02-01,2023-01-09,Closed,Phone,AS - Animal Damaging Pet or Property,CS - Calgary Community Standards,HAW,HAWKWOOD,...,2023,1,2,2023,2,1,2023,1,9,7
1053,23-00000852,2023-01-02,2023-01-10,2023-01-10,Closed,Other,Recreation - School Facility Booking Application,CS - Calgary Recreation,,,...,2023,1,2,2023,1,10,2023,1,10,8
1054,23-00000904,2023-01-02,2023-01-10,2023-01-10,Closed,Other,Finance - ONLINE TIPP Agreement Request,CFOD - Finance,CHW,CHARLESWOOD,...,2023,1,2,2023,1,10,2023,1,10,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,23-00002540,2023-01-02,2023-01-10,2023-01-10,Closed,Phone,Finance - TIPP Agreement Request,CFOD - Finance,SOM,SOMERSET,...,2023,1,2,2023,1,10,2023,1,10,8
1112,23-00002638,2023-01-02,2023-01-10,2023-01-10,Closed,Other,Roads - Curb Lowering for Existing Driveways,TRAN - Roads,HAM,HAMPTONS,...,2023,1,2,2023,1,10,2023,1,10,8
1113,23-00002696,2023-01-02,2023-01-10,2023-01-10,Closed,Phone,Finance - Property Tax Account Inquiry,CFOD - Finance,SIL,SILVER SPRINGS,...,2023,1,2,2023,1,10,2023,1,10,8
1114,23-00002722,2023-01-02,2023-01-10,2023-01-10,Closed,Phone,Bylaw - Snow and Ice on Sidewalk,CS - Calgary Community Standards,BDO,BONAVISTA DOWNS,...,2023,1,2,2023,1,10,2023,1,10,8


In [15]:
# Identifying duplicate requests using regex and creating new column 

df['duplicate_request'] = df['status_description'].str.contains(r'Duplicate \(Closed\)', regex=True)

# Convert the boolean values to 'Yes'/'No'
df['duplicate_request'] = df['duplicate_request'].replace({True: 'Yes', False: 'No'})

# Check
#df_subset = df.iloc[150:166]  # Python slicing includes 150 but excludes 166
#display(df_subset)


In [16]:
# Check max date value in requested_date column

max_value = df['requested_date'].max()
print(max_value)

2024-12-31 00:00:00


In [17]:
# Season Categorisation of "Requests"

# Defining Calgary's timezone
calgary_tz = pytz.timezone('America/Edmonton')  

# Exact UTC times for solstices and equinoxes (taken from Govt of Canada Website)

seasons_utc = {
    'Spring_2023': '2023-03-20 21:24:00',
    'Summer_2023': '2023-06-21 14:57:00',
    'Autumn_2023': '2023-09-23 06:50:00',
    'Winter_2023': '2023-12-22 03:27:00',
    'Spring_2024': '2024-03-20 03:06:00',
    'Summer_2024': '2024-06-20 20:50:00',
    'Autumn_2024': '2024-09-22 12:43:00',
    'Winter_2024': '2024-12-21 09:20:00'
}

# Converting the UTC times to Calgary local time

seasons = {}

for season, utc_time_str in seasons_utc.items():
    
    # Converting the UTC string into a datetime object
    
    utc_time = datetime.strptime(utc_time_str, '%Y-%m-%d %H:%M:%S')
    utc_time = pytz.utc.localize(utc_time) 
    
    # Converting to Calgary local time
    local_time = utc_time.astimezone(calgary_tz)
    
    # Saving the result in the dictionary
    seasons[season] = local_time
    
for key, value in seasons.items():
#print(f"{key}: {value.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{key}: {value}")

Spring_2023: 2023-03-20 15:24:00-06:00
Summer_2023: 2023-06-21 08:57:00-06:00
Autumn_2023: 2023-09-23 00:50:00-06:00
Winter_2023: 2023-12-21 20:27:00-07:00
Spring_2024: 2024-03-19 21:06:00-06:00
Summer_2024: 2024-06-20 14:50:00-06:00
Autumn_2024: 2024-09-22 06:43:00-06:00
Winter_2024: 2024-12-21 02:20:00-07:00


In [18]:

# Keeping the local time but making it aware for requested_date columns

if df['requested_date'].dt.tz is None:
    df['new_requested_date'] = df['requested_date'].dt.tz_localize('America/Edmonton')

print(df['new_requested_date'].head())

0   2023-01-02 00:00:00-07:00
1   2023-01-02 00:00:00-07:00
2   2023-01-02 00:00:00-07:00
3   2023-01-02 00:00:00-07:00
4   2023-01-02 00:00:00-07:00
Name: new_requested_date, dtype: datetime64[ns, America/Edmonton]


In [19]:
# Categorizing into seasons and creating a new 'season' column

# Assigning seasons based on request date

def get_season(request_date):
    for season, season_date in seasons.items():
        if request_date < season_date:
            return season
    return 'Winter_2024'  # Default to the latest season

# Creating new season column 

df['Season'] = df['new_requested_date'].apply(get_season)

display(df)

Unnamed: 0,service_request_id,requested_date,updated_date,closed_date,status_description,source,service_name,agency_responsible,comm_code,comm_name,...,update_year,update_month,update_day,closed_year,closed_month,closed_day,closing_delay,duplicate_request,new_requested_date,Season
0,23-00000797,2023-01-02,2023-01-10,2023-01-10,Closed,Other,Finance - ONLINE TIPP Agreement Request,CFOD - Finance,,,...,2023,1,10,2023,1,10,8,No,2023-01-02 00:00:00-07:00,Spring_2023
1,23-00001045,2023-01-02,2024-01-11,2024-01-11,Closed,Other,Active Living Program Application,CS - Recreation and Social Programs,,,...,2024,1,11,2024,1,11,374,No,2023-01-02 00:00:00-07:00,Spring_2023
2,23-00001163,2023-01-02,2023-01-06,2023-01-06,Closed,Phone,CN - Registered Social Worker Letter,CS - Calgary Neighbourhoods,,,...,2023,1,6,2023,1,6,4,No,2023-01-02 00:00:00-07:00,Spring_2023
3,23-00001191,2023-01-02,2024-05-19,2023-01-10,Closed,Other,CT - Lost Property,OS - Calgary Transit,,,...,2024,5,19,2023,1,10,8,No,2023-01-02 00:00:00-07:00,Spring_2023
4,23-00001584,2023-01-02,2023-01-04,2023-01-04,Closed,Other,Recreation - Arena Booking Application,CS - Calgary Recreation,,,...,2023,1,4,2023,1,4,2,No,2023-01-02 00:00:00-07:00,Spring_2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062837,24-00979069,2024-12-31,2025-01-27,2025-01-27,Closed,Other,DBBS - RIM - Property Research,"PDS - Development, Business and Building Services",ASP,ASPEN WOODS,...,2025,1,27,2025,1,27,27,No,2024-12-31 00:00:00-07:00,Winter_2024
1062838,24-00978685,2024-12-31,2025-01-21,2025-01-21,Closed,Other,Bylaw - Vehicle Concerns,CS - Emergency Management and Community Safety,TEM,TEMPLE,...,2025,1,21,2025,1,21,21,No,2024-12-31 00:00:00-07:00,Winter_2024
1062839,24-00977260,2024-12-31,2025-01-21,2025-01-02,Open,Other,WRS - Cart Management,OS - Waste and Recycling Services,HUN,HUNTINGTON HILLS,...,2025,1,21,2025,1,2,2,No,2024-12-31 00:00:00-07:00,Winter_2024
1062840,24-00978925,2024-12-31,2025-01-21,2025-01-21,Closed,Other,CT AC - Trip Feedback - CTA,OS - Calgary Transit,DNW,DOWNTOWN WEST END,...,2025,1,21,2025,1,21,21,No,2024-12-31 00:00:00-07:00,Winter_2024


In [20]:
# Closed requests taken into account

statistics_closing_delay = df['closing_delay'].describe()
print(statistics_closing_delay)

count    1030749.0
mean     12.630157
std      39.587334
min            0.0
25%            1.0
50%            3.0
75%            8.0
max          740.0
Name: closing_delay, dtype: Float64


In [21]:
na_count = df['closing_delay'].isna().sum()
print(na_count)

32093


In [25]:
no_open_requests = len(df[df['status_description'] == 'Open'])
no_open_requests

32781

In [23]:
no_closed_dates = df['closed_date'].isna().sum()
no_closed_dates


32093

In [34]:
filtered_df = df[df['status_description'] == 'Open']
grouped_data = filtered_df.groupby('status_description')['closed_date']
pd.set_option('display.max_rows', None)
display(grouped_data.head(300))


105            NaT
115            NaT
237            NaT
1237           NaT
1256           NaT
1257           NaT
1260           NaT
1263           NaT
1265           NaT
1266           NaT
1270           NaT
1271           NaT
1273           NaT
1276           NaT
1277           NaT
1284           NaT
1287           NaT
1290           NaT
1378           NaT
1386           NaT
1387           NaT
1388           NaT
1389           NaT
1392           NaT
1516           NaT
1718    2023-01-04
3078           NaT
3108           NaT
3111           NaT
3115           NaT
3135           NaT
3418           NaT
3550    2023-01-04
4725           NaT
4766           NaT
4805           NaT
4852           NaT
4854           NaT
4902           NaT
4906           NaT
4909           NaT
4910           NaT
5014    2023-01-06
5105           NaT
5416           NaT
5657           NaT
5978    2023-01-09
6046           NaT
6279           NaT
6320           NaT
6326           NaT
6327           NaT
6331        

In [None]:
# To solve this data discrepency, change the "status" of requests with closed dates to "Closed".

