# Association Rule Mining

This Notebook takes the merged service requests and creates association rules on the data

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
!pip install apyori
from apyori import apriori
%matplotlib inline
plt.style.use('ggplot')

# Function to read data from URL
def read_data_from_URL(url):
    df=pd.read_csv(url)
    return df



In [2]:
# Read in the merged, preprocessed data
serv_req_url='https://bitbucket.org/nthammad-uncc/knowledge_discovery_charlotte/raw/main/data/merged_requests.zip'
requests = read_data_from_URL(serv_req_url)
requests.head()

Unnamed: 0.1,Unnamed: 0,OBJECTID,DEPARTMENT,REQUEST_NO,REQUEST_CAT,REQUEST_TYPE,RECEIVED_DATE,RECEIVED_MONTH,RECEIVED_YEAR,SEASON,...,PERCENT HOUSEHOLD INCOME_Higher Income Households,PERCENT HOUSEHOLD INCOME_Retired Householders,INCOME AND BENEFITS_Median households income (dollars),INCOME AND BENEFITS_Mean households income (dollars),INCOME AND BENEFITS_Per capita Income,INCOME AND BENEFITS_Median earnings for workers (dollars),PERCENT INSURED_Population with health insurance,PERCENT INSURED_Population without health insurance,PERCENTAGE BELOW POVERTY LEVEL_All,HISTORIC_REDLINING
0,0,1,Solid Waste Services,6402056,NON_RECYCLABLE ITEMS,NON_RECYCLABLE ITEMS,2016-11-07 14:00:00,11,2016,4,...,0.210336,0.157483,79300.428571,111885.642857,47529.928571,43332.642857,0.917971,0.082029,7.135714,
1,1,2,Housing and NBHD Serv,6402064,UTILITY VERIFICATION LTR,UTILITY VERIFICATION LTR,2016-11-07 14:00:00,11,2016,4,...,0.084375,0.129942,63391.25,74852.45,29914.9,37226.3,0.869184,0.130816,8.05,
2,2,3,Solid Waste Services,6402082,NON_RECYCLABLE ITEMS,NON_RECYCLABLE ITEMS,2016-11-07 14:00:00,11,2016,4,...,0.03753,0.113167,41973.411765,51723.764706,19338.294118,23576.882353,0.814082,0.185918,25.811765,
3,3,4,Solid Waste Services,6402103,MISSED SERVICE,MISSED RECYCLING,2016-11-07 14:00:00,11,2016,4,...,0.053345,0.128969,46401.066667,59119.666667,22976.8,28399.6,0.834642,0.165358,16.96,
4,4,6,Finance/City,6402112,CWP,CWP REQUEST,2016-11-07 14:00:00,11,2016,4,...,0.050732,0.097692,42745.529412,53659.352941,23808.882353,29559.117647,0.825033,0.174967,20.682353,Y


In [3]:
requests['RECEIVED_DAY_OF_WEEK'] = pd.to_datetime(requests['RECEIVED_DATE']).dt.day_name()

In [4]:
monthly_requests=requests.groupby(['RECEIVED_DAY_OF_WEEK','RECEIVED_MONTH','RECEIVED_YEAR','REQUEST_CAT','NEIGHBORHOOD_PROFILE_AREA','CRIME_SCORE']).size().reset_index(name='REQUEST_COUNT')
monthly_requests.head()

Unnamed: 0,RECEIVED_DAY_OF_WEEK,RECEIVED_MONTH,RECEIVED_YEAR,REQUEST_CAT,NEIGHBORHOOD_PROFILE_AREA,CRIME_SCORE,REQUEST_COUNT
0,Friday,1,2017,311 DOCUMENT,4.0,0.000444,1
1,Friday,1,2017,311 DOCUMENT,20.0,0.001333,1
2,Friday,1,2017,311 DOCUMENT,22.0,0.004147,1
3,Friday,1,2017,311 DOCUMENT,29.0,0.000296,1
4,Friday,1,2017,311 DOCUMENT,36.0,0.002073,1


In [5]:
monthly_requests['CRIME_SCORE']=monthly_requests['CRIME_SCORE'].apply(lambda x: round(x,3))
monthly_requests['NEIGHBORHOOD_PROFILE_AREA']=monthly_requests['NEIGHBORHOOD_PROFILE_AREA'].apply(lambda x: str(int(x)))

In [6]:
max_crime_score=monthly_requests['CRIME_SCORE'].max()
monthly_requests['CRIME_INDEX']=monthly_requests['CRIME_SCORE'].apply(lambda x: 'Low' if (x/max_crime_score)<=0.33 else ('Mid' if (x/max_crime_score)<=0.66 else 'High'))

In [7]:
monthly_requests.head()

Unnamed: 0,RECEIVED_DAY_OF_WEEK,RECEIVED_MONTH,RECEIVED_YEAR,REQUEST_CAT,NEIGHBORHOOD_PROFILE_AREA,CRIME_SCORE,REQUEST_COUNT,CRIME_INDEX
0,Friday,1,2017,311 DOCUMENT,4,0.0,1,Low
1,Friday,1,2017,311 DOCUMENT,20,0.001,1,Low
2,Friday,1,2017,311 DOCUMENT,22,0.004,1,Low
3,Friday,1,2017,311 DOCUMENT,29,0.0,1,Low
4,Friday,1,2017,311 DOCUMENT,36,0.002,1,Low


In [8]:
monthly_requests.describe()

Unnamed: 0,RECEIVED_MONTH,RECEIVED_YEAR,CRIME_SCORE,REQUEST_COUNT
count,538372.0,538372.0,538372.0,538372.0
mean,6.467298,2018.603728,-0.023879,2.83087
std,3.351832,1.519669,0.16208,3.848134
min,1.0,2016.0,-1.0,1.0
25%,4.0,2017.0,0.001,1.0
50%,7.0,2019.0,0.002,1.0
75%,9.0,2020.0,0.004,3.0
max,12.0,2021.0,0.025,74.0


In [9]:
monthly_requests['REQUEST_COUNT'].value_counts().head(60)

1     294970
2      87011
3      44956
4      26941
5      17940
6      13343
7      10118
8       7802
9       6169
10      5018
11      3959
12      3261
13      2608
14      2130
15      1838
16      1594
17      1194
18      1069
19       923
20       792
21       639
22       602
23       467
24       392
25       342
26       296
27       241
28       234
29       190
30       176
31       143
32       139
33       109
34        84
35        78
36        68
37        58
38        49
39        48
41        45
40        43
42        36
45        27
43        24
49        23
46        22
47        20
48        17
44        16
55        13
50        12
53        11
52        11
51         8
59         8
54         5
65         5
57         5
66         4
63         4
Name: REQUEST_COUNT, dtype: int64

In [10]:
monthly_requests['BIN_REQUEST_COUNT']=monthly_requests['REQUEST_COUNT'].apply(lambda x: 0 if x<=10 else 1)

In [11]:
monthly_requests.head()

Unnamed: 0,RECEIVED_DAY_OF_WEEK,RECEIVED_MONTH,RECEIVED_YEAR,REQUEST_CAT,NEIGHBORHOOD_PROFILE_AREA,CRIME_SCORE,REQUEST_COUNT,CRIME_INDEX,BIN_REQUEST_COUNT
0,Friday,1,2017,311 DOCUMENT,4,0.0,1,Low,0
1,Friday,1,2017,311 DOCUMENT,20,0.001,1,Low,0
2,Friday,1,2017,311 DOCUMENT,22,0.004,1,Low,0
3,Friday,1,2017,311 DOCUMENT,29,0.0,1,Low,0
4,Friday,1,2017,311 DOCUMENT,36,0.002,1,Low,0


In [12]:
asc_rules_requests=pd.get_dummies(monthly_requests,['RECEIVED_DAY_OF_WEEK','RECEIVED_MONTH','REQUEST_CAT','NEIGHBORHOOD_PROFILE_AREA'])
asc_rules_requests.drop(['RECEIVED_YEAR','CRIME_SCORE','REQUEST_COUNT'],axis=1,inplace=True)
asc_rules_requests.head()

Unnamed: 0,RECEIVED_MONTH,BIN_REQUEST_COUNT,RECEIVED_DAY_OF_WEEK_Friday,RECEIVED_DAY_OF_WEEK_Monday,RECEIVED_DAY_OF_WEEK_Saturday,RECEIVED_DAY_OF_WEEK_Sunday,RECEIVED_DAY_OF_WEEK_Thursday,RECEIVED_DAY_OF_WEEK_Tuesday,RECEIVED_DAY_OF_WEEK_Wednesday,RECEIVED_MONTH_311 DOCUMENT,...,REQUEST_CAT_93,REQUEST_CAT_94,REQUEST_CAT_95,REQUEST_CAT_96,REQUEST_CAT_97,REQUEST_CAT_98,REQUEST_CAT_99,NEIGHBORHOOD_PROFILE_AREA_High,NEIGHBORHOOD_PROFILE_AREA_Low,NEIGHBORHOOD_PROFILE_AREA_Mid
0,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [13]:
association_rules = apriori(asc_rules_requests, min_support=0.005, min_confidence=0.2, min_lift=3, min_length=2)

In [14]:
association_results = list(association_rules)