# Association Rule Mining

This notebook takes the merged, preprocessed service requests and mines association rules from them.

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
!pip install apyori
from apyori import apriori
%matplotlib inline
plt.style.use('ggplot')

# Helper that loads a CSV from a URL
def read_data_from_URL(url):
    """Read the CSV located at ``url`` with ``pd.read_csv`` and return the DataFrame."""
    return pd.read_csv(url)



In [2]:
# Load the merged, preprocessed service requests (the URL points at a zipped CSV)
# and preview the first rows.
serv_req_url = 'https://bitbucket.org/nthammad-uncc/knowledge_discovery_charlotte/raw/main/data/merged_requests.zip'
requests = read_data_from_URL(url=serv_req_url)
requests.head(n=5)

Unnamed: 0.1,Unnamed: 0,OBJECTID,DEPARTMENT,REQUEST_NO,REQUEST_CAT,REQUEST_TYPE,RECEIVED_DATE,RECEIVED_MONTH,RECEIVED_YEAR,SEASON,...,PERCENT HOUSEHOLD INCOME_Higher Income Households,PERCENT HOUSEHOLD INCOME_Retired Householders,INCOME AND BENEFITS_Median households income (dollars),INCOME AND BENEFITS_Mean households income (dollars),INCOME AND BENEFITS_Per capita Income,INCOME AND BENEFITS_Median earnings for workers (dollars),PERCENT INSURED_Population with health insurance,PERCENT INSURED_Population without health insurance,PERCENTAGE BELOW POVERTY LEVEL_All,HISTORIC_REDLINING
0,0,1,Solid Waste Services,6402056,NON_RECYCLABLE ITEMS,NON_RECYCLABLE ITEMS,2016-11-07 14:00:00,11,2016,4,...,0.210336,0.157483,79300.428571,111885.642857,47529.928571,43332.642857,0.917971,0.082029,7.135714,
1,1,2,Housing and NBHD Serv,6402064,UTILITY VERIFICATION LTR,UTILITY VERIFICATION LTR,2016-11-07 14:00:00,11,2016,4,...,0.084375,0.129942,63391.25,74852.45,29914.9,37226.3,0.869184,0.130816,8.05,
2,2,3,Solid Waste Services,6402082,NON_RECYCLABLE ITEMS,NON_RECYCLABLE ITEMS,2016-11-07 14:00:00,11,2016,4,...,0.03753,0.113167,41973.411765,51723.764706,19338.294118,23576.882353,0.814082,0.185918,25.811765,
3,3,4,Solid Waste Services,6402103,MISSED SERVICE,MISSED RECYCLING,2016-11-07 14:00:00,11,2016,4,...,0.053345,0.128969,46401.066667,59119.666667,22976.8,28399.6,0.834642,0.165358,16.96,
4,4,6,Finance/City,6402112,CWP,CWP REQUEST,2016-11-07 14:00:00,11,2016,4,...,0.050732,0.097692,42745.529412,53659.352941,23808.882353,29559.117647,0.825033,0.174967,20.682353,Y


In [3]:
# List every column name of the merged requests frame to see which fields are available
requests.columns.values

array(['Unnamed: 0', 'OBJECTID', 'DEPARTMENT', 'REQUEST_NO',
       'REQUEST_CAT', 'REQUEST_TYPE', 'RECEIVED_DATE', 'RECEIVED_MONTH',
       'RECEIVED_YEAR', 'SEASON', 'INTERNAL_FIELD_OBSERVATION',
       'NEIGHBORHOOD_PROFILE_AREA', 'BLOCK_NO', 'ZIP_CODE', 'X_COORD',
       'Y_COORD', 'LATITUDE', 'LONGITUDE', 'RECEIVED_DAY', 'TOTAL_CALLS',
       'ADDRESS_ID', 'CRIME_SCORE', 'EMPLOYED_All',
       'PERCENT EMPLOYED_In labor force',
       'PERCENT EMPLOYED_Not in labor force',
       'PERCENT EMPLOYED_Female Only', 'COMMUTING TO WORK_All',
       'PERCENT COMMUTING TO WORK_By Car',
       'PERCENT COMMUTING TO WORK_Public transportation',
       'PERCENT COMMUTING TO WORK_Walk',
       'PERCENT COMMUTING TO WORK_Other',
       'PERCENT COMMUTING TO WORK_Worked at home',
       'INCOME AND BENEFITS_Total households',
       'PERCENT HOUSEHOLD INCOME_Lower Income Households',
       'PERCENT HOUSEHOLD INCOME_Mid Income Households',
       'PERCENT HOUSEHOLD INCOME_Higher Income Househol

In [4]:
# Derive the weekday name of every request from its received timestamp, and
# store the month as a string so it is treated as a category (not a number)
# when the data is grouped and one-hot encoded later on.
requests['RECEIVED_DAY_OF_WEEK'] = (
    pd.to_datetime(requests['RECEIVED_DATE'])
    .dt
    .day_name()
)
requests['RECEIVED_MONTH'] = requests['RECEIVED_MONTH'].map(str)

In [5]:
#function to calculate if the income bracket is predominantly low, mid or high
def calculate_income_bracket(low,mid,high):
    """Return the predominant income bracket for one set of household shares.

    Parameters
    ----------
    low, mid, high : float
        Share of lower-, mid- and higher-income households respectively.

    Returns
    -------
    str or None
        'Low' when ``low`` is strictly greater than both other shares;
        otherwise 'Mid' when ``mid`` is strictly greater than ``high``;
        otherwise 'High' (so a mid/high tie resolves to 'High').
        ``None`` when any share is missing.
    """
    # Every comparison against NaN is False, so without this guard a row with
    # missing income data would fall through to the final branch and be
    # labelled 'High' even though nothing is known about it.
    if pd.isna(low) or pd.isna(mid) or pd.isna(high):
        return None
    if low > mid and low > high:
        return 'Low'
    if mid > high:
        return 'Mid'
    return 'High'

In [6]:
# Label every request with the predominant income bracket, computed row by row
# from the lower / mid / higher income household shares (in that order).
income_share_columns = [
    'PERCENT HOUSEHOLD INCOME_Lower Income Households',
    'PERCENT HOUSEHOLD INCOME_Mid Income Households',
    'PERCENT HOUSEHOLD INCOME_Higher Income Households',
]
requests['PREDOMINANT_INCOME_BRACKET'] = requests[income_share_columns].apply(
    lambda shares: calculate_income_bracket(*shares),
    axis=1,
)


In [7]:
# Count how many requests fall into each predominant income bracket
requests['PREDOMINANT_INCOME_BRACKET'].value_counts()

Mid     1491811
Low       29346
High       2904
Name: PREDOMINANT_INCOME_BRACKET, dtype: int64

In [8]:
# Round the crime score to three decimals, and turn the neighborhood profile
# area id into an integer-formatted string so it is handled as a category.
requests['CRIME_SCORE'] = requests['CRIME_SCORE'].map(lambda score: round(score, 3))
requests['NEIGHBORHOOD_PROFILE_AREA'] = requests['NEIGHBORHOOD_PROFILE_AREA'].map(
    lambda area_id: str(int(area_id))
)

In [9]:
# Bucket each crime score by its ratio to the highest score in the data:
# up to 0.33 -> 'Low', up to 0.66 -> 'Mid', anything else -> 'High'.
max_crime_score = requests['CRIME_SCORE'].max()


def _crime_level(score):
    """Map one crime score to 'Low' / 'Mid' / 'High' relative to ``max_crime_score``."""
    ratio = score / max_crime_score
    if ratio <= 0.33:
        return 'Low'
    if ratio <= 0.66:
        return 'Mid'
    return 'High'


requests['CRIME_INDEX'] = requests['CRIME_SCORE'].map(_crime_level)

In [10]:
# Preview the frame again to check the derived columns
# (RECEIVED_DAY_OF_WEEK, PREDOMINANT_INCOME_BRACKET, CRIME_INDEX)
requests.head()

Unnamed: 0.1,Unnamed: 0,OBJECTID,DEPARTMENT,REQUEST_NO,REQUEST_CAT,REQUEST_TYPE,RECEIVED_DATE,RECEIVED_MONTH,RECEIVED_YEAR,SEASON,...,INCOME AND BENEFITS_Mean households income (dollars),INCOME AND BENEFITS_Per capita Income,INCOME AND BENEFITS_Median earnings for workers (dollars),PERCENT INSURED_Population with health insurance,PERCENT INSURED_Population without health insurance,PERCENTAGE BELOW POVERTY LEVEL_All,HISTORIC_REDLINING,RECEIVED_DAY_OF_WEEK,PREDOMINANT_INCOME_BRACKET,CRIME_INDEX
0,0,1,Solid Waste Services,6402056,NON_RECYCLABLE ITEMS,NON_RECYCLABLE ITEMS,2016-11-07 14:00:00,11,2016,4,...,111885.642857,47529.928571,43332.642857,0.917971,0.082029,7.135714,,Monday,Mid,Low
1,1,2,Housing and NBHD Serv,6402064,UTILITY VERIFICATION LTR,UTILITY VERIFICATION LTR,2016-11-07 14:00:00,11,2016,4,...,74852.45,29914.9,37226.3,0.869184,0.130816,8.05,,Monday,Mid,Low
2,2,3,Solid Waste Services,6402082,NON_RECYCLABLE ITEMS,NON_RECYCLABLE ITEMS,2016-11-07 14:00:00,11,2016,4,...,51723.764706,19338.294118,23576.882353,0.814082,0.185918,25.811765,,Monday,Mid,Low
3,3,4,Solid Waste Services,6402103,MISSED SERVICE,MISSED RECYCLING,2016-11-07 14:00:00,11,2016,4,...,59119.666667,22976.8,28399.6,0.834642,0.165358,16.96,,Monday,Mid,Low
4,4,6,Finance/City,6402112,CWP,CWP REQUEST,2016-11-07 14:00:00,11,2016,4,...,53659.352941,23808.882353,29559.117647,0.825033,0.174967,20.682353,Y,Monday,Mid,Low


In [11]:
# Keep only requests from predominantly high-income areas and count them per
# (weekday, month, year, request category, neighborhood profile area, crime index).
high_income_mask = requests['PREDOMINANT_INCOME_BRACKET'] == 'High'
high_income_group_keys = [
    'RECEIVED_DAY_OF_WEEK',
    'RECEIVED_MONTH',
    'RECEIVED_YEAR',
    'REQUEST_CAT',
    'NEIGHBORHOOD_PROFILE_AREA',
    'CRIME_INDEX',
]
high_income_requests = (
    requests[high_income_mask]
    .groupby(high_income_group_keys)
    .size()
    .reset_index(name='REQUEST_COUNT')
)
high_income_requests.head()

Unnamed: 0,RECEIVED_DAY_OF_WEEK,RECEIVED_MONTH,RECEIVED_YEAR,REQUEST_CAT,NEIGHBORHOOD_PROFILE_AREA,CRIME_INDEX,REQUEST_COUNT
0,Friday,1,2017,BLOCKAGE,13,Low,1
1,Friday,1,2017,MISSED SERVICE,3,Mid,1
2,Friday,1,2017,NON_RECYCLABLE ITEMS,60,Low,4
3,Friday,1,2017,TRANSPORTATION,13,Low,1
4,Friday,1,2018,311 DOCUMENT,313,Low,1


In [12]:
# (rows, columns) of the aggregated high-income frame
high_income_requests.shape

(2532, 7)

In [13]:
# Summary statistics of the numeric columns (year and request count)
high_income_requests.describe()

Unnamed: 0,RECEIVED_YEAR,REQUEST_COUNT
count,2532.0,2532.0
mean,2018.711295,1.146919
std,1.540697,0.580653
min,2016.0,1.0
25%,2017.0,1.0
50%,2019.0,1.0
75%,2020.0,1.0
max,2021.0,5.0


In [14]:
# Column dtypes and non-null counts of the aggregated high-income frame
high_income_requests.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2532 entries, 0 to 2531
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   RECEIVED_DAY_OF_WEEK       2532 non-null   object
 1   RECEIVED_MONTH             2532 non-null   object
 2   RECEIVED_YEAR              2532 non-null   int64 
 3   REQUEST_CAT                2532 non-null   object
 4   NEIGHBORHOOD_PROFILE_AREA  2532 non-null   object
 5   CRIME_INDEX                2532 non-null   object
 6   REQUEST_COUNT              2532 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 138.6+ KB


In [15]:
# Distribution of the per-group request counts (at most the 60 most frequent values);
# used to pick the cut-off for the binary request-count flag.
high_income_requests['REQUEST_COUNT'].value_counts().head(60)

1    2331
2     109
4      43
3      31
5      18
Name: REQUEST_COUNT, dtype: int64

In [16]:
# Binary flag: 0 when the group has at most one request, 1 otherwise
# (written as "not <= 1" to mirror the element-wise rule exactly).
high_income_requests['BIN_REQUEST_COUNT'] = (
    ~high_income_requests['REQUEST_COUNT'].le(1)
).astype('int64')

In [17]:
# Drop the year and the raw count so that only categorical fields and the
# binary flag remain for one-hot encoding.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps this
# cell safe to re-run: a second run no longer raises KeyError because the
# columns are already gone.
high_income_requests = high_income_requests.drop(
    columns=['RECEIVED_YEAR', 'REQUEST_COUNT'],
    errors='ignore',
)
high_income_requests.head()

Unnamed: 0,RECEIVED_DAY_OF_WEEK,RECEIVED_MONTH,REQUEST_CAT,NEIGHBORHOOD_PROFILE_AREA,CRIME_INDEX,BIN_REQUEST_COUNT
0,Friday,1,BLOCKAGE,13,Low,0
1,Friday,1,MISSED SERVICE,3,Mid,0
2,Friday,1,NON_RECYCLABLE ITEMS,60,Low,1
3,Friday,1,TRANSPORTATION,13,Low,0
4,Friday,1,311 DOCUMENT,313,Low,0


In [18]:
# Distribution of crime index levels among the aggregated high-income groups
high_income_requests['CRIME_INDEX'].value_counts()

Low     2385
Mid      133
High      14
Name: CRIME_INDEX, dtype: int64

In [19]:
# One-hot encode the categorical columns; the numeric BIN_REQUEST_COUNT flag
# is passed through unchanged.
high_income_asc_rules = pd.get_dummies(data=high_income_requests)
high_income_asc_rules.head(n=5)

Unnamed: 0,BIN_REQUEST_COUNT,RECEIVED_DAY_OF_WEEK_Friday,RECEIVED_DAY_OF_WEEK_Monday,RECEIVED_DAY_OF_WEEK_Saturday,RECEIVED_DAY_OF_WEEK_Sunday,RECEIVED_DAY_OF_WEEK_Thursday,RECEIVED_DAY_OF_WEEK_Tuesday,RECEIVED_DAY_OF_WEEK_Wednesday,RECEIVED_MONTH_1,RECEIVED_MONTH_10,...,NEIGHBORHOOD_PROFILE_AREA_85,NEIGHBORHOOD_PROFILE_AREA_87,NEIGHBORHOOD_PROFILE_AREA_92,NEIGHBORHOOD_PROFILE_AREA_94,NEIGHBORHOOD_PROFILE_AREA_95,NEIGHBORHOOD_PROFILE_AREA_96,NEIGHBORHOOD_PROFILE_AREA_97,CRIME_INDEX_High,CRIME_INDEX_Low,CRIME_INDEX_Mid
0,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [20]:
# List the one-hot encoded column names (these become the items in the rules)
high_income_asc_rules.columns.values

array(['BIN_REQUEST_COUNT', 'RECEIVED_DAY_OF_WEEK_Friday',
       'RECEIVED_DAY_OF_WEEK_Monday', 'RECEIVED_DAY_OF_WEEK_Saturday',
       'RECEIVED_DAY_OF_WEEK_Sunday', 'RECEIVED_DAY_OF_WEEK_Thursday',
       'RECEIVED_DAY_OF_WEEK_Tuesday', 'RECEIVED_DAY_OF_WEEK_Wednesday',
       'RECEIVED_MONTH_1', 'RECEIVED_MONTH_10', 'RECEIVED_MONTH_11',
       'RECEIVED_MONTH_12', 'RECEIVED_MONTH_2', 'RECEIVED_MONTH_3',
       'RECEIVED_MONTH_4', 'RECEIVED_MONTH_5', 'RECEIVED_MONTH_6',
       'RECEIVED_MONTH_7', 'RECEIVED_MONTH_8', 'RECEIVED_MONTH_9',
       'REQUEST_CAT_311 DOCUMENT', 'REQUEST_CAT_ADMINISTRATIVE ACTIONS',
       'REQUEST_CAT_ANIMAL FOUND REPORT', 'REQUEST_CAT_BLOCKAGE',
       'REQUEST_CAT_BOARDED UP STRUCTURE', 'REQUEST_CAT_BULKY ITEM',
       'REQUEST_CAT_CART', 'REQUEST_CAT_CLEANLINESS',
       'REQUEST_CAT_COMPLAINT', 'REQUEST_CAT_CUSTOMER FEEDBACK',
       'REQUEST_CAT_CWP', 'REQUEST_CAT_DEAD ANIMAL COLLECTION',
       'REQUEST_CAT_ESCALATION REQUEST', 'REQUEST_CAT_GARBAGE',


In [21]:
# NOTE: this rebinds the name `apriori`, replacing the apyori function imported
# in the first cell; every `apriori(...)` call below uses mlxtend's implementation.
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [22]:
# Mine itemsets that occur in at least 7% of the high-income rows,
# reporting items by column name rather than by column index.
frequent_itemsets = apriori(
    high_income_asc_rules,
    min_support=0.07,
    use_colnames=True,
)

In [23]:
# Turn the frequent itemsets into rules, keeping those whose lift is at least 1
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(REQUEST_CAT_NON_RECYCLABLE ITEMS),(RECEIVED_DAY_OF_WEEK_Monday),0.50158,0.226698,0.114139,0.227559,1.003797,0.000432,1.001114
1,(RECEIVED_DAY_OF_WEEK_Monday),(REQUEST_CAT_NON_RECYCLABLE ITEMS),0.226698,0.50158,0.114139,0.503484,1.003797,0.000432,1.003836
2,(CRIME_INDEX_Low),(RECEIVED_DAY_OF_WEEK_Monday),0.941943,0.226698,0.213665,0.226834,1.0006,0.000128,1.000176
3,(RECEIVED_DAY_OF_WEEK_Monday),(CRIME_INDEX_Low),0.226698,0.941943,0.213665,0.942509,1.0006,0.000128,1.009838
4,(RECEIVED_DAY_OF_WEEK_Tuesday),(CRIME_INDEX_Low),0.203791,0.941943,0.192338,0.943798,1.00197,0.000378,1.033012


In [24]:
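# Unused alternative kept for reference only (apyori-style keyword arguments).
# Do not uncomment as is: the assignment would overwrite the `association_rules`
# function imported from mlxtend, and `apriori` now refers to mlxtend's
# implementation rather than apyori's.
#association_rules = apriori(high_income_asc_rules, min_support=0.01, min_confidence=0.2, min_lift=3, min_length=2)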

In [59]:
# Count requests per (month, year, request category, neighborhood profile area)
# across all income brackets.
all_request_group_keys = [
    'RECEIVED_MONTH',
    'RECEIVED_YEAR',
    'REQUEST_CAT',
    'NEIGHBORHOOD_PROFILE_AREA',
]
all_requests = (
    requests
    .groupby(all_request_group_keys)
    .size()
    .reset_index(name='REQUEST_COUNT')
)
all_requests.head()

Unnamed: 0,RECEIVED_MONTH,RECEIVED_YEAR,REQUEST_CAT,NEIGHBORHOOD_PROFILE_AREA,REQUEST_COUNT
0,1,2017,311 DOCUMENT,10,1
1,1,2017,311 DOCUMENT,100,7
2,1,2017,311 DOCUMENT,103,2
3,1,2017,311 DOCUMENT,107,1
4,1,2017,311 DOCUMENT,109,1


In [45]:
# Column dtypes and non-null counts of the aggregated frame
all_requests.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247125 entries, 0 to 247124
Data columns (total 5 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   RECEIVED_MONTH             247125 non-null  object
 1   RECEIVED_YEAR              247125 non-null  int64 
 2   REQUEST_CAT                247125 non-null  object
 3   NEIGHBORHOOD_PROFILE_AREA  247125 non-null  object
 4   REQUEST_COUNT              247125 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 9.4+ MB


In [60]:
# Summary statistics of the numeric columns (year and request count)
all_requests.describe()

Unnamed: 0,RECEIVED_YEAR,REQUEST_COUNT
count,247125.0,247125.0
mean,2018.582143,6.167166
std,1.516325,13.456791
min,2016.0,1.0
25%,2017.0,1.0
50%,2019.0,2.0
75%,2020.0,4.0
max,2021.0,303.0


In [47]:
# Distribution of the per-group request counts; used to pick the cut-off for the binary flag
all_requests['REQUEST_COUNT'].value_counts()

1      110166
2       42137
3       21338
4       13051
5        8620
        ...  
224         1
223         1
222         1
212         1
203         1
Name: REQUEST_COUNT, Length: 217, dtype: int64

In [58]:
# Flag groups with five or more requests (1) versus fewer (0), then drop the
# year and the raw count so only categorical fields and the flag remain.
#
# The flag assignment had been commented out, so on a fresh
# "Restart & Run All" BIN_REQUEST_COUNT was never created even though the
# later one-hot encoding and rules rely on it.
# The guard and errors='ignore' keep the cell safe to re-run: once
# REQUEST_COUNT has been dropped, the existing flag column is left untouched
# instead of raising KeyError.
if 'REQUEST_COUNT' in all_requests.columns:
    all_requests['BIN_REQUEST_COUNT'] = all_requests['REQUEST_COUNT'].apply(lambda x: 0 if x<5 else 1)
all_requests = all_requests.drop(
    columns=['RECEIVED_YEAR', 'REQUEST_COUNT'],
    errors='ignore',
)
all_requests.head()
all_requests.head()

Unnamed: 0,RECEIVED_MONTH,REQUEST_CAT,NEIGHBORHOOD_PROFILE_AREA,BIN_REQUEST_COUNT
0,1,311 DOCUMENT,10,0
1,1,311 DOCUMENT,100,1
2,1,311 DOCUMENT,103,0
3,1,311 DOCUMENT,107,0
4,1,311 DOCUMENT,109,0


In [50]:
# One-hot encode the categorical columns of the aggregated frame;
# numeric columns are passed through unchanged.
all_asc_rules = pd.get_dummies(data=all_requests)
all_asc_rules.head(n=5)

Unnamed: 0,BIN_REQUEST_COUNT,RECEIVED_MONTH_1,RECEIVED_MONTH_10,RECEIVED_MONTH_11,RECEIVED_MONTH_12,RECEIVED_MONTH_2,RECEIVED_MONTH_3,RECEIVED_MONTH_4,RECEIVED_MONTH_5,RECEIVED_MONTH_6,...,NEIGHBORHOOD_PROFILE_AREA_90,NEIGHBORHOOD_PROFILE_AREA_91,NEIGHBORHOOD_PROFILE_AREA_92,NEIGHBORHOOD_PROFILE_AREA_93,NEIGHBORHOOD_PROFILE_AREA_94,NEIGHBORHOOD_PROFILE_AREA_95,NEIGHBORHOOD_PROFILE_AREA_96,NEIGHBORHOOD_PROFILE_AREA_97,NEIGHBORHOOD_PROFILE_AREA_98,NEIGHBORHOOD_PROFILE_AREA_99
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Mine itemsets that occur in at least 5% of all aggregated rows,
# reporting items by column name rather than by column index.
all_frequent_itemsets = apriori(
    all_asc_rules,
    min_support=0.05,
    use_colnames=True,
)

In [52]:
# Turn the frequent itemsets into rules, keeping those whose lift is at least 1,
# and show up to the first 20 of them.
all_rules = association_rules(
    all_frequent_itemsets,
    metric="lift",
    min_threshold=1,
)
all_rules.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(REQUEST_CAT_NON_RECYCLABLE ITEMS),(BIN_REQUEST_COUNT),0.092698,0.244544,0.087563,0.944605,3.862714,0.064894,13.637494
1,(BIN_REQUEST_COUNT),(REQUEST_CAT_NON_RECYCLABLE ITEMS),0.244544,0.092698,0.087563,0.358066,3.862714,0.064894,1.413388
2,(BIN_REQUEST_COUNT),(REQUEST_CAT_RECYCLABLE ITEMS),0.244544,0.08626,0.060002,0.245363,2.844455,0.038908,1.210833
3,(REQUEST_CAT_RECYCLABLE ITEMS),(BIN_REQUEST_COUNT),0.08626,0.244544,0.060002,0.695595,2.844455,0.038908,2.481746


In [None]:
# Last 20 rules. NOTE(review): the recorded describe() output reports only 4 rules,
# in which case this shows the same rows as head(20) — confirm whether this cell is still needed.
all_rules.tail(20)

In [54]:
# Summary statistics of the rule metrics (support, confidence, lift, leverage, conviction)
all_rules.describe()

Unnamed: 0,antecedent support,consequent support,support,confidence,lift,leverage,conviction
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,0.167012,0.167012,0.073782,0.560907,3.353584,0.051901,4.685865
std,0.089566,0.089566,0.015912,0.319415,0.587892,0.015003,5.99374
min,0.08626,0.08626,0.060002,0.245363,2.844455,0.038908,1.210833
25%,0.091089,0.091089,0.060002,0.32989,2.844455,0.038908,1.362749
50%,0.168621,0.168621,0.073782,0.526831,3.353584,0.051901,1.947567
75%,0.244544,0.244544,0.087563,0.757847,3.862714,0.064894,5.270683
max,0.244544,0.244544,0.087563,0.944605,3.862714,0.064894,13.637494
