# Finding Association Rules

Q1:
Report 3 rules with support at least 0.2 and confidence at least 0.9. Specify
for each of them the support and the confidence.

In [107]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


# Raw records; each cell value is used as-is when the item labels are built below.
data_from_csv = pd.read_csv('mammographic_masses.csv')

# Legend for the nominal codes (the items below keep the raw codes, no mapping is applied):
#   Shape:    1=round, 2=oval, 3=lobular, 4=irregular
#   Margin:   1=circumscribed, 2=microlobulated, 3=obscured, 4=ill-defined, 5=spiculated
#   Severity: 0=benign, 1=malignant
#   '?' marks a missing value.

# Turn every row into one transaction made of "attribute=value" items.
data = data_from_csv.apply(
    lambda record: [f"{name}={value}" for name, value in zip(record.index, record)],
    axis=1,
).tolist()

# One-hot encoding: one column per distinct item, one row per transaction.
te = TransactionEncoder()
te.fit(data)
te_ary = te.transform(data)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Frequent itemsets with support >= 0.2, then rules with confidence >= 0.9 (Q1 thresholds).
frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.9)

# Display only the columns Q1 asks for.
rules[["antecedents", "consequents", "support", "confidence"]]

Unnamed: 0,antecedents,consequents,support,confidence
0,(Shape=4),(Density=3),0.37461,0.9
1,"(Margin=1, Density=3)",(BI-RADS=4),0.263267,0.900356
2,"(Margin=1, Severity=0)",(BI-RADS=4),0.299688,0.911392
3,"(Margin=1, BI-RADS=4)",(Severity=0),0.299688,0.911392
4,"(Shape=4, BI-RADS=5)",(Density=3),0.245578,0.904215
5,"(Shape=4, BI-RADS=5)",(Severity=1),0.246618,0.908046
6,"(Severity=1, Shape=4)",(Density=3),0.295525,0.901587
7,"(Margin=1, Severity=0, Density=3)",(BI-RADS=4),0.238293,0.927126
8,"(Margin=1, Density=3, BI-RADS=4)",(Severity=0),0.238293,0.905138
9,"(Shape=4, Density=3, BI-RADS=5)",(Severity=1),0.224766,0.915254


Q2: Determine some attributes and their values that are useful for deciding whether a given instance is benign (Severity=0) or malignant (Severity=1).

In [108]:
# Leave 'BI-RADS' out so that the mined rules are built from the remaining attributes only.
data_exclude = data_from_csv[['Age', 'Shape', 'Margin', 'Density', 'Severity']]

# One transaction of "attribute=value" items per row.
data = data_exclude.apply(
    lambda record: [f"{name}={value}" for name, value in zip(record.index, record)],
    axis=1,
).tolist()

# One-hot encoding of the transactions.
te = TransactionEncoder()
te.fit(data)
te_ary = te.transform(data)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Lower support threshold (0.1) than in Q1, same confidence threshold (0.9).
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.9)

# Keep the rules whose consequent contains a Severity item.
severity_items = {'Severity=0', 'Severity=1'}
predicts_severity = rules['consequents'].apply(
    lambda items: not severity_items.isdisjoint(items)
)
severity_rules = rules[predicts_severity]
severity_rules[["antecedents", "consequents", "support", "confidence"]]

Unnamed: 0,antecedents,consequents,support,confidence
5,"(Margin=1, Shape=2)",(Severity=0),0.136316,0.903448
6,"(Margin=1, Shape=1, Density=3)",(Severity=0),0.1436,0.901961


Q3: As discussed above, the BI-RADS assessment is not always accurate, and it might lead to an unnecessary breast biopsy. Provide one or two rules that give some evidence that the BI-RADS assessment is not always accurate. Explain your answer.

## Q4: Age=35 $\Rightarrow$ Severity=0
Write a script in Python to find the confidence and support of the following rule: Age=35 ⇒ Severity=0. Report its support and
confidence. Do you think this rule tells us something valuable or that we should ignore it as there is not enough evidence to support this rule?

**Answer**
TODO

In [109]:
# Compute support and confidence of the rule Age=35 => Severity=0.
#
# Work on a copy: `df = data_from_csv` would only alias the raw frame, so the
# numeric conversion below would overwrite its 'Age' column in place and change
# the "Age=<value>" items produced by any cell that reads `data_from_csv`
# when the notebook is re-run.
df = data_from_csv.copy()
total = len(df)

# coerce: non-numeric values ('?' in this case) become NaN and never match Age == 35
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

is_age_35 = df['Age'] == 35
is_benign = df['Severity'] == 0

# Number of rows matching the antecedent, and matching antecedent AND consequent.
age_cnt = int(is_age_35.sum())
age_and_severity_cnt = int((is_age_35 & is_benign).sum())

# support    = count(Age=35 and Severity=0) / count(all rows)
# confidence = count(Age=35 and Severity=0) / count(Age=35)
# Both are undefined (NaN) when the denominator is zero, instead of raising.
support = age_and_severity_cnt / total if total else float('nan')
confidence = age_and_severity_cnt / age_cnt if age_cnt else float('nan')
print('support:', support)
print('confidence:', confidence)

support: 0.012486992715920915
confidence: 0.9230769230769231


## Q5

In [110]:
# Age threshold used to binarise the 'Age' attribute.
n = 50

# Deep copy so the raw frame is not modified by the column changes below.
df = data_from_csv.copy(deep=True)
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# 1 when Age >= n, otherwise 0; a missing age (NaN) compares False and therefore gets 0.
# The column name contains the literal letter "n", not the threshold value.
df['Age≥n'] = df['Age'].ge(n).astype('int64')
df = df.drop(columns=['Age'])

# One transaction of "attribute=value" items per row.
data = df.apply(
    lambda record: [f"{name}={value}" for name, value in zip(record.index, record)],
    axis=1,
).tolist()

# `te` is the encoder object created in an earlier cell; it is fitted again on these transactions.
te.fit(data)
te_ary = te.transform(data)
df = pd.DataFrame(te_ary, columns=te.columns_)

# computing frequent itemsets and association rules
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.9)

# Keep rules that conclude Severity=1 and whose antecedent contains the Age>=n flag.
predicts_malignant = rules['consequents'].apply(lambda items: 'Severity=1' in items)
uses_age_flag = rules['antecedents'].apply(lambda items: 'Age≥n=1' in items)
filtered_rules = rules[predicts_malignant & uses_age_flag]
filtered_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

Unnamed: 0,antecedents,consequents,support,confidence,lift
26,"(Density=3, BI-RADS=5, Age≥n=1)",(Severity=1),0.243496,0.914062,1.973964
28,"(Shape=4, BI-RADS=5, Age≥n=1)",(Severity=1),0.208117,0.921659,1.990369
50,"(Margin=4, Density=3, BI-RADS=5, Age≥n=1)",(Severity=1),0.103018,0.916667,1.979588
52,"(Shape=4, Density=3, BI-RADS=5, Age≥n=1)",(Severity=1),0.192508,0.929648,2.007622
