Q1P1

In [20]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from itertools import chain

df = pd.read_excel("Online Retail.xlsx")  

df.dropna(subset=['InvoiceNo', 'StockCode'], inplace=True)
df['StockCode'] = df['StockCode'].astype(str)

grouped = df.groupby('InvoiceNo')['StockCode'].apply(list)
transactions = grouped.tolist()

total_transactions = len(transactions)
print(f"Total transactions: {total_transactions}")

k = 4
chunk_size = total_transactions // k
parts = [transactions[i*chunk_size:(i+1)*chunk_size] for i in range(k-1)]
parts.append(transactions[(k-1)*chunk_size:]) 

overall_min_sup = 0.05

local_results = []
print("\n--- Scan 1: Local Frequent Itemsets per Partition ---")
for idx, part in enumerate(parts, start=1):
    part_size = len(part)
    
    local_min_sup = (part_size / total_transactions) * overall_min_sup
    print(f"Partition {idx}: size={part_size}, local_min_sup={local_min_sup:.5f}")

    
    te = TransactionEncoder()
    te_array = te.fit(part).transform(part)
    df_part = pd.DataFrame(te_array, columns=te.columns_)

    
    freq = apriori(df_part, min_support=local_min_sup, use_colnames=True)
    freq['length'] = freq['itemsets'].apply(lambda x: len(x))
    print(freq.sort_values(['length', 'support'], ascending=[True, False]).to_string(index=False))
    local_results.append(freq)

candidates = set(chain.from_iterable(local['itemsets'].apply(lambda x: tuple(sorted(x))).tolist() for local in local_results))

total_te = TransactionEncoder()
total_array = total_te.fit(transactions).transform(transactions)
df_total = pd.DataFrame(total_array, columns=total_te.columns_)

global_frequents = []
print("\n--- Scan 2: Global Frequent Itemsets (min_sup=0.05) ---")
for c in candidates:
    cols = list(c)
    sup = df_total[cols].all(axis=1).sum() / total_transactions
    if sup >= overall_min_sup:
        global_frequents.append((c, sup))

for itemset, sup in sorted(global_frequents, key=lambda x: (len(x[0]), -x[1])):
    print(f"{itemset}  support={sup:.4f}")



Total transactions: 25900

--- Scan 1: Local Frequent Itemsets per Partition ---
Partition 1: size=6475, local_min_sup=0.01250
 support                itemsets  length
0.123398                (85123A)       1
0.110425                 (22423)       1
0.088803                (85099B)       1
0.087413                 (22720)       1
0.071815                 (22961)       1
0.070425                 (22457)       1
0.070425                 (22469)       1
0.070116                 (22960)       1
0.069961                 (21212)       1
0.068263                 (20725)       1
0.065483                 (47566)       1
0.065328                 (22722)       1
0.063012                 (84879)       1
0.061622                 (22197)       1
0.059923                 (22666)       1
0.058687                 (21181)       1
0.054826                 (22178)       1
0.053436                 (21080)       1
0.052664                 (22699)       1
0.052510                 (22697)       1
0.052355    

Q1P2

In [25]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from itertools import chain

df = pd.read_excel("Online Retail.xlsx")  # or pd.read_csv("data.csv")

df.dropna(subset=['InvoiceNo', 'StockCode'], inplace=True)
df['StockCode'] = df['StockCode'].astype(str)

grouped = df.groupby('InvoiceNo')['StockCode'].apply(list)
transactions = grouped.tolist()

total_transactions = len(transactions)
print(f"Total transactions: {total_transactions}")

k = 4
chunk_size = total_transactions // k
parts = [transactions[i*chunk_size:(i+1)*chunk_size] for i in range(k-1)]
parts.append(transactions[(k-1)*chunk_size:])

overall_min_sup = 0.05
min_conf = 0.4

print("\n--- Scan 1: Local Frequent Itemsets per Partition ---")
local_results = []
for idx, part in enumerate(parts, start=1):
    part_size = len(part)
    local_min_sup = (part_size / total_transactions) * overall_min_sup
    print(f"Partition {idx}: size={part_size}, local_min_sup={local_min_sup:.5f}")

    te = TransactionEncoder()
    te_array = te.fit(part).transform(part)
    df_part = pd.DataFrame(te_array, columns=te.columns_)

    freq = apriori(df_part, min_support=local_min_sup, use_colnames=True)
    freq['length'] = freq['itemsets'].apply(lambda x: len(x))
    print(freq.sort_values(['length', 'support'], ascending=[True, False]).to_string(index=False))
    local_results.append(freq)

candidates = set(chain.from_iterable(local['itemsets'].apply(lambda x: tuple(sorted(x))).tolist() for local in local_results))

total_te = TransactionEncoder()
total_array = total_te.fit(transactions).transform(transactions)
df_total = pd.DataFrame(total_array, columns=total_te.columns_)

global_frequents = []
for c in candidates:
    sup = df_total[list(c)].all(axis=1).sum() / total_transactions
    if sup >= overall_min_sup:
        global_frequents.append((c, sup))

print("\n--- Scan 2: Global Frequent Itemsets (min_sup=0.05) ---")
for itemset, sup in sorted(global_frequents, key=lambda x: (len(x[0]), -x[1])):
    print(f"{itemset}  support={sup:.4f}")

print(f"\n--- Global Association Rules (min_conf={min_conf}) ---")
global_freq_df = pd.DataFrame([
    {'itemsets': set(itemset), 'support': sup} for itemset, sup in global_frequents
])
rules_global = association_rules(global_freq_df, metric="confidence", min_threshold=min_conf)
if not rules_global.empty:
    print(rules_global[['antecedents','consequents','support','confidence','lift']].to_string(index=False))
else:
    print("No global rules found.")



Total transactions: 25900

--- Scan 1: Local Frequent Itemsets per Partition ---
Partition 1: size=6475, local_min_sup=0.01250
 support                itemsets  length
0.123398                (85123A)       1
0.110425                 (22423)       1
0.088803                (85099B)       1
0.087413                 (22720)       1
0.071815                 (22961)       1
0.070425                 (22457)       1
0.070425                 (22469)       1
0.070116                 (22960)       1
0.069961                 (21212)       1
0.068263                 (20725)       1
0.065483                 (47566)       1
0.065328                 (22722)       1
0.063012                 (84879)       1
0.061622                 (22197)       1
0.059923                 (22666)       1
0.058687                 (21181)       1
0.054826                 (22178)       1
0.053436                 (21080)       1
0.052664                 (22699)       1
0.052510                 (22697)       1
0.052355    

Q1p3

In [27]:
group_candidates = [(itemset, sup) for itemset, sup in global_frequents if len(itemset) >= 2]

group_candidates_sorted = sorted(group_candidates, key=lambda x: x[1], reverse=True)

print("\n--- suggested discount combinations  ---")
for itemset, sup in group_candidates_sorted[:5]:
    print(f"{itemset}  support={sup:.4f}")

top_rules_by_lift = rules_global.sort_values('lift', ascending=False).head(5)
print("\n--- association rules for discount ---")
print(top_rules_by_lift[['antecedents','consequents','support','confidence','lift']].to_string(index=False))



--- suggested discount combinations  ---

--- association rules for discount ---
Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


Q1p4

In [28]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Weekday'] = df['InvoiceDate'].dt.weekday

weekend_df = df[df['Weekday'].isin([5, 6])]
weekday_df = df[~df['Weekday'].isin([5, 6])]

weekend_trans = weekend_df.groupby('InvoiceNo')['StockCode'].apply(list).tolist()
weekday_trans = weekday_df.groupby('InvoiceNo')['StockCode'].apply(list).tolist()

te = total_te  
weekend_array = te.transform(weekend_trans)
df_weekend = pd.DataFrame(weekend_array, columns=te.columns_)

weekday_array = te.transform(weekday_trans)
df_weekday = pd.DataFrame(weekday_array, columns=te.columns_)

only_weekend_itemsets = []
for itemset, sup in global_frequents:
    cols = list(itemset)
    in_weekend = df_weekend[cols].all(axis=1).any()
    in_weekday = df_weekday[cols].all(axis=1).any()
    if in_weekend and not in_weekday:
        only_weekend_itemsets.append((itemset, sup))

print("\n--- frequent itemsets in the weekend---")
if only_weekend_itemsets:
    for itemset, sup in only_weekend_itemsets:
        print(f"{itemset}  support={sup:.4f}")
else:
    print("There isnt any itemset")



--- frequent itemsets in the weekend---
There isnt any itemset


Q2P1

In [29]:
import pandas as pd
from collections import defaultdict

class FPNode:
    def __init__(self, item_name, count, parent):
        self.item_name = item_name
        self.count = count
        self.parent = parent
        self.children = {}
        self.node_link = None  
class FPTree:
    def __init__(self, transactions, min_support):
        self.min_support = min_support
        self.header_table = {}  
        self.root = FPNode(None, 1, None)
        self._build_header_table(transactions)
        self._build_fp_tree(transactions)

    def _build_header_table(self, transactions):
        
        item_counts = defaultdict(int)
        for transaction in transactions:
            for item in transaction:
                item_counts[item] += 1
        
        self.header_table = {item: [count, None]
                             for item, count in item_counts.items()
                             if count >= self.min_support}
        
        self.freq_items = sorted(self.header_table.keys(),
                                 key=lambda i: self.header_table[i][0],
                                 reverse=True)

    def _build_fp_tree(self, transactions):
        
        for transaction in transactions:
            
            sorted_items = [item for item in self.freq_items if item in transaction]
            if sorted_items:
                self._insert_tree(sorted_items, self.root)

    def _insert_tree(self, items, node):
        first = items[0]
        
        if first in node.children:
            node.children[first].count += 1
        else:
            new_child = FPNode(first, 1, node)
            node.children[first] = new_child
            
            head = self.header_table[first][1]
            if head is None:
                self.header_table[first][1] = new_child
            else:
                curr = head
                while curr.node_link:
                    curr = curr.node_link
                curr.node_link = new_child
        
        if len(items) > 1:
            self._insert_tree(items[1:], node.children[first])


def mine_tree(header_table, prefix, freq_patterns, min_support):
    
    items = sorted(header_table.items(), key=lambda x: x[1][0])
    for item, (count, node) in items:
        new_pattern = prefix.copy()
        new_pattern.add(item)
        freq_patterns[frozenset(new_pattern)] = count
        
        cond_patterns = []
        cond_counts = []
        curr = node
        while curr:
            path = []
            parent = curr.parent
            while parent and parent.item_name:
                path.append(parent.item_name)
                parent = parent.parent
            if path:
                cond_patterns.append(path[::-1])
                cond_counts.append(curr.count)
            curr = curr.node_link
        
        expanded = []
        for pat, cnt in zip(cond_patterns, cond_counts):
            expanded.extend([pat] * cnt)
        
        cond_tree = FPTree(expanded, min_support)
        if cond_tree.header_table:
            mine_tree(cond_tree.header_table, new_pattern, freq_patterns, min_support)

weather = pd.read_csv("weatherHistory.csv")
weather['Formatted Date'] = pd.to_datetime(
    weather['Formatted Date'],
    format='%Y-%m-%d %H:%M:%S.%f %z',
    errors='coerce'
)
weather = weather.dropna(subset=['Formatted Date'])
weather['Formatted Date'] = weather['Formatted Date'].astype(str)

def label_items(row):
    items = []
    if row['Temperature (C)'] > 30:
        items.append('HighTemp')
    if row['Humidity'] > 0.8:
        items.append('HighHumidity')
    return items

transactions = weather.apply(label_items, axis=1).tolist()

min_sup_count = int(0.002* len(transactions))

fp_tree = FPTree(transactions, min_sup_count)
frequent_patterns = {}
mine_tree(fp_tree.header_table, set(), frequent_patterns, min_sup_count)

results = pd.DataFrame([
    {'itemset': set(pattern), 'count': cnt, 'support': cnt/len(transactions)}
    for pattern, cnt in frequent_patterns.items()
])
results = results[results['support'] >= 0.002 ]
results['length'] = results['itemset'].apply(len)
results = results.sort_values(['support', 'length'], ascending=[False, True])

print("--- Frequent Patterns (min_support=0.002) ---")
print(results.to_string(index=False))


  weather['Formatted Date'] = pd.to_datetime(


--- Frequent Patterns (min_support=0.002) ---
       itemset  count  support  length
{HighHumidity}  44261 0.458887       1
    {HighTemp}   2673 0.027713       1


In [30]:
import pandas as pd
from collections import defaultdict

class FPNode:
    def __init__(self, item_name, count, parent):
        self.item_name = item_name
        self.count = count
        self.parent = parent
        self.children = {}
        self.node_link = None

class FPTree:
    def __init__(self, transactions, min_support):
        self.min_support = min_support
        self.header_table = {}
        self.root = FPNode(None, 1, None)
        self._build_header_table(transactions)
        self._build_tree(transactions)

    def _build_header_table(self, transactions):
        item_counts = defaultdict(int)
        for t in transactions:
            for item in t:
                item_counts[item] += 1
        
        self.header_table = {item:[cnt, None] for item, cnt in item_counts.items() if cnt >= self.min_support}
        
        self.freq_items = sorted(self.header_table, key=lambda i: self.header_table[i][0], reverse=True)

    def _build_tree(self, transactions):
        for t in transactions:
            sorted_items = [i for i in self.freq_items if i in t]
            if sorted_items:
                self._insert(sorted_items, self.root)

    def _insert(self, items, node):
        first = items[0]
        if first in node.children:
            node.children[first].count += 1
        else:
            new_node = FPNode(first, 1, node)
            node.children[first] = new_node
            
            head = self.header_table[first][1]
            if head is None:
                self.header_table[first][1] = new_node
            else:
                curr = head
                while curr.node_link:
                    curr = curr.node_link
                curr.node_link = new_node
        if len(items) > 1:
            self._insert(items[1:], node.children[first])


def mine_tree(header_table, prefix, freq_patterns, min_support):
    items = sorted(header_table.items(), key=lambda x: x[1][0])
    for item, (count, node) in items:
        new_pat = prefix.copy()
        new_pat.add(item)
        freq_patterns[frozenset(new_pat)] = count
        
        cond_pats, cond_counts = [], []
        curr = node
        while curr:
            path = []
            parent = curr.parent
            while parent and parent.item_name:
                path.append(parent.item_name)
                parent = parent.parent
            if path:
                cond_pats.append(path[::-1])
                cond_counts.append(curr.count)
            curr = curr.node_link
        expanded = [pat for pat, cnt in zip(cond_pats, cond_counts) for _ in range(cnt)]
        tree = FPTree(expanded, min_support)
        if tree.header_table:
            mine_tree(tree.header_table, new_pat, freq_patterns, min_support)

if __name__ == '__main__':
    
    weather = pd.read_csv("weatherHistory.csv")
    weather['Formatted Date'] = pd.to_datetime(
        weather['Formatted Date'], format='%Y-%m-%d %H:%M:%S.%f %z', utc=True, errors='coerce')
    weather = weather.dropna(subset=['Formatted Date'])

    
    def label_items(row):
        items = []
        if row['Temperature (C)'] > 30:
            items.append('HighTemp')
        if row['Humidity'] > 0.8:
            items.append('HighHumidity')
        return items

    transactions = weather.apply(label_items, axis=1).tolist()
    total = len(transactions)

    
    count_ht = sum('HighTemp' in t for t in transactions)
    count_hh = sum('HighHumidity' in t for t in transactions)
    count_both = sum({'HighTemp','HighHumidity'}.issubset(t) for t in transactions)
    print("--- Direct Counting ---")
    print(f"HighTemp = {count_ht}/{total} ({count_ht/total:.4f})")
    print(f"HighHumidity = {count_hh}/{total} ({count_hh/total:.4f})")
    print(f"Both = {count_both}/{total} ({count_both/total:.4f})")

    
    fp = FPTree(transactions, min_support=1)
    freq_patterns = {}
    mine_tree(fp.header_table, set(), freq_patterns, min_support=1)
    ht_fp = freq_patterns.get(frozenset({'HighTemp'}), 0)
    hh_fp = freq_patterns.get(frozenset({'HighHumidity'}), 0)
    both_fp = freq_patterns.get(frozenset({'HighTemp','HighHumidity'}), 0)
    print("\n--- FP-tree Counting ---")
    print(f"HighTemp = {ht_fp}/{total} ({ht_fp/total:.4f})")
    print(f"HighHumidity = {hh_fp}/{total} ({hh_fp/total:.4f})")
    print(f"Both = {both_fp}/{total} ({both_fp/total:.4f})")


--- Direct Counting ---
HighTemp = 2673/96453 (0.0277)
HighHumidity = 44261/96453 (0.4589)
Both = 0/96453 (0.0000)

--- FP-tree Counting ---
HighTemp = 2673/96453 (0.0277)
HighHumidity = 44261/96453 (0.4589)
Both = 0/96453 (0.0000)


Q3P1

In [31]:

import pandas as pd
from scipy.stats import chi2_contingency

if __name__ == '__main__':
    
    treatment_to_tids = {
        'T1': {101, 203, 305, 407, 509, 612, 714, 816, 919, 1020},
        'T2': {101, 305, 509, 612, 816, 919},
        'T3': {203, 407, 612, 714, 816},
        'T4': {1020, 305, 407, 612, 714},
        'T5': {101, 203, 305, 509},
        'T6': {612, 714, 816, 919, 1020},
        'T7': {203, 407, 509, 612, 714},
        'T8': {101, 305, 816, 919},
        'T9': {509, 612, 714, 1020},
        'T10': {203, 305, 407, 612},
        'T11': {101, 816, 919, 1020},
        'T12': {305, 407, 509, 714},
        'T13': {612, 714, 816, 919},
        'T14': {101, 203, 305, 509},
        'T15': {305, 509, 714, 919}
    }

    
    
    df_readm = pd.read_excel('Q3_Data.xlsx', sheet_name=1)
    df_readm.columns = ['TID', 'Readmit']
    
    readmission = dict(zip(df_readm['TID'], df_readm['Readmit']))

    
    treatment = 'T1'
    tids = treatment_to_tids[treatment]

    # a: in treatment & readmit=1
    a = sum(1 for tid in tids if readmission.get(tid, 0) == 1)
    # b: in treatment & readmit=0
    b = sum(1 for tid in tids if readmission.get(tid, 0) == 0)
    # c: not in treatment & readmit=1
    c = sum(1 for tid, val in readmission.items() if tid not in tids and val == 1)
    # d: not in treatment & readmit=0
    d = sum(1 for tid, val in readmission.items() if tid not in tids and val == 0)

    
    print(f"Contingency table for treatment {treatment}:")
    print(f"               Readmit=1   Readmit=0")
    print(f"{treatment}=Yes       {a:<12}{b}")
    print(f"{treatment}=No        {c:<12}{d}")

    # Compute Lift and Chi-square
    total = a + b + c + d
    p_readmit = (a + c) / total
    p_readmit_trt = a / (a + b) if (a + b) > 0 else 0
    lift = p_readmit_trt / p_readmit if p_readmit > 0 else float('nan')

    try:
        chi2, pval, _, _ = chi2_contingency([[a, b], [c, d]], correction=False)
    except ValueError:
        chi2, pval = float('nan'), float('nan')

    print(f"\nLift = {lift:.4f}")
    print(f"Chi-square = {chi2:.4f}, p-value = {pval:.4f}")


Contingency table for treatment T1:
               Readmit=1   Readmit=0
T1=Yes       7           3
T1=No        1993        7997

Lift = 3.5000
Chi-square = 15.6406, p-value = 0.0001


Q3P2

In [32]:

from itertools import combinations

treatment_to_tids = {
    'T1': {101, 203, 305, 407, 509, 612, 714, 816, 919, 1020},
    'T2': {101, 305, 509, 612, 816, 919},
    'T3': {203, 407, 612, 714, 816},
    'T4': {1020, 305, 407, 612, 714},
    'T5': {101, 203, 305, 509},
    'T6': {612, 714, 816, 919, 1020},
    'T7': {203, 407, 509, 612, 714},
    'T8': {101, 305, 816, 919},
    'T9': {509, 612, 714, 1020},
    'T10': {203, 305, 407, 612},
    'T11': {101, 816, 919, 1020},
    'T12': {305, 407, 509, 714},
    'T13': {612, 714, 816, 919},
    'T14': {101, 203, 305, 509},
    'T15': {305, 509, 714, 919}
}


min_support = 5


print("--- Frequent Single Treatments (support >= 5) ---")
frequent_singles = [t for t, tids in treatment_to_tids.items() if len(tids) >= min_support]
for t in sorted(frequent_singles):
    print(f"{t}: support = {len(treatment_to_tids[t])}")


print("\n--- Frequent Treatment Pairs (support >= 5) ---")
frequent_pairs = []
for t1, t2 in combinations(treatment_to_tids, 2):
    inter = treatment_to_tids[t1] & treatment_to_tids[t2]
    if len(inter) >= min_support:
        frequent_pairs.append((t1, t2, len(inter)))
if frequent_pairs:
    for t1, t2, sup in sorted(frequent_pairs):
        print(f"({t1}, {t2}): support = {sup}")
else:
    print("No frequent pairs.")


print("\n--- Frequent Treatment Triples (support >= 5) ---")
frequent_triples = []
for t1, t2, t3 in combinations(treatment_to_tids, 3):
    inter = treatment_to_tids[t1] & treatment_to_tids[t2] & treatment_to_tids[t3]
    if len(inter) >= min_support:
        frequent_triples.append((t1, t2, t3, len(inter)))
if frequent_triples:
    for t1, t2, t3, sup in sorted(frequent_triples):
        print(f"({t1}, {t2}, {t3}): support = {sup}")
else:
    print("No frequent triples.")


--- Frequent Single Treatments (support >= 5) ---
T1: support = 10
T2: support = 6
T3: support = 5
T4: support = 5
T6: support = 5
T7: support = 5

--- Frequent Treatment Pairs (support >= 5) ---
(T1, T2): support = 6
(T1, T3): support = 5
(T1, T4): support = 5
(T1, T6): support = 5
(T1, T7): support = 5

--- Frequent Treatment Triples (support >= 5) ---
No frequent triples.


Q3P3

In [33]:
import pandas as pd



treatment_to_tids = {
    'T1': {101, 203, 305, 407, 509, 612, 714, 816, 919, 1020}
}


df = pd.read_excel('Q3_Data.xlsx', sheet_name=1)
df.columns = ['TID', 'Readmit']  
readmission = dict(zip(df['TID'], df['Readmit']))


returned_after_t1 = [pid for pid in treatment_to_tids['T1'] if readmission.get(pid) == 1]


print("Patients who received T1 and then returned within 30 days:")
for pid in sorted(returned_after_t1):
    print(f"- Patient ID: {pid}")


Patients who received T1 and then returned within 30 days:
- Patient ID: 101
- Patient ID: 203
- Patient ID: 509
- Patient ID: 612
- Patient ID: 714
- Patient ID: 919
- Patient ID: 1020


In [34]:
d={
101: {"T1", "T2", "T5", "T8", "T11", "T14"},
203: {"T1", "T3", "T5", "T7", "T10", "T14"},
305: {"T1", "T2", "T4", "T5", "T8", "T10", "T12", "T15"},
407: {"T1", "T3", "T4", "T7", "T10", "T12"},
509: {"T1", "T2", "T5", "T7", "T9", "T12", "T14", "T15"},
612: {"T1", "T2", "T3", "T4", "T6", "T7", "T9", "T10", "T13"},
714: {"T1", "T3", "T4", "T6", "T7", "T9", "T12", "T13", "T15"},
816: {"T1", "T2", "T3", "T6", "T8", "T11", "T13"},
919: {"T1", "T2", "T6", "T8", "T11", "T13", "T15"},
1020: {"T1", "T4", "T6", "T9", "T11"}}

In [35]:
item_to_tid = {
    101: {"T1", "T2", "T5", "T8", "T11", "T14"},
    203: {"T1", "T3", "T5", "T7", "T10", "T14"},
    305: {"T1", "T2", "T4", "T5", "T8", "T10", "T12", "T15"},
    407: {"T1", "T3", "T4", "T7", "T10", "T12"},
    509: {"T1", "T2", "T5", "T7", "T9", "T12", "T14", "T15"},
    612: {"T1", "T2", "T3", "T4", "T6", "T7", "T9", "T10", "T13"},
    714: {"T1", "T3", "T4", "T6", "T7", "T9", "T12", "T13", "T15"},
    816: {"T1", "T2", "T3", "T6", "T8", "T11", "T13"},
    919: {"T1", "T2", "T6", "T8", "T11", "T13", "T15"},
    1020: {"T1", "T4", "T6", "T9", "T11"}
}

costs = {
    "T1": 150,
    "T2": 200,
    "T3": 75,
    "T4": 120,
    "T5": 90,
    "T6": 180,
    "T7": 85,
    "T8": 250,
    "T9": 95,
    "T10": 110,
    "T11": 300,
    "T12": 80,
    "T13": 160,
    "T14": 170,
    "T15": 300
}

total_costs = {}
for pid, treatments in item_to_tid.items():
    total_cost = sum(costs[t] for t in treatments)
    total_costs[pid] = total_cost

for pid, cost in total_costs.items():
    print(f" Patient {pid}: Total Cost = ${cost}")


 Patient 101: Total Cost = $1160
 Patient 203: Total Cost = $680
 Patient 305: Total Cost = $1300
 Patient 407: Total Cost = $620
 Patient 509: Total Cost = $1170
 Patient 612: Total Cost = $1175
 Patient 714: Total Cost = $1245
 Patient 816: Total Cost = $1315
 Patient 919: Total Cost = $1540
 Patient 1020: Total Cost = $845


In [36]:
import numpy as np
from scipy.stats import chi2_contingency

contingency = np.array([
    [2, 1],      # T1=Yes
    [1993, 7997] # T1=No
])

chi2, p, dof, expected = chi2_contingency(contingency)

print(f"Chi-square: {chi2:.4f}")
print(f"P-value: {p:.4e}")
print(f"Degrees of freedom: {dof}")
print("Expected frequencies:")
print(expected)


Chi-square: 1.6944
P-value: 1.9303e-01
Degrees of freedom: 1
Expected frequencies:
[[5.98919243e-01 2.40108076e+00]
 [1.99440108e+03 7.99559892e+03]]
