In [1]:
!wget https://raw.githubusercontent.com/PacktPublishing/Big-Data-Analytics-with-Hadoop-3/master/Chapter04/OnlineRetail.csv

--2022-08-27 10:11:39--  https://raw.githubusercontent.com/PacktPublishing/Big-Data-Analytics-with-Hadoop-3/master/Chapter04/OnlineRetail.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5353956 (5.1M) [text/plain]
Saving to: ‘OnlineRetail.csv’


2022-08-27 10:11:40 (94.5 MB/s) - ‘OnlineRetail.csv’ saved [5353956/5353956]



In [2]:
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules
from numpy.random import default_rng

# Apriori

In [3]:
# Load the retail transactions and clean them in one method chain. Nothing is
# mutated in place and no column is assigned on a filtered frame, which avoids
# the SettingWithCopyWarning the step-by-step version triggers.
df = (
    pd.read_csv('./OnlineRetail.csv', on_bad_lines='skip', engine='python')
    # Trim surrounding whitespace from the item descriptions.
    .assign(Description=lambda frame: frame['Description'].str.strip())
    # Rows lacking an invoice number or a description are discarded.
    .dropna(axis=0, subset=['InvoiceNo', 'Description'])
    # Invoice numbers are handled as strings so they can be pattern-matched below.
    .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
    # Drop rows whose description matches 'wrong' or 'POSTAGE'.
    .loc[lambda frame: ~frame['Description'].str.contains('wrong|POSTAGE')]
    # Drop rows with a unit price of zero.
    .loc[lambda frame: frame['UnitPrice'] != 0]
    # Drop invoices whose number contains 'C'. The earlier comment called these
    # credit transactions; in the UCI Online Retail data a 'C' prefix marks a
    # cancelled invoice -- confirm against the dataset description.
    .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C')]
)
print(df.shape)
df.head()

(63536, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850,United Kingdom


In [4]:
# Basket matrix for French invoices: one row per invoice, one column per item,
# 1.0 where the summed quantity of that item on the invoice is positive, else 0.0.
df_France = (
    df[df['Country'] == "France"]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .fillna(0)
    .astype(float)
    .clip(upper=1, lower=0)
)

# Replace the item descriptions with codes CMD1000 .. CMD(1000 + n - 1) assigned in
# shuffled order. The generator is seeded so the item -> code mapping (and therefore
# every rule mined from these columns) is the same on each run; an unseeded
# default_rng() relabels the columns differently every time the cell executes.
LABEL_SEED = 42
n_items = df_France.shape[1]
shuffled_ids = default_rng(LABEL_SEED).choice(n_items, size=n_items, replace=False)
df_France.columns = ['CMD' + str(code_id + 1000) for code_id in shuffled_ids]

# Keep only invoices that contain more than one distinct item.
df_France = df_France.loc[(df_France.sum(axis=1) > 1).values]
df_France.index.name = 'SequenceNo'
df_France.head()

Unnamed: 0_level_0,CMD1443,CMD1385,CMD1166,CMD1437,CMD1015,CMD1029,CMD1377,CMD1049,CMD1004,CMD1256,...,CMD1159,CMD1169,CMD1031,CMD1081,CMD1307,CMD1337,CMD1389,CMD1448,CMD1330,CMD1447
SequenceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Turn the one-hot basket matrix into one list of active codes per sequence.
# NOTE: this rebinds `df`, which up to here held the cleaned retail transactions;
# the cell that builds df_France reads df['Country'], so it cannot be re-run after
# this one without re-running the loading cell first.
long_flags = df_France.astype(bool).copy().stack().reset_index()
long_flags.columns = ['SequenceNo', 'AlarmType', 'bool']
# Keep only the (sequence, code) pairs that are switched on, then drop the flag column.
active_pairs = long_flags[long_flags['bool'] == True].drop(columns=['bool'])
df = (
    active_pairs
    .groupby('SequenceNo')['AlarmType']
    .apply(list)
    .reset_index()
)
df.head()

Unnamed: 0,SequenceNo,AlarmType
0,536370,"[CMD1009, CMD1450, CMD1231, CMD1267, CMD1295, ..."
1,536852,"[CMD1267, CMD1074, CMD1254, CMD1367, CMD1315, ..."
2,536974,"[CMD1054, CMD1343, CMD1168, CMD1423, CMD1439, ..."
3,537065,"[CMD1004, CMD1009, CMD1320, CMD1450, CMD1231, ..."
4,537463,"[CMD1343, CMD1132, CMD1328, CMD1427, CMD1363, ..."


In [None]:
# Mine frequent itemsets (support >= 2%). apriori expects a boolean one-hot frame;
# passing the 0.0/1.0 float matrix works but emits a deprecation warning and is
# slower, so cast explicitly.
frq_items = apriori(df_France.astype(bool), min_support=0.02, use_colnames=True)

# Derive association rules with lift >= 1, strongest (confidence, then lift) first.
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])

# Keep rules with exactly one consequent. A boolean mask replaces the generator that
# was handed to .loc, and .copy() makes the result an independent frame so the column
# assignment below does not raise SettingWithCopyWarning.
has_single_consequent = rules['consequents'].map(len) == 1
rules = rules.loc[has_single_consequent].copy()

# Unwrap the single-element frozenset and relabel 'CMDnnnn' as 'Fault nnnn'.
rules['consequents'] = rules['consequents'].map(
    lambda items: next(iter(items)).replace('CMD', 'Fault ')
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
419,(CMD1665),Fault 1022,0.024590,0.043716,0.024590,1.000000,22.875000,0.023515,inf
2327,"(CMD1078, CMD1703)",Fault 2253,0.030055,0.043716,0.030055,1.000000,22.875000,0.028741,inf
4654,"(CMD1078, CMD1073, CMD1703)",Fault 2253,0.030055,0.043716,0.030055,1.000000,22.875000,0.028741,inf
2320,"(CMD1073, CMD2253)",Fault 1703,0.035519,0.049180,0.035519,1.000000,20.333333,0.033772,inf
4652,"(CMD1078, CMD1073, CMD2253)",Fault 1703,0.030055,0.049180,0.030055,1.000000,20.333333,0.028577,inf
...,...,...,...,...,...,...,...,...,...
1121,(CMD1528),Fault 1356,0.191257,0.087432,0.021858,0.114286,1.307143,0.005136,1.030319
589,(CMD2231),Fault 1778,0.196721,0.068306,0.021858,0.111111,1.626667,0.008421,1.048156
192,(CMD2231),Fault 1075,0.196721,0.073770,0.021858,0.111111,1.506173,0.007346,1.042008
1194,(CMD2231),Fault 1113,0.196721,0.076503,0.021858,0.111111,1.452381,0.006808,1.038934


In [None]:
# Show up to 10 randomly chosen rules. The fixed random_state keeps the displayed
# rows stable across re-runs, and the min() guard avoids the ValueError that
# sample() raises when asked for more rows than the frame holds.
rules.sample(min(10, len(rules)), random_state=42)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2178,"(CMD1073, CMD1254)",Fault 1620,0.021858,0.076503,0.021858,1.0,13.071429,0.020186,inf
5142,"(CMD1572, CMD2470, CMD1415)",Fault 2535,0.038251,0.136612,0.038251,1.0,7.32,0.033026,inf
5226,"(CMD1008, CMD1364, CMD1572)",Fault 1415,0.038251,0.060109,0.032787,0.857143,14.25974,0.030488,6.579235
3406,"(CMD1598, CMD1364)",Fault 1415,0.021858,0.060109,0.021858,1.0,16.636364,0.020544,inf
3809,"(CMD2119, CMD2328)",Fault 1528,0.043716,0.191257,0.021858,0.5,2.614286,0.013497,1.617486
2837,"(CMD1764, CMD2128)",Fault 1795,0.071038,0.180328,0.021858,0.307692,1.706294,0.009048,1.183971
101,(CMD2484),Fault 2192,0.112022,0.10929,0.038251,0.341463,3.12439,0.026009,1.35256
1475,"(CMD2036, CMD2328)",Fault 1786,0.030055,0.103825,0.02459,0.818182,7.880383,0.02147,4.928962
3958,"(CMD1364, CMD2535)",Fault 1572,0.10929,0.147541,0.106557,0.975,6.608333,0.090433,34.098361
1338,(CMD1549),Fault 1073,0.169399,0.054645,0.027322,0.16129,2.951613,0.018066,1.127154


# Bayesian Network

In [None]:
pip install pgmpy

In [8]:
from pgmpy.models import BayesianModel
from pgmpy.estimators import ParameterEstimator
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

In [9]:
# Hand-entered example records, one (Problem_C, Cause_C, Remedy_C) triple per row.
maintenance_records = [
    ("C_Coil", "trip", "Replace"),
    ("C_Coil", "Short circuit", "Replace"),
    ("C_Coil", "Unbalance", "Replace"),
    ("Exp_Motor", "Unbalance", "Clean"),
    ("Exp_Motor", "Unbalance", "Clean"),
    ("Exp_Motor", "trip", "Clean"),
    ("Exp_Motor", "trip", "Replace"),
    ("HS_Switch", "trip", "Replace"),
    ("HS_Switch", "Unbalance", "Clean"),
    ("HS_Switch", "Short circuit", "Replace"),
    ("HS_Switch", "trip", "Replace"),
    ("Exp_Valve", "Short circuit", "Clean"),
    ("Exp_Valve", "Short circuit", "Replace"),
    ("Exp_Valve", "Unbalance", "Clean"),
]
data = pd.DataFrame(maintenance_records, columns=['Problem_C', 'Cause_C', 'Remedy_C'])
print(data)

    Problem_C        Cause_C Remedy_C
0      C_Coil           trip  Replace
1      C_Coil  Short circuit  Replace
2      C_Coil      Unbalance  Replace
3   Exp_Motor      Unbalance    Clean
4   Exp_Motor      Unbalance    Clean
5   Exp_Motor           trip    Clean
6   Exp_Motor           trip  Replace
7   HS_Switch           trip  Replace
8   HS_Switch      Unbalance    Clean
9   HS_Switch  Short circuit  Replace
10  HS_Switch           trip  Replace
11  Exp_Valve  Short circuit    Clean
12  Exp_Valve  Short circuit  Replace
13  Exp_Valve      Unbalance    Clean


In [11]:
# Chain-structured network: Problem_C -> Cause_C -> Remedy_C.
network_edges = [('Problem_C', 'Cause_C'), ('Cause_C', 'Remedy_C')]
model = BayesianModel(network_edges)

# Print the maximum-likelihood CPD estimated for each node from `data`.
mle = MaximumLikelihoodEstimator(model, data)
for node_name in ('Problem_C', 'Cause_C', 'Remedy_C'):
    print(mle.estimate_cpd(node_name))

# Fit the model itself with the same estimator so it can be queried later.
model.fit(data, estimator=MaximumLikelihoodEstimator)

+----------------------+----------+
| Problem_C(C_Coil)    | 0.214286 |
+----------------------+----------+
| Problem_C(Exp_Motor) | 0.285714 |
+----------------------+----------+
| Problem_C(Exp_Valve) | 0.214286 |
+----------------------+----------+
| Problem_C(HS_Switch) | 0.285714 |
+----------------------+----------+
+------------------------+-----+----------------------+
| Problem_C              | ... | Problem_C(HS_Switch) |
+------------------------+-----+----------------------+
| Cause_C(Short circuit) | ... | 0.25                 |
+------------------------+-----+----------------------+
| Cause_C(Unbalance)     | ... | 0.25                 |
+------------------------+-----+----------------------+
| Cause_C(trip)          | ... | 0.5                  |
+------------------------+-----+----------------------+
+-------------------+-----+---------------+
| Cause_C           | ... | Cause_C(trip) |
+-------------------+-----+---------------+
| Remedy_C(Clean)   | ... | 0.2         

In [12]:
# Print the graph structure (nodes first, then edges) and display the fitted CPD objects.
for graph_part in (model.nodes(), model.edges()):
    print(graph_part)
model.get_cpds()

['Problem_C', 'Cause_C', 'Remedy_C']
[('Problem_C', 'Cause_C'), ('Cause_C', 'Remedy_C')]


[<TabularCPD representing P(Problem_C:4) at 0x7f94063fd650>,
 <TabularCPD representing P(Cause_C:3 | Problem_C:4) at 0x7f94063e4750>,
 <TabularCPD representing P(Remedy_C:2 | Cause_C:3) at 0x7f94063fa9d0>]

In [13]:
# A single inference engine serves both queries; building a second
# VariableElimination over the same model was redundant.
infer = VariableElimination(model)

# Posterior over remedy (p) and cause (q) given the problem code 'Exp_Valve'.
# show_progress=False suppresses the tqdm bars that otherwise precede the tables.
p = infer.query(variables=['Remedy_C'], evidence={'Problem_C': 'Exp_Valve'}, show_progress=False)
q = infer.query(variables=['Cause_C'], evidence={'Problem_C': 'Exp_Valve'}, show_progress=False)

print('\n************************************************')
print('Cause & Remedy for Problem Code == Expansion Valve')
print('************************************************\n')
# Cause table first, then remedy, matching the banner wording.
print(q)
print(p)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]


************************************************
Cause & Remedy for Problem Code == Expansion Valve
************************************************

+------------------------+----------------+
| Cause_C                |   phi(Cause_C) |
| Cause_C(Short circuit) |         0.6667 |
+------------------------+----------------+
| Cause_C(Unbalance)     |         0.3333 |
+------------------------+----------------+
| Cause_C(trip)          |         0.0000 |
+------------------------+----------------+
+-------------------+-----------------+
| Remedy_C          |   phi(Remedy_C) |
| Remedy_C(Clean)   |          0.4333 |
+-------------------+-----------------+
| Remedy_C(Replace) |          0.5667 |
+-------------------+-----------------+
