In [1]:
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules
from numpy.random import default_rng

# Apriori

In [2]:
# Load the raw retail export and apply the row filters used by every later cell.
# The cleaning is written as a non-mutating chain (no `inplace=True`, no column
# assignment into a filtered slice) so re-running the cell is warning-free and
# always starts from the file on disk.
df = pd.read_csv('./data/Online_Retail.csv', engine='python')
df = (
    df.assign(Description=df['Description'].str.strip())
      # Rows must have both keys; this has to happen before the string cast
      # below, otherwise missing invoice numbers would become the text 'nan'.
      .dropna(axis=0, subset=['InvoiceNo', 'Description'])
      .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
)
keep_rows = (
    # Descriptions mentioning 'wrong' or 'POSTAGE' are excluded.
    ~df['Description'].str.contains('wrong|POSTAGE')
    # Zero-priced lines are excluded.
    & (df['UnitPrice'] != 0)
    # Dropping all transactions which were done on credit (invoice number contains 'C').
    # NOTE(review): in this dataset a 'C' invoice may denote a cancellation
    # rather than credit -- confirm against the data dictionary.
    & ~df['InvoiceNo'].str.contains('C')
)
df = df[keep_rows]
print(df.shape)
df.head()

(528274, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,1/12/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,1/12/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,1/12/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,1/12/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,1/12/10 8:26,3.39,17850.0,United Kingdom


In [3]:
# Build the invoice x item matrix for French orders: quantities are summed per
# (InvoiceNo, Description), pivoted wide, missing combinations become 0 and the
# values are clipped to the range [0, 1].
df_France = (df[df['Country'] == "France"]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum().unstack().fillna(0)
        .clip(upper=1, lower=0))
# Replace the item descriptions with shuffled 'CMD<number>' labels (numbers
# start at 1000). The generator is seeded so the labels, the CSV exports and
# the mined rules are identical on every Restart & Run All.
item_count = df_France.shape[1]
shuffled_ids = default_rng(42).choice(item_count, size=item_count, replace=False)
df_France.columns = [f'CMD{item_id + 1000}' for item_id in shuffled_ids]
# Keep only rows that contain more than one item.
df_France = df_France.loc[(df_France.sum(axis=1) > 1).values]
df_France.index.name = 'SequenceNo'
df_France.head()

Unnamed: 0_level_0,CMD1200,CMD1204,CMD2159,CMD2557,CMD1791,CMD1156,CMD2353,CMD1759,CMD1536,CMD2030,...,CMD1493,CMD1151,CMD2358,CMD1812,CMD2140,CMD2224,CMD2003,CMD1018,CMD1915,CMD1294
SequenceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Write the wide df_France matrix (index included) to a CSV in the working directory.
df_France.to_csv('Sequence_Data_format2.csv')

In [5]:
# Reshape the wide df_France matrix into one row per SequenceNo holding the
# list of column labels that are set for that sequence, then save it.
# NOTE: this cell rebinds `df`; after it runs, `df` no longer refers to the
# retail frame loaded earlier in the notebook.
stacked = df_France.astype(bool).stack().reset_index()
stacked.columns = ['SequenceNo', 'AlarmType', 'bool']
# The 'bool' column is already boolean, so it is used directly as the mask
# and dropped in the same step.
present = stacked.loc[stacked['bool'], ['SequenceNo', 'AlarmType']]
df = present.groupby('SequenceNo')['AlarmType'].apply(list).reset_index()
df.to_csv('Sequence_Data.csv')
# Preview goes last so the notebook actually renders it.
df.head()

In [6]:
# Building the model
# apriori expects a boolean one-hot frame; df_France holds 0/1 values, so the
# cast does not change which itemsets are found.
frq_items = apriori(df_France.astype(bool), min_support=0.02, use_colnames=True)
# Collecting the inferred rules in a dataframe
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
# Keep rules with exactly one consequent. An explicit boolean Series is used
# as the mask, and .copy() makes the result an independent frame so the column
# assignment below does not write into a slice of the unfiltered table.
rules = rules.loc[rules['consequents'].map(len) == 1].copy()
# Unwrap the single-item consequent set and relabel 'CMD...' as 'Fault ...'.
rules['consequents'] = [
    next(iter(items)).replace('CMD', 'Fault ')
    for items in rules['consequents']
]
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
419,(CMD1510),Fault 1030,0.024590,0.043716,0.024590,1.000000,22.875000,0.023515,inf
2328,"(CMD1801, CMD1890)",Fault 2171,0.030055,0.043716,0.030055,1.000000,22.875000,0.028741,inf
4655,"(CMD1801, CMD1121, CMD1890)",Fault 2171,0.030055,0.043716,0.030055,1.000000,22.875000,0.028741,inf
2321,"(CMD2171, CMD1121)",Fault 1801,0.035519,0.049180,0.035519,1.000000,20.333333,0.033772,inf
4654,"(CMD2171, CMD1121, CMD1890)",Fault 1801,0.030055,0.049180,0.030055,1.000000,20.333333,0.028577,inf
...,...,...,...,...,...,...,...,...,...
1120,(CMD1480),Fault 1484,0.191257,0.087432,0.021858,0.114286,1.307143,0.005136,1.030319
589,(CMD2113),Fault 1596,0.196721,0.068306,0.021858,0.111111,1.626667,0.008421,1.048156
193,(CMD2113),Fault 1436,0.196721,0.073770,0.021858,0.111111,1.506173,0.007346,1.042008
1195,(CMD2113),Fault 1674,0.196721,0.076503,0.021858,0.111111,1.452381,0.006808,1.038934


In [7]:
# Show up to 10 randomly chosen rules. The sample size is capped at the number
# of available rules (sampling more rows than exist raises ValueError) and the
# draw is seeded so the displayed rows are the same on every run.
rules.sample(n=min(10, len(rules)), random_state=42)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
12,(CMD1775),Fault 1333,0.030055,0.10929,0.02459,0.818182,7.486364,0.021306,4.898907
4389,"(CMD1336, CMD1219, CMD1719)",Fault 2153,0.021858,0.101093,0.021858,1.0,9.891892,0.019648,inf
1800,"(CMD2291, CMD1333)",Fault 2153,0.021858,0.101093,0.021858,1.0,9.891892,0.019648,inf
660,(CMD1480),Fault 1561,0.191257,0.13388,0.035519,0.185714,1.387172,0.009914,1.063656
1992,"(CMD1856, CMD1244)",Fault 2153,0.038251,0.101093,0.02459,0.642857,6.359073,0.020723,2.51694
2676,"(CMD1561, CMD1183)",Fault 2324,0.068306,0.054645,0.021858,0.32,5.856,0.018125,1.390228
1931,"(CMD1333, CMD1719)",Fault 1244,0.040984,0.169399,0.021858,0.533333,3.148387,0.014915,1.779859
1998,"(CMD1336, CMD1719)",Fault 2153,0.095628,0.101093,0.027322,0.285714,2.826255,0.017655,1.25847
3719,"(CMD2305, CMD1719)",Fault 1480,0.112022,0.191257,0.027322,0.243902,1.275261,0.005897,1.069628
5185,"(CMD1630, CMD1557, CMD2410)",Fault 1002,0.035519,0.071038,0.021858,0.615385,8.662722,0.019335,2.415301


# Bayesian Network

In [8]:
from pgmpy.models import BayesianModel
from pgmpy.estimators import ParameterEstimator
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.estimators import ConstraintBasedEstimator
from pgmpy.inference import VariableElimination

In [9]:
# Training observations for the Bayesian network, written one record per line
# as (Problem_C, Cause_C, Remedy_C).
observations = [
    ("C_Coil",    "trip",          "Replace"),
    ("C_Coil",    "Short circuit", "Replace"),
    ("C_Coil",    "Unbalance",     "Replace"),
    ("Exp_Motor", "Unbalance",     "Clean"),
    ("Exp_Motor", "Unbalance",     "Clean"),
    ("Exp_Motor", "trip",          "Clean"),
    ("Exp_Motor", "trip",          "Replace"),
    ("HS_Switch", "trip",          "Replace"),
    ("HS_Switch", "Unbalance",     "Clean"),
    ("HS_Switch", "Short circuit", "Replace"),
    ("HS_Switch", "trip",          "Replace"),
    ("Exp_Valve", "Short circuit", "Clean"),
    ("Exp_Valve", "Short circuit", "Replace"),
    ("Exp_Valve", "Unbalance",     "Clean"),
]
data = pd.DataFrame(observations, columns=['Problem_C', 'Cause_C', 'Remedy_C'])
print(data)

    Problem_C        Cause_C Remedy_C
0      C_Coil           trip  Replace
1      C_Coil  Short circuit  Replace
2      C_Coil      Unbalance  Replace
3   Exp_Motor      Unbalance    Clean
4   Exp_Motor      Unbalance    Clean
5   Exp_Motor           trip    Clean
6   Exp_Motor           trip  Replace
7   HS_Switch           trip  Replace
8   HS_Switch      Unbalance    Clean
9   HS_Switch  Short circuit  Replace
10  HS_Switch           trip  Replace
11  Exp_Valve  Short circuit    Clean
12  Exp_Valve  Short circuit  Replace
13  Exp_Valve      Unbalance    Clean


In [10]:
# Bare expression: renders the `data` frame as this cell's output.
data

Unnamed: 0,Problem_C,Cause_C,Remedy_C
0,C_Coil,trip,Replace
1,C_Coil,Short circuit,Replace
2,C_Coil,Unbalance,Replace
3,Exp_Motor,Unbalance,Clean
4,Exp_Motor,Unbalance,Clean
5,Exp_Motor,trip,Clean
6,Exp_Motor,trip,Replace
7,HS_Switch,trip,Replace
8,HS_Switch,Unbalance,Clean
9,HS_Switch,Short circuit,Replace


In [11]:
# Chain-structured network: ProblemCode -> Cause Code -> Remedy Code
edges = [('Problem_C', 'Cause_C'), ('Cause_C', 'Remedy_C')]
model = BayesianModel(edges)

# Print the maximum-likelihood CPD estimated from `data` for each node in turn.
mle = MaximumLikelihoodEstimator(model, data)
for node in ('Problem_C', 'Cause_C', 'Remedy_C'):
    print(mle.estimate_cpd(node))

# Fit the model itself so it carries the CPDs that later cells query.
model.fit(data, estimator=MaximumLikelihoodEstimator)

+----------------------+----------+
| Problem_C(C_Coil)    | 0.214286 |
+----------------------+----------+
| Problem_C(Exp_Motor) | 0.285714 |
+----------------------+----------+
| Problem_C(Exp_Valve) | 0.214286 |
+----------------------+----------+
| Problem_C(HS_Switch) | 0.285714 |
+----------------------+----------+
+------------------------+--------------------+----------------------+----------------------+----------------------+
| Problem_C              | Problem_C(C_Coil)  | Problem_C(Exp_Motor) | Problem_C(Exp_Valve) | Problem_C(HS_Switch) |
+------------------------+--------------------+----------------------+----------------------+----------------------+
| Cause_C(Short circuit) | 0.3333333333333333 | 0.0                  | 0.6666666666666666   | 0.25                 |
+------------------------+--------------------+----------------------+----------------------+----------------------+
| Cause_C(Unbalance)     | 0.3333333333333333 | 0.5                  | 0.3333333333333333  

In [12]:
# Print the graph structure (nodes on one line, edges on the next); the fitted
# CPD objects are left as the cell's displayed value.
print(model.nodes(), model.edges(), sep='\n')
model.get_cpds()

['Problem_C', 'Cause_C', 'Remedy_C']
[('Problem_C', 'Cause_C'), ('Cause_C', 'Remedy_C')]


[<TabularCPD representing P(Cause_C:3 | Problem_C:4) at 0x122551810>,
 <TabularCPD representing P(Problem_C:4) at 0x1225536d0>,
 <TabularCPD representing P(Remedy_C:2 | Cause_C:3) at 0x122553250>]

In [13]:
# A single inference engine serves both queries; building a second identical
# VariableElimination(model) was redundant.
infer = VariableElimination(model)

# Both queries condition on Problem_C == 'Exp_Valve'. A fresh evidence dict is
# passed to each call so neither query can be affected by the other.
p = infer.query(variables=['Remedy_C'], evidence={'Problem_C': 'Exp_Valve'})
q = infer.query(variables=['Cause_C'], evidence={'Problem_C': 'Exp_Valve'})

# Banner, then the cause distribution followed by the remedy distribution.
print('\n************************************************')
print('Cause & Remedy for Problem Code == Expansion Valve')
print('************************************************\n')
print(q)
print(p)

Finding Elimination Order: : 100%|██████████| 1/1 [00:00<00:00, 204.45it/s]
Eliminating: Cause_C: 100%|██████████| 1/1 [00:00<00:00, 304.62it/s]
Finding Elimination Order: : 100%|██████████| 1/1 [00:00<00:00, 536.22it/s]
Eliminating: Remedy_C: 100%|██████████| 1/1 [00:00<00:00, 233.91it/s]


************************************************
Cause & Remedy for Problem Code == Expansion Valve
************************************************

+------------------------+----------------+
| Cause_C                |   phi(Cause_C) |
| Cause_C(Short circuit) |         0.6667 |
+------------------------+----------------+
| Cause_C(Unbalance)     |         0.3333 |
+------------------------+----------------+
| Cause_C(trip)          |         0.0000 |
+------------------------+----------------+
+-------------------+-----------------+
| Remedy_C          |   phi(Remedy_C) |
| Remedy_C(Clean)   |          0.4333 |
+-------------------+-----------------+
| Remedy_C(Replace) |          0.5667 |
+-------------------+-----------------+



