In [14]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
import time
import mlxtend.frequent_patterns
from mlxtend.frequent_patterns import fpgrowth
import matplotlib as plt
from mlxtend.frequent_patterns import association_rules

In [15]:
def load_transactions (file_name, sep='\t'):
    lines = open(file_name, 'rt').readlines()
    transactions_matrix = []
    for l in lines:
        l = l.rstrip('\n')
        transaction = l.split(sep)
        transactions_matrix.append(transaction)
    return transactions_matrix

In [16]:
biotargets=load_transactions("bio_targets_filt.txt", sep='\t')

len(biotargets)

1101

In [17]:
biotargets

[['POL', 'ABCB1'],
 ['ADRB1', 'SLC22A1'],
 ['REP'],
 ['PRNP', 'LEF', 'LMNA', 'TSHR', 'CYP2D6', 'CYP1A2'],
 ['SLCO1B1',
  'CA2',
  'CYP2D6',
  'SLCO1B3',
  'CA3',
  'CYP2B6',
  'MB',
  'CYP2C9',
  'PTGS2',
  'CYP1A2',
  'FAAH',
  'AKR1C3',
  'CA7',
  'BRD4',
  'CYP2C8',
  'CA15',
  'MGLL',
  'BRD2',
  'BRD3',
  'CA12',
  'TSHR',
  'CYP3A4',
  'CYP2C19'],
 ['CA2',
  'CA6',
  'CA3',
  'CYNT',
  'MTCA1',
  'MTCA2',
  'CA7',
  'CA13',
  'CA4',
  'CA15',
  'CA1',
  'CA9',
  'CA5B',
  'CA12',
  'CA14',
  'NCE103',
  'CA',
  'CA5A'],
 ['TSHR', 'BLM', 'REP', 'THRB'],
 ['SLCO1B1',
  'LEF',
  'MMP2',
  'MMP1',
  'SLCO1B3',
  'MMP8',
  'MMP3',
  'MMP9',
  'NOS2',
  'REP'],
 ['AR'],
 ['SMN2',
  'THRB',
  'RECQL',
  'SLCO1B1',
  'NFKB1',
  'SLCO1B3',
  'ALOX15',
  'SLC22A1',
  'BLM',
  'REP'],
 ['TBXAS1', 'PLA2G1B', 'TSHR', 'PTGS2', 'CHRM1', 'BLM', 'PTGS1'],
 ['SMN2', 'REP'],
 ['POLA1', 'TK1'],
 ['GLRA1', 'LMNA', 'RARG', 'RARA', 'RARB', 'REP'],
 ['REP'],
 ['SYK',
  'LEF',
  'SLCO1B1',
  'ERBB4',
  '

In [18]:
#Compute binary database
tr_enc = TransactionEncoder()
trans_array = tr_enc.fit(biotargets).transform(biotargets)
biotargets_df = pd.DataFrame(trans_array, columns=tr_enc.columns_)
biotargets_df.drop(biotargets_df.columns[0], axis=1, inplace=True)
biotargets_df

Unnamed: 0,1272966,5HTR1D,AADAC,AAK1,ABAT,ABCB1,ABCB11,ABCB1A,ABCB1B,ABCC1,...,VDRA,VIM-2,VKORC1,VRK2,WEE1,WEE2,XBP1,XDH,YES1,ZAP70
0,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1097,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1098,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1099,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
molecular_labels = pd.read_csv("molecular_labels.csv", header=None)
mlabels_df = pd.DataFrame(molecular_labels)
mlabels_df.columns= ["Molecules"]
mlabels_df

Unnamed: 0,Molecules
0,PDFDA0001
1,PDFDA0002
2,PDFDA0003
3,PDFDA0004
4,PDFDA0005
...,...
1096,PDFDA1097
1097,PDFDA1098
1098,PDFDA1099
1099,PDFDA1100


In [20]:
biotargets_df = biotargets_df.T
biotargets_df.columns = mlabels_df["Molecules"]
biotargets_df = biotargets_df.T

In [21]:
biotargets_df

Unnamed: 0_level_0,1272966,5HTR1D,AADAC,AAK1,ABAT,ABCB1,ABCB11,ABCB1A,ABCB1B,ABCC1,...,VDRA,VIM-2,VKORC1,VRK2,WEE1,WEE2,XBP1,XDH,YES1,ZAP70
Molecules,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PDFDA0001,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PDFDA0002,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PDFDA0003,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PDFDA0004,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PDFDA0005,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PDFDA1097,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PDFDA1098,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PDFDA1099,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
PDFDA1100,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Testing

In [13]:
#biotargets_df[biotargets_df["PDFDA0001"] == True]

In [None]:
# def perform_eval(data):
#     D={"support": [0.09,0.08,0.07,0.06,0.05,0.04,0.03,0.02,0.01], 
#     "num_itemsets":[], 
#     "FP-growth":[]}

#     for min_supp in D["support"]:
#         # record the number of itemsets and computing time for each support level for FP-growth
#         t0 = time.time()
#         FI_fpg= fpgrowth(data, min_supp, use_colnames=True)
#         t1 = time.time()
#         D["num_itemsets"].append(FI_fpg.shape[0])
#         D["FP-growth"].append(t1-t0)

#     # these are the results, ordered by descending support level
#     df_performance=pd.DataFrame(D)
#     return df_performance

In [None]:
#perform_df = perform_eval(biotargets_df)

In [None]:
#perform_df

In [None]:
# # simple plot of the number of itemsets generated according to minimum support
# plt.plot(perform_df["support"], perform_df["num_itemsets"], '--bo')
# plt.title('Number of itemsets by minimum support level', fontsize=13)
# plt.xlabel('Minimum support', fontsize=13)
# plt.ylabel('Number of itemsets', fontsize=13)
# plt.grid()
# plt.show()

In [22]:
FI_fpgrowth = fpgrowth(biotargets_df, min_support=0.02, use_colnames=True)
FI_fpgrowth['length'] = FI_fpgrowth['itemsets'].apply(lambda x: len(x))
fpg_sup = FI_fpgrowth.sort_values(by=['support'], ascending=False)
fpg_sup[fpg_sup['length'] > 2]

Unnamed: 0,support,itemsets,length
162,0.151680,"(REP, SLCO1B1, SLCO1B3)",3
109,0.099909,"(SLCO1B1, LMNA, SLCO1B3)",3
4256,0.076294,"(SLCO1B1, SLCO2B1, SLCO1B3)",3
167,0.064487,"(SLCO1B1, CYP3A4, SLCO1B3)",3
382,0.054496,"(HTR2C, HTR2B, HTR2A)",3
...,...,...,...
31943,0.020890,"(HTR2B, ADRA1A, HTR2A, HTR6, CHRM2, CHRM1, ADR...",7
31942,0.020890,"(HTR2B, ADRA1A, CHRM4, HTR6, HTR2A, CHRM2, DRD3)",7
31941,0.020890,"(HTR2B, ADRA1A, CHRM4, HTR6, CHRM2, DRD3, CHRM1)",7
31940,0.020890,"(HTR2B, ADRA1A, CHRM4, HTR6, HTR2A, CHRM2, CHRM1)",7


In [23]:
rules = association_rules(FI_fpgrowth, metric="support", min_threshold=0.7)

In [26]:
print("Rules with a minimum confidence of 70%:", len(FI_fpgrowth))
print("Top 10 rules with greater support:")
rules.sort_values(by=['support'], ascending=False).head(30)

Rules with a minimum confidence of 70%: 53063
Top 10 rules with greater support:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
56,(SLCO1B1),(SLCO1B3),0.306085,0.305177,0.297911,0.973294,3.189275,0.204501,26.017257
57,(SLCO1B3),(SLCO1B1),0.305177,0.306085,0.297911,0.97619,3.189275,0.204501,29.144414
58,"(REP, SLCO1B1)",(SLCO1B3),0.156222,0.305177,0.15168,0.97093,3.18153,0.104005,23.901907
59,"(REP, SLCO1B3)",(SLCO1B1),0.153497,0.306085,0.15168,0.988166,3.228399,0.104697,58.635786
9,"(LMNA, SLCO1B3)",(SLCO1B1),0.102634,0.306085,0.099909,0.973451,3.180326,0.068494,26.137451
8,"(SLCO1B1, LMNA)",(SLCO1B3),0.102634,0.305177,0.099909,0.973451,3.189791,0.068588,26.171662
155118,(SLCO2B1),(SLCO1B3),0.087193,0.305177,0.080836,0.927083,3.037853,0.054226,9.529
155119,(SLCO2B1),(SLCO1B1),0.087193,0.306085,0.080836,0.927083,3.028839,0.054147,9.516543
155122,(SLCO2B1),"(SLCO1B1, SLCO1B3)",0.087193,0.297911,0.076294,0.875,2.937119,0.050318,5.616712
155120,"(SLCO1B1, SLCO2B1)",(SLCO1B3),0.080836,0.305177,0.076294,0.94382,3.092697,0.051625,12.367847
