In [16]:
import pandas as pd
from data_process.preprocessing import discretization, get_nodes_type, code_categories
from block_learning.train_bn import structure_learning, parameter_learning
from block_learning.partial_bn_train import connect_partial_bn
from libpgm.hybayesiannetwork import HyBayesianNetwork
from libpgm.sampleaggregator import SampleAggregator
import seaborn as sns
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from block_learning.save_bn import save_structure, save_params
from block_learning.read_bn import read_structure, read_params
from kmodes.kmodes import KModes

In [17]:
# Read the prepared CSV into a DataFrame and preview its first ten rows.
DATASET_PATH = 'data/final_dataset.csv'
data = pd.read_csv(DATASET_PATH)
data.head(10)

Unnamed: 0,age,sex,is_closed,has_high_education,relation_status,number_of_relatives,len_of_about,number_of_activities,number_of_books,number_of_friends,...,parent,driver,has_pets,cash_usage,gulyaka,zhavoronok,sum_act,top1,top2,top3
0,34,1,0,1,1,2,14,1,1,911,...,0,0,0,0,1,0,1,Gifts & holidays,Music,History & politics
1,0,1,0,0,0,0,0,0,0,687,...,0,1,0,0,1,0,2,Gifts & holidays,History & politics,Love & relation
2,37,1,0,0,0,0,0,0,0,24,...,0,0,1,0,0,0,1,Purchase & sale,Gifts & holidays,Cooking
3,37,1,0,0,0,0,0,0,0,2535,...,0,0,0,0,0,0,0,History & politics,Gifts & holidays,Love & relation
4,39,1,0,1,4,0,0,1,1,291,...,1,0,0,0,1,0,2,History & politics,Gifts & holidays,Love & relation
5,34,1,0,0,0,0,0,0,0,413,...,0,0,0,0,1,0,1,Gifts & holidays,Music,History & politics
6,36,0,0,0,0,0,0,0,0,175,...,0,1,0,0,0,0,1,Gifts & holidays,History & politics,Love & relation
7,0,0,0,0,0,0,0,0,0,982,...,0,0,1,0,1,0,2,Gifts & holidays,Music,History & politics
8,35,0,0,0,0,0,0,0,0,279,...,0,1,0,0,0,0,1,Gifts & holidays,History & politics,Music
9,41,0,0,0,0,0,0,0,0,134,...,0,1,0,0,0,0,1,Gifts & holidays,Love & relation,History & politics


In [18]:
# Keep only rows where both 'sex' and 'age' are non-zero.
# NOTE(review): presumably 0 marks an unspecified value in both columns — confirm.
sex_is_set = data['sex'] != 0
age_is_set = data['age'] != 0
data = data.loc[sex_is_set & age_is_set]

In [19]:
# Display (rows, columns) of `data` after the sex/age filter.
data.shape

(24747, 47)

In [20]:
# Narrow the frame to the 28 columns listed below, in this order.
selected_columns = [
    'age', 'sex', 'has_high_education', 'relation_status', 'number_of_relatives',
    'len_of_about', 'number_of_activities', 'number_of_books',
    'number_of_interests', 'number_of_movies',
    'top1', 'top2', 'top3',
    'nodes', 'edges', 'size', 'betweenness_centrality', 'density',
    'eigenvector_centrality', 'degree_assortativity_coefficient',
    'number_of_followers',
    'max_tr', 'mean_tr', 'med_tr',
    'parent', 'driver', 'has_pets', 'cash_usage',
]
data = data[selected_columns]

In [21]:
# Keep only rows whose 'mean_tr' is strictly below 1000.
mean_tr_below_limit = data['mean_tr'] < 1000
data = data.loc[mean_tr_below_limit]

In [22]:
# Display (rows, columns) of `data` after column selection and the mean_tr filter.
data.shape

(16109, 28)

In [23]:
# Run discretization on the listed columns, passing 'kmeans' and bins=5.
# The result is stored separately; `data` itself is not reassigned here.
columns_to_discretize = [
    'nodes', 'edges', 'size',
    'betweenness_centrality', 'density', 'eigenvector_centrality',
    'degree_assortativity_coefficient',
    'max_tr', 'mean_tr', 'med_tr',
]
discrete_data = discretization(
    data,
    'kmeans',
    bins=5,
    columns=columns_to_discretize,
)

In [24]:
# Pass the three 'top*' columns of the discretized frame to code_categories.
# NOTE(review): presumably this replaces the string labels with integer codes — confirm
# against data_process.preprocessing.
topic_columns = ['top1', 'top2', 'top3']
new_data = code_categories(discrete_data, topic_columns)

In [25]:
# Split the encoded frame into five column groups; each group is later passed
# to structure_learning as its own module.
module1 = new_data[[
    'age', 'sex', 'has_high_education', 'relation_status', 'number_of_relatives',
]]
module2 = new_data[[
    'len_of_about', 'number_of_activities', 'number_of_books',
    'number_of_interests', 'number_of_movies',
]]
module3 = new_data[['top1', 'top2', 'top3']]
module4 = new_data[[
    'nodes', 'betweenness_centrality', 'eigenvector_centrality',
    'number_of_followers',
]]
module5 = new_data[[
    'max_tr', 'mean_tr', 'parent', 'driver', 'has_pets', 'cash_usage',
]]

In [26]:
# Node types for module 1, computed from `data` (not from the encoded module1 frame).
# module1 was selected with exactly these five columns, so its column index is reused.
node_type1 = get_nodes_type(data[list(module1.columns)])
node_type1

{'age': 'disc',
 'sex': 'disc',
 'has_high_education': 'disc',
 'relation_status': 'disc',
 'number_of_relatives': 'disc'}

In [27]:
# Node types for module 2, computed from `data` (not from the encoded module2 frame).
# module2 was selected with exactly these five columns, so its column index is reused.
node_type2 = get_nodes_type(data[list(module2.columns)])
node_type2

{'len_of_about': 'disc',
 'number_of_activities': 'disc',
 'number_of_books': 'disc',
 'number_of_interests': 'disc',
 'number_of_movies': 'disc'}

In [28]:
# Node types for module 3, computed from `data` (not from the encoded module3 frame).
# module3 holds exactly 'top1', 'top2', 'top3', so its column index is reused.
node_type3 = get_nodes_type(data[list(module3.columns)])
node_type3

{'top1': 'disc', 'top2': 'disc', 'top3': 'disc'}

In [29]:
# Node types for module 4, computed from `data` (not from the discretized module4 frame).
# module4 was selected with exactly these four columns, so its column index is reused.
node_type4 = get_nodes_type(data[list(module4.columns)])
node_type4

{'nodes': 'disc',
 'betweenness_centrality': 'cont',
 'eigenvector_centrality': 'cont',
 'number_of_followers': 'disc'}

In [30]:
# Node types for module 5, computed from `data` (not from the discretized module5 frame).
# module5 was selected with exactly these six columns, so its column index is reused.
node_type5 = get_nodes_type(data[list(module5.columns)])
node_type5

{'max_tr': 'cont',
 'mean_tr': 'cont',
 'parent': 'disc',
 'driver': 'disc',
 'has_pets': 'disc',
 'cash_usage': 'disc'}

In [31]:
# Start the wall-clock timer. `start` is not reset afterwards, so every later
# `end - start` print in this notebook is measured from this point.
start = time.time()
# Learn the structure of module 1 from the encoded frame, passing 'MI' as the
# second argument and the node types computed from `data`.
bn1 = structure_learning(module1, 'MI', node_type1)

In [32]:
# Display the learned structure for module 1 (a dict with 'V' and 'E' entries).
bn1

{'V': ['age',
  'sex',
  'has_high_education',
  'relation_status',
  'number_of_relatives'],
 'E': [['number_of_relatives', 'sex'],
  ['sex', 'has_high_education'],
  ['age', 'relation_status'],
  ['sex', 'relation_status']]}

In [33]:
# Fit parameters for module 1 on `data` (the frame passed into discretization,
# not its return value), using the same five columns module1 was built from.
module1_source = data[list(module1.columns)]
param1 = parameter_learning(module1_source, node_type1, bn1)

In [34]:
# Save the module 1 structure and parameters, read both back, and build the
# hybrid network from the re-read objects rather than the in-memory ones.
skel1_name = 'skel1'
params1_name = 'params1'

save_structure(bn1, skel1_name)
skel1 = read_structure(skel1_name)

save_params(param1, params1_name)
params1 = read_params(params1_name)

hybn1 = HyBayesianNetwork(skel1, params1)

In [35]:
# Print the seconds elapsed since `start` was set before learning bn1.
end = time.time()
elapsed_seconds = end - start
print(elapsed_seconds)

6.935981750488281


In [36]:
# Learn the structure of module 2 from the encoded frame, passing 'K2' as the
# second argument and the node types computed from `data`.
bn2 = structure_learning(module2, 'K2', node_type2)

In [37]:
# Display the learned structure for module 2 (a dict with 'V' and 'E' entries).
bn2

{'V': ['len_of_about',
  'number_of_activities',
  'number_of_books',
  'number_of_interests',
  'number_of_movies'],
 'E': [['number_of_activities', 'number_of_books'],
  ['number_of_activities', 'len_of_about'],
  ['number_of_books', 'number_of_movies'],
  ['number_of_books', 'number_of_interests']]}

In [38]:
# Fit parameters for module 2 on `data` (the frame passed into discretization,
# not its return value), using the same five columns module2 was built from.
module2_source = data[list(module2.columns)]
param2 = parameter_learning(module2_source, node_type2, bn2)

In [39]:
# Save the module 2 structure and parameters, read both back, and build the
# hybrid network from the re-read objects rather than the in-memory ones.
skel2_name = 'skel2'
params2_name = 'params2'

save_structure(bn2, skel2_name)
skel2 = read_structure(skel2_name)

save_params(param2, params2_name)
params2 = read_params(params2_name)

hybn2 = HyBayesianNetwork(skel2, params2)


In [40]:
# Connect the module 1 and module 2 networks. The frame passed in holds the
# columns of module 1 followed by those of module 2, taken from `data`;
# 'LV1' is forwarded as the last argument.
modules_1_2_columns = list(module1.columns) + list(module2.columns)
hybn_1_2 = connect_partial_bn(
    hybn1,
    hybn2,
    data[modules_1_2_columns],
    'LV1',
)

In [41]:
# Print the seconds elapsed since `start`, which was set before learning bn1,
# so this is a cumulative figure covering every cell run since then.
end = time.time()
elapsed_seconds = end - start
print(elapsed_seconds)

141.30213499069214


In [42]:
# Learn the structure of module 3 from the encoded frame, passing 'MI' as the
# second argument, then display the resulting dict of 'V' and 'E' entries.
bn3 = structure_learning(module3, 'MI', node_type3)
bn3

{'V': ['top1', 'top2', 'top3'],
 'E': [['top1', 'top2'], ['top3', 'top2'], ['top1', 'top3']]}

In [43]:
# Fit parameters for module 3 on the 'top1'..'top3' columns of `data`
# (the frame passed into discretization, not the encoded module3 frame).
module3_source = data[list(module3.columns)]
param3 = parameter_learning(module3_source, node_type3, bn3)

In [44]:
# Save the module 3 structure and parameters, read both back, and build the
# hybrid network from the re-read objects rather than the in-memory ones.
skel3_name = 'skel3'
params3_name = 'params3'

save_structure(bn3, skel3_name)
skel3 = read_structure(skel3_name)

save_params(param3, params3_name)
params3 = read_params(params3_name)

hybn3 = HyBayesianNetwork(skel3, params3)

In [45]:
# Learn the structure of module 4 from the discretized frame, passing 'K2' as the
# second argument, then display the resulting dict of 'V' and 'E' entries.
bn4 = structure_learning(module4, 'K2', node_type4)
bn4

{'V': ['nodes',
  'betweenness_centrality',
  'eigenvector_centrality',
  'number_of_followers'],
 'E': [['nodes', 'eigenvector_centrality'],
  ['nodes', 'number_of_followers'],
  ['nodes', 'betweenness_centrality'],
  ['eigenvector_centrality', 'betweenness_centrality']]}

In [46]:
# Fit parameters for module 4 on `data` (the frame passed into discretization,
# not its return value), using the same four columns module4 was built from.
module4_source = data[list(module4.columns)]
param4 = parameter_learning(module4_source, node_type4, bn4)

In [47]:
# Save the module 4 structure and parameters, read both back, and build the
# hybrid network from the re-read objects rather than the in-memory ones.
skel4_name = 'skel4'
params4_name = 'params4'

save_structure(bn4, skel4_name)
skel4 = read_structure(skel4_name)

save_params(param4, params4_name)
params4 = read_params(params4_name)

hybn4 = HyBayesianNetwork(skel4, params4)

In [48]:
# Connect the module 2 and module 3 networks. The frame passed in holds the
# columns of module 2 followed by those of module 3, taken from `data`;
# 'LV2' is forwarded as the last argument.
modules_2_3_columns = list(module2.columns) + list(module3.columns)
hybn_2_3 = connect_partial_bn(
    hybn2,
    hybn3,
    data[modules_2_3_columns],
    'LV2',
)