In [1]:
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

In [4]:
import pandas as pd
import numpy as np
from bayesian.train_bn import structure_learning, parameter_learning, parameter_learning_mix
from preprocess.discretization import get_nodes_type, discretization, inverse_discretization, code_categories
from bayesian.save_bn import save_structure, save_params, read_structure, read_params
from external.libpgm.hybayesiannetwork import HyBayesianNetwork
from visualization.visualization import draw_BN
from bayesian.calculate_accuracy import calculate_acc
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from copy import copy
from external.libpgm.sampleaggregator import SampleAggregator
import operator
from sklearn.metrics import accuracy_score, mean_squared_error
from scipy import stats
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
import math
from pomegranate import DiscreteDistribution
import random
from sklearn.model_selection import train_test_split

In [3]:
geo = geo = pd.read_csv('../datasets/hackathon_processed.csv')
columns = ['Tectonic regime', 'Period', 'Lithology', 'Structural setting', 'Hydrocarbon type', 'Gross','Netpay','Porosity','Permeability', 'Depth']
geo = geo[columns]
geo.dropna(inplace=True)
geo.reset_index(inplace=True, drop=True)

In [5]:
geo_train, geo_test = train_test_split(geo, test_size=0.3)
geo_train.reset_index(inplace=True, drop=True)
geo_test.reset_index(inplace=True, drop=True)

In [7]:
geo_types = get_nodes_type(geo)
geo_types

{'Tectonic regime': 'disc',
 'Period': 'disc',
 'Lithology': 'disc',
 'Structural setting': 'disc',
 'Hydrocarbon type': 'disc',
 'Gross': 'cont',
 'Netpay': 'cont',
 'Porosity': 'cont',
 'Permeability': 'cont',
 'Depth': 'cont'}

In [11]:
colums_for_code = []
columns_for_disc = []
for c in columns:
    if geo_types[c] == 'disc':
        colums_for_code.append(c)
    else:
        columns_for_disc.append(c)

In [15]:
geo_coded, label_coder = code_categories(geo_train, 'label', colums_for_code)
geo_discrete, coder = discretization(geo_coded, 'equal_frequency', columns_for_disc)
geo_only_discrete, discrete_coder = discretization(geo_train, 'equal_frequency', columns_for_disc)

# Дискретная структра + смешанные параметры

In [18]:
bn_geo = structure_learning(geo_discrete, 'HC', 'MI_mixed', geo_types)

  0%|          | 14/1000000 [00:04<90:34:28,  3.07it/s] 


In [19]:
params = parameter_learning(geo_train, geo_types, bn_geo)

In [20]:
save_structure(bn_geo, 'geo_net')
skel = read_structure('geo_net')
save_params(params, 'geo_net_param')
params = read_params('geo_net_param')
geo_bn = HyBayesianNetwork(skel, params)

In [21]:
acc, rmse = calculate_acc(geo_bn, geo_test, columns)

3
3
6
6
6
7
7
7
11
11
12
12
12
13
13
17
18
18
21
21
21
22
22
22
22
23
26
26
27
27
27
29
29
32
32
34
35
36
36
39
39
39
42
42
42
44
51
52
52
54
54
54
55
56
56
56
57
57
62
63
64
64
70
71
74
74
75
75
78
78
79
80
80
81
82
82
82
82
88
96
97
98
98
98
99
101
101
102
103
103
103
105
105
111
118
118
120
120
121
123
126
129


In [22]:
acc

{'Tectonic regime': 0.52,
 'Period': 0.24,
 'Lithology': 0.59,
 'Structural setting': 0.58,
 'Hydrocarbon type': 0.71}

In [23]:
rmse

{'Gross': 442.13,
 'Netpay': 50.26,
 'Porosity': 6.17,
 'Permeability': 831.56,
 'Depth': 883.21}

# Смешанная структура + смешанные параметры

In [25]:
bn_geo = structure_learning(geo_coded, 'HC', 'MI_mixed', geo_types)

  0%|          | 8/1000000 [00:11<401:04:39,  1.44s/it]


In [26]:
params = parameter_learning(geo_train, geo_types, bn_geo)

In [27]:
save_structure(bn_geo, 'geo_net')
skel = read_structure('geo_net')
save_params(params, 'geo_net_param')
params = read_params('geo_net_param')
geo_bn = HyBayesianNetwork(skel, params)

In [28]:
acc, rmse = calculate_acc(geo_bn, geo_test, columns)

22
51
51
51
57
71
71
71
82
120
120
120


In [29]:
acc

{'Tectonic regime': 0.52,
 'Period': 0.24,
 'Lithology': 0.59,
 'Structural setting': 0.58,
 'Hydrocarbon type': 0.71}

In [30]:
rmse

{'Gross': 485.64,
 'Netpay': 99.74,
 'Porosity': 7.56,
 'Permeability': 1000.7,
 'Depth': 1080.52}

# Дискретная стурктура + дискретные параметры

In [36]:
geo_new_types = {'Tectonic regime': 'disc',
 'Period': 'disc',
 'Lithology': 'disc',
 'Structural setting': 'disc',
 'Hydrocarbon type': 'disc',
 'Gross': 'disc',
 'Netpay': 'disc',
 'Porosity': 'disc',
 'Permeability': 'disc',
 'Depth': 'disc'}

In [37]:
bn_geo = structure_learning(geo_discrete, 'HC', 'MI_mixed', geo_new_types)

  0%|          | 14/1000000 [00:04<95:45:09,  2.90it/s] 


In [38]:
params = parameter_learning(geo_discrete, geo_new_types, bn_geo)

In [39]:
save_structure(bn_geo, 'geo_net')
skel = read_structure('geo_net')
save_params(params, 'geo_net_param')
params = read_params('geo_net_param')
geo_bn = HyBayesianNetwork(skel, params)