In [1]:
import os,sys,inspect
# Make the parent of the notebook's directory importable so that the
# project-local `preprocess` and `bayesian` packages imported below resolve.
#
# The working directory is used instead of
# inspect.getfile(inspect.currentframe()): inside a notebook cell the frame's
# "file" is not the notebook itself. Older IPython reports a pseudo-name such as
# "<ipython-input-...>", which abspath() only resolves to the working directory
# by accident; newer ipykernel releases report a temporary file outside the
# project, which would put the wrong directory on sys.path.
# NOTE(review): assumes the kernel is started in the notebook's directory -- the
# relative '../datasets/...' path used further down relies on the same thing.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)


import pandas as pd
from preprocess.discretization import get_nodes_type, code_categories
from bayesian.structure_score import MIG

import networkx as nx


from pgmpy.estimators import HillClimbSearch




# Load the processed dataset, drop every row containing a missing value and
# renumber the remaining rows from 0.
orig_data = (
    pd.read_csv('../datasets/hackathon_processed.csv')
    .dropna()
    .reset_index(drop=True)
)

# Columns used for this small example; alternative subsets are kept below for
# quick switching.
columns = ['Period', 'Tectonic regime', 'Hydrocarbon type']
#columns = ['Gross', 'Netpay','Porosity']
#columns = ['Gross', 'Netpay', 'Period']
data_test = orig_data[columns]

# Split the selected columns by the type get_nodes_type reports for them.
# Only the 'disc' list is consumed in this cell; the 'cont' list is collected
# but not used here.
node_type = get_nodes_type(data_test)
columns_for_discrete = [param for param in columns if node_type[param] == 'cont']
columns_for_code = [param for param in columns if node_type[param] == 'disc']

# Code the 'disc' columns with the "label" method and build the MIG scorer on
# the coded frame.
data_coded, code_dict = code_categories(data_test, "label", columns_for_code)
scoring_method = MIG(data=data_coded)

# Print the local score of 'Period' with 'Hydrocarbon type' as its only parent.
local_score = scoring_method.local_score
print(local_score('Period', ['Hydrocarbon type']))

-35.139


In [2]:
# Hill-climbing structure search over the coded data, scored with the MIG
# scorer built in the previous cell. Caching of local scores is switched off.
search_options = {'max_indegree': 4, 'max_iter': int(1e4), 'epsilon': 1e-6}
est = HillClimbSearch(
    data=data_coded,
    scoring_method=scoring_method,
    use_cache=False,
)
estimated_model = est.estimate(**search_options)

# Show the nodes and the directed edges of the learned structure.
print(estimated_model.nodes(), estimated_model.edges())

['Period', 'Tectonic regime', 'Hydrocarbon type'] [('Hydrocarbon type', 'Period'), ('Hydrocarbon type', 'Tectonic regime')]


In [3]:
# Fast example on a four-column subset of the dataset (not all nodes).
columns = ['Period', 'Tectonic regime','Gross', 'Netpay']
data_test = orig_data[columns]

# Split the selected columns by the type get_nodes_type reports for them; only
# the 'disc' list is passed on to code_categories below.
node_type = get_nodes_type(data_test)
columns_for_discrete = [param for param in columns if node_type[param] == 'cont']
columns_for_code = [param for param in columns if node_type[param] == 'disc']

# Code the 'disc' columns with the "label" method and score with MIG.
data_coded, code_dict = code_categories(data_test, "label", columns_for_code)
scoring_method = MIG(data=data_coded)

# Hill-climbing structure search with caching of local scores switched off.
est = HillClimbSearch(
    data=data_coded,
    scoring_method=scoring_method,
    use_cache=False,
)
estimated_model = est.estimate(
    max_indegree=4,
    max_iter=int(1e4),
    epsilon=1e-4,
)
print(estimated_model.nodes(), estimated_model.edges())

['Period', 'Tectonic regime', 'Gross', 'Netpay'] [('Tectonic regime', 'Gross'), ('Tectonic regime', 'Netpay'), ('Tectonic regime', 'Period')]


In [5]:
# Example using every column of the dataset as a node.
# NOTE(review): the output saved with this cell is a KeyboardInterrupt, so the
# full-column search appears to have been stopped before finishing -- confirm
# the expected runtime before relying on a Restart & Run All.
columns = orig_data.columns
data_test = orig_data[columns]

# Split the columns by the type get_nodes_type reports for them; only the
# 'disc' list is passed on to code_categories below.
node_type = get_nodes_type(data_test)
columns_for_discrete = [param for param in columns if node_type[param] == 'cont']
columns_for_code = [param for param in columns if node_type[param] == 'disc']

# Code the 'disc' columns with the "label" method and score with MIG.
data_coded, code_dict = code_categories(data_test, "label", columns_for_code)
scoring_method = MIG(data=data_coded)

# Hill-climbing structure search with caching of local scores switched off.
est = HillClimbSearch(
    data=data_coded,
    scoring_method=scoring_method,
    use_cache=False,
)
estimated_model = est.estimate(
    max_indegree=4,
    max_iter=int(1e4),
    epsilon=1e-4,
)
print(estimated_model.nodes(), estimated_model.edges())

KeyboardInterrupt: 