# Team Members
1. AXB210119	Abhinava Bharamasagara Nanjundaiah
2. HXD220007	Harsha Priya Daggubati
3. PXP210104	Pritika Priyadarshini

In [2]:
!git clone https://github.com/vkomaragiri/VEC.git
%cd ./VEC/
!pip install igraph
!pip install Cython
!pip install .
%cd ../

fatal: destination path 'VEC' already exists and is not an empty directory.
/home/abhinava/VEC
Processing /home/abhinava/VEC
Building wheels for collected packages: TPM
  Building wheel for TPM (setup.py) ... [?25ldone
[?25h  Created wheel for TPM: filename=TPM-0.0.0-cp38-cp38-linux_x86_64.whl size=2755801 sha256=fc383dd29a902b20b25dc2c69cd60cd7e592d690605d0225dc8c2f5fd5d4651d
  Stored in directory: /tmp/pip-ephem-wheel-cache-xou7i1_w/wheels/44/34/0d/41a430ec86534abcc318b33aa547d7cc734f7be087dcf3badc
Successfully built TPM
Installing collected packages: TPM
  Attempting uninstall: TPM
    Found existing installation: TPM 0.0.0
    Uninstalling TPM-0.0.0:
      Successfully uninstalled TPM-0.0.0
Successfully installed TPM-0.0.0
/home


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from MN import MN 
from BTP import BTP
import time

/


In [4]:
class Data:
    """In-memory view of one sample ``.data`` file.

    Attributes populated by ``__init__``:
        nvars: integer read from the first line of the file.
        evid_var_ids / query_var_ids / hidden_var_ids: int32 arrays read from
            lines 2-4 (the first token of each line is dropped).
        nproblems: number of sample rows that follow the header.
        evid_assignments: int32 array, one row per sample, one column per
            evidence variable.
        query_assignments: int32 array, one row per sample, one column per
            query variable.
        weights: float array holding the last token of every sample row.
        hidden_assignments: initialised to an empty list; never filled here.
    """

    def __init__(self, fpath):
        """Parse the file at ``fpath``.

        Layout as consumed by this parser:
            line 1      : integer -> ``nvars``
            lines 2-4   : whitespace-separated ints; first token is skipped
                          (presumably a count - confirm against the file
                          spec), the rest are evidence / query / hidden ids
            line 5      : read and ignored
            line 6      : integer -> ``nproblems``
            next lines  : one sample per line, parsed as floats. Evidence
                          values sit at positions 1, 3, 5, ..., query values
                          at the following odd positions, and the last token
                          is the weight.

        Raises:
            OSError: if ``fpath`` cannot be opened.
            ValueError: if a header or sample token is not numeric.
        """
        # The context manager guarantees the handle is released even when
        # parsing fails half-way through the file.
        with open(fpath, "r") as f:
            self.nvars = int(f.readline())  # 1

            line = np.asarray(f.readline().split(), dtype=np.int32)  # 2
            self.evid_var_ids = line[1:]
            # Positions of the evidence values inside a sample row.
            evid_indices = range(1, self.evid_var_ids.shape[0]*2, 2)

            line = np.asarray(f.readline().split(), dtype=np.int32)  # 3
            self.query_var_ids = line[1:]
            # Query values follow directly after the evidence section.
            query_indices = range(
                self.evid_var_ids.shape[0]*2+1,
                (self.evid_var_ids.shape[0]+self.query_var_ids.shape[0])*2, 2)

            line = np.asarray(f.readline().split(), dtype=np.int32)  # 4
            self.hidden_var_ids = line[1:]

            line = f.readline()  # 5 (skipped)
            self.nproblems = int(f.readline())  # 6

            self.evid_assignments = []
            self.query_assignments = []
            self.weights = []
            for i in range(self.nproblems):
                line = np.asarray(f.readline().split(), dtype=float)
                self.evid_assignments.append(np.asarray(
                    line[evid_indices], dtype=np.int32))
                self.query_assignments.append(np.asarray(
                    line[query_indices], dtype=np.int32))
                self.weights.append(line[-1])

        self.evid_assignments = np.asarray(self.evid_assignments)
        self.query_assignments = np.asarray(self.query_assignments)
        self.weights = np.asarray(self.weights)
        self.hidden_assignments = []

    def convertToXYWithH(self, hidden_assignments):
        """Return ``(X, y)`` where X is the evidence matrix with
        ``hidden_assignments`` appended column-wise and y is the query matrix.

        ``hidden_assignments`` must have one row per sample so that it can be
        concatenated along axis 1.
        """
        return (np.concatenate((self.evid_assignments, hidden_assignments), axis=1), self.query_assignments)

    def convertToXY(self):
        """Return ``(evid_assignments, query_assignments)`` as ``(X, y)``."""
        return (self.evid_assignments, self.query_assignments)

    def convertResults(self, query_predictions, removed_qvars):
        """Interleave query variable ids with predicted values.

        Args:
            query_predictions: 2-D array, one row per sample and one column
                per query variable that remains after the removal below.
            removed_qvars: indices (positions, as accepted by ``np.delete``)
                to drop from ``self.query_var_ids``.

        Returns:
            Integer array whose rows read
            ``[count, id_0, pred_0, id_1, pred_1, ...]``.

        Side effect:
            ``self.query_var_ids`` is permanently replaced by the reduced id
            array, so calling this twice with a non-empty ``removed_qvars``
            removes ids twice.
        """
        self.query_var_ids = np.delete(self.query_var_ids, removed_qvars)
        out = np.zeros(
            (query_predictions.shape[0], 1+2*self.query_var_ids.shape[0]), dtype=int)
        out[:, 2::2] = query_predictions[:, :]
        out[:, 1::2] = self.query_var_ids
        out[:, 0] = self.query_var_ids.shape[0]
        return out

    def computeLogProb(self, mn, order, X, y):
        """Return ``log10(BTP(mn, order).getPR())`` for every sample row.

        For row ``i`` the first ``len(self.evid_var_ids)`` columns of ``X``
        are set as evidence on the evidence variables and the columns of
        ``y`` as evidence on the query variables before the BTP object is
        built. ``mn`` is modified in place through ``setEvidence``.
        NOTE(review): ``getPR`` presumably yields the probability of the
        current evidence - confirm against the BTP implementation.
        """
        out = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            for j in range(len(self.evid_var_ids)):
                mn.setEvidence(self.evid_var_ids[j], X[i][j])
            for j in range(y.shape[1]):
                mn.setEvidence(self.query_var_ids[j], y[i][j])
            btp = BTP(mn, order)
            out[i] = np.log10(btp.getPR())
        return out

    @staticmethod
    def computeErr(true_ll, pred_ll):
        """Return ``sum(true_ll) - sum(pred_ll)``."""
        return np.sum(true_ll)-np.sum(pred_ll)

    @staticmethod
    def computeScore(err, max_err):
        """Return ``100 * (1 - err / max_err)`` clipped below at 0."""
        return np.max((0, 100*(1.0-err/max_err)))

In [11]:
# Folder and base name of the sample; the .data, .order, .uai and
# .new_features files used in this notebook all share this base name.
data_directory = './content/MLC/'
dname = 'Sample_1_MLC_2022'
# Parse the '.data' file into evidence / query assignments and weights.
data = Data(f'{data_directory}{dname}.data')

In [12]:
# Evidence assignments are the features (X), query assignments the labels (y).
X, y = data.convertToXY()
# Hold out 33% of the samples. random_state is pinned so that the shuffle -
# and every result derived from it - is identical after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [13]:
# Space-separated integer ordering stored next to the sample file; it is the
# order handed to BTP inside generate_features.
order_path = data_directory+dname+'.order'
load_order = np.loadtxt(
    order_path, delimiter=' ', dtype=np.int32).astype(np.int32)

In [None]:
def generate_features(ev_id, ev_ass, q_id, q_ass):
    """Derive one 0/1 value per hidden variable for a single sample.

    A fresh network is read from the ``.uai`` file, the given evidence and
    query assignments are set on it, and an upward BTP pass is run. The
    resulting bucket functions are then walked in reverse elimination order;
    for each variable that owns at least one function, the value (0 or 1)
    with the larger maximum potential entry is chosen and immediately set as
    evidence so that later variables see it.

    Relies on the notebook globals ``data_directory``, ``dname``,
    ``load_order`` and ``data``.

    Args:
        ev_id: evidence variable ids.
        ev_ass: evidence values, aligned with ``ev_id``.
        q_id: query variable ids.
        q_ass: query values, aligned with ``q_id``.

    Returns:
        List with one entry per assigned variable, placed at that variable's
        position in ``data.hidden_var_ids``.

    Raises:
        AssertionError: if a variable is registered without any function.
    """
    mn = MN()
    mn.read(data_directory+dname+'.uai')
    for j in range(len(ev_id)):
        mn.setEvidence(ev_id[j], ev_ass[j])
    for j in range(len(q_id)):
        mn.setEvidence(q_id[j], q_ass[j])

    btp = BTP(mn, load_order)
    btp.performUpwardPass()

    # Position of every variable in the order, computed once instead of
    # rebuilding list(btp.order) and scanning it for every variable id.
    order = list(btp.order)
    position = {}
    for pos, var in enumerate(order):
        position.setdefault(var, pos)  # first occurrence, like list.index

    store_all = {}
    n_buckets = len(btp.buckets)
    # Buckets are stored in elimination order; walk them back to front.
    for i, bucket in enumerate(reversed(btp.buckets)):
        # Empty buckets contribute nothing.
        if len(bucket) == 0:
            continue
        # Variable that owns this bucket (same for every function in it).
        bucket_id = btp.order[n_buckets - i - 1]
        for func in bucket:
            store_all.setdefault(bucket_id, [])
            # File the function under whichever of its variables comes first
            # in the order, whether or not that is bucket_id; otherwise a
            # hidden variable could be left without an assignment.
            earliest = min(func.getVarIDs(), key=position.__getitem__)
            store_all.setdefault(earliest, []).append(func)

    hidden_assignments = {}
    for key in reversed(order):
        funcs = store_all.get(key)
        if funcs is None:
            # No function filed here: it is either a query or evidence variable.
            continue
        if len(funcs) == 0:
            # The original `assert ("should not come here")` tested a
            # non-empty string and therefore could never fire.
            raise AssertionError("should not come here")
        max_val_0 = []
        max_val_1 = []
        for func in funcs:
            if len(func.getVarIDs()) > 1:
                # NOTE(review): instantiateEvid presumably reduces the
                # function to the not-yet-assigned variable - confirm.
                solved_func = func.instantiateEvid()
                max_val_0.append(solved_func.getPotential()[0])
                max_val_1.append(solved_func.getPotential()[1])
            else:
                max_val_0.append(func.getPotential()[0])
                max_val_1.append(func.getPotential()[1])

        m0 = max(max_val_0)
        m1 = max(max_val_1)

        # Ties go to 0.
        hidden_assignments[key] = 1 if m1 > m0 else 0
        # Fix the chosen value so the remaining variables are conditioned on it.
        mn.setEvidence(key, hidden_assignments[key])

    # Column of each hidden variable in the output, computed once.
    hidden_slot = {}
    for slot, var in enumerate(data.hidden_var_ids):
        hidden_slot.setdefault(var, slot)

    new_features = [0] * len(hidden_assignments)
    for key, value in hidden_assignments.items():
        new_features[hidden_slot[key]] = value
    return new_features

In [None]:
# Build the hidden-variable feature matrix: one row per problem, one column
# per hidden variable, written to the '.new_features' file.
features_path = data_directory+dname+'.new_features'


def _write_features():
    """Dump the current ``data_set`` matrix as space-separated integers."""
    np.savetxt(fname=features_path, X=data_set, fmt='%d', delimiter=' ')


data_set = np.zeros((data.nproblems, len(data.hidden_var_ids)))
start = time.time()
for index, evid_row in enumerate(data.evid_assignments):
    each = time.time()
    hidden_assignments = generate_features(
        data.evid_var_ids, evid_row,
        data.query_var_ids, data.query_assignments[index])

    data_set[index, :] = hidden_assignments
    # Intermediate dump every 500 problems (presumably so that a long run
    # can be inspected or recovered part-way).
    if index % 500 == 0:
        _write_features()
    print(index, "Done in ", time.time() - each)
_write_features()
print("Total Time: ", time.time() - start)