# Team Members
1. AXB210119	Abhinava Bharamasagara Nanjundaiah
2. HXD220007	Harsha Priya Daggubati
3. PXP210104	Pritika Priyadarshini

In [1]:
!git clone https://github.com/vkomaragiri/VEC.git
%cd ./VEC/
!pip install igraph
!pip install Cython
!pip install .
%cd ../

fatal: destination path 'VEC' already exists and is not an empty directory.
/home/abhinava/Multi-Class-Bayesian/VEC
Processing /home/abhinava/Multi-Class-Bayesian/VEC
Building wheels for collected packages: TPM
  Building wheel for TPM (setup.py) ... [?25ldone
[?25h  Created wheel for TPM: filename=TPM-0.0.0-cp38-cp38-linux_x86_64.whl size=2755803 sha256=450c234071af24912796f4fe3296930687c39da68eeb6b8e60aa0583ae5ad926
  Stored in directory: /tmp/pip-ephem-wheel-cache-nv3zjqzk/wheels/2e/d5/44/c81e6a35fc73ef0bcff6cb95c982a259c326a0c801c11cfb8e
Successfully built TPM
Installing collected packages: TPM
  Attempting uninstall: TPM
    Found existing installation: TPM 0.0.0
    Uninstalling TPM-0.0.0:
      Successfully uninstalled TPM-0.0.0
Successfully installed TPM-0.0.0
/home/abhinava/Multi-Class-Bayesian


In [None]:
import copy
import pickle

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier

from BTP import BTP
from MN import MN

Mounted at /content/drive


In [None]:
class Data:
    """Parsed contents of a ``.data`` problem file plus evaluation helpers.

    File layout as read by ``__init__`` (one item per line):
      1. number of variables
      2. ``<count> <evidence variable ids...>``
      3. ``<count> <query variable ids...>``
      4. ``<count> <hidden variable ids...>``
      5. a line that is skipped
      6. number of problems
      then one line per problem of alternating ``<var id> <value>`` pairs
      (evidence pairs first, query pairs next) ending with a weight.

    Attributes set by ``__init__``:
      nvars, nproblems            -- ints read from lines 1 and 6
      evid_var_ids, query_var_ids, hidden_var_ids -- int32 id arrays
      evid_assignments            -- int32 array, one row per problem
      query_assignments           -- int32 array, one row per problem
      weights                     -- float array, last token of each problem line
      hidden_assignments          -- initialised to an empty list
    """

    def __init__(self, fpath):
        """Read and parse the problem file at ``fpath``."""
        # ``with`` closes the handle even when parsing raises.
        with open(fpath, "r") as f:
            self.nvars = int(f.readline())  # 1

            # The first token of each id line is a count; the ids follow it.
            line = np.asarray(f.readline().split(), dtype=np.int32)  # 2
            self.evid_var_ids = line[1:]

            line = np.asarray(f.readline().split(), dtype=np.int32)  # 3
            self.query_var_ids = line[1:]

            line = np.asarray(f.readline().split(), dtype=np.int32)  # 4
            self.hidden_var_ids = line[1:]

            f.readline()  # 5 (skipped)
            self.nproblems = int(f.readline())  # 6

            # Values sit at the odd positions of each problem line: evidence
            # values first, then query values; the final token is the weight.
            n_evid = self.evid_var_ids.shape[0]
            n_query = self.query_var_ids.shape[0]
            evid_values = slice(1, 2*n_evid, 2)
            query_values = slice(2*n_evid+1, 2*(n_evid+n_query), 2)

            evid_rows = []
            query_rows = []
            weights = []
            for _ in range(self.nproblems):
                line = np.asarray(f.readline().split(), dtype=float)
                evid_rows.append(np.asarray(line[evid_values], dtype=np.int32))
                query_rows.append(np.asarray(line[query_values], dtype=np.int32))
                weights.append(line[-1])

        self.evid_assignments = np.asarray(evid_rows)
        self.query_assignments = np.asarray(query_rows)
        self.weights = np.asarray(weights)
        self.hidden_assignments = []

    def convertToXYWithH(self, hidden_assignments):
        """Return ``(X, y)`` where X is the evidence columns followed by the
        columns of ``hidden_assignments`` and y is the query assignments.

        ``hidden_assignments`` must have one row per problem so that it can be
        concatenated column-wise with the evidence.
        """
        return (np.concatenate((self.evid_assignments, hidden_assignments), axis=1), self.query_assignments)

    def convertToXY(self):
        """Return ``(evidence assignments, query assignments)``."""
        return (self.evid_assignments, self.query_assignments)

    def convertResults(self, query_predictions, removed_qvars):
        """Format predictions as rows of ``<count> <var id> <value> ...``.

        ``removed_qvars`` are positions (not ids) deleted from
        ``self.query_var_ids`` before formatting.

        NOTE: this permanently overwrites ``self.query_var_ids`` with the
        pruned ids, so calling it twice with a non-empty ``removed_qvars``
        removes entries twice.
        """
        self.query_var_ids = np.delete(self.query_var_ids, removed_qvars)
        out = np.zeros(
            (query_predictions.shape[0], 1+2*self.query_var_ids.shape[0]), dtype=int)
        out[:, 2::2] = query_predictions[:, :]
        out[:, 1::2] = self.query_var_ids
        out[:, 0] = self.query_var_ids.shape[0]
        return out

    def computeLogProb(self, mn, order, X, y):
        """Return one base-10 log of ``BTP(mn, order).getPR()`` per row.

        For row i the first ``len(self.evid_var_ids)`` columns of ``X`` are set
        as evidence on ``mn`` (any further columns are ignored), then column j
        of ``y`` is set as evidence on ``self.query_var_ids[j]``.

        NOTE: column j of ``y`` is always paired with ``self.query_var_ids[j]``;
        if columns were dropped from ``y``, ``self.query_var_ids`` must have
        been pruned the same way or values land on the wrong variables.
        """
        out = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            for j in range(len(self.evid_var_ids)):
                mn.setEvidence(self.evid_var_ids[j], X[i][j])
            for j in range(y.shape[1]):
                mn.setEvidence(self.query_var_ids[j], y[i][j])
            # A new BTP is built for every sample after its evidence is set.
            btp = BTP(mn, order)
            out[i] = np.log10(btp.getPR())
        return out

    @staticmethod
    def computeErr(true_ll, pred_ll):
        """Return ``sum(true_ll) - sum(pred_ll)``."""
        return np.sum(true_ll)-np.sum(pred_ll)

    @staticmethod
    def computeScore(err, max_err):
        """Return ``100 * (1 - err/max_err)``, floored at 0."""
        return np.max((0, 100*(1.0-err/max_err)))


In [None]:
# Folder holding the dataset files and the base name they all share.
data_directory = './content/MLC/'
dname = 'Sample_1_MLC_2022'

# Parse the evidence / query assignments from "<data_directory><dname>.data".
data = Data(''.join([data_directory, dname, '.data']))

In [None]:
# Extra features, one row per problem, read from "<dname>.new_features".
hidden_assignments = np.loadtxt(
    data_directory + dname + '.new_features',
    delimiter=' ',
    dtype=np.int32,
)
print("hidden_assignments: \n", hidden_assignments)

# Feature matrix with the hidden columns appended to the evidence columns.
X_with_hidden, Y_with_hidden = data.convertToXYWithH(hidden_assignments)
X_train_hidden, X_test_hidden, Y_train_hidden, Y_test_hidden = train_test_split(
    X_with_hidden,
    Y_with_hidden,
    test_size=0.33,
    shuffle=False,
)
print("X_train_hidden len: ", len(X_train_hidden[0]))

# Evidence-only feature matrix. shuffle=False in both splits keeps the same
# rows in train / test for the two feature sets.
X_old, y_old = data.convertToXY()
X_train_old, X_test_old, y_train_old, y_test_old = train_test_split(
    X_old,
    y_old,
    test_size=0.33,
    shuffle=False,
)
print("X_train_old len: ",len(X_train_old[0]))

hidden_assignments: 
 [[0 1 1 ... 0 1 1]
 [0 1 1 ... 1 1 0]
 [1 0 0 ... 0 0 1]
 ...
 [1 1 1 ... 0 1 1]
 [1 1 0 ... 0 1 0]
 [1 0 1 ... 0 1 1]]
X_train_hidden len:  808
X_train_old len:  400


In [None]:
# Label cleaning: a query column that takes a single value in the training
# split cannot be learned by a classifier, so drop it from every label matrix.
col_to_remove = []
for col_idx, column in enumerate(y_train_old.T):
    if np.unique(column).size == 1:
        print("useless column as it has one class", column)
        print(col_idx)
        col_to_remove.append(col_idx)

# Remove the same column positions from all four label matrices.
Y_train_hidden, Y_test_hidden, y_train_old, y_test_old = (
    np.delete(labels, col_to_remove, axis=1)
    for labels in (Y_train_hidden, Y_test_hidden, y_train_old, y_test_old)
)

In [None]:
# Baseline: one logistic regression per query column, evidence features only.
base_logistic = MultiOutputClassifier(
    LogisticRegression(max_iter=1000, n_jobs=-1)
)
base_logistic.fit(X_train_old, y_train_old)
base_logistic_pred = base_logistic.predict(X_test_old)

In [None]:
# A neural network (MLP) gave little change in results, so it is left disabled:
# neural_network = MLPClassifier(max_iter=1000).fit(X_train_hidden, Y_train_hidden)
# neural_network_pred = neural_network.predict(X_test_hidden)

In [None]:
# One logistic regression per query column, trained on evidence + hidden features.
logistic = MultiOutputClassifier(
    LogisticRegression(max_iter=1000, n_jobs=-1)
)
logistic.fit(X_train_hidden, Y_train_hidden)
logistic_pred = logistic.predict(X_test_hidden)

In [None]:
# Shallow random forest (one per query column) on the evidence features only.
# Its predictions serve as the reference ("trivial") error in the score cells.
# random_state is fixed so the forest - and every score derived from it - is
# identical on each Restart & Run All.
random_forest = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=10, max_depth=2, n_jobs=-1, random_state=0)
).fit(X_train_old, y_train_old)
random_forest_pred = random_forest.predict(X_test_old)

In [None]:
# Random forest (one per query column) on the evidence + hidden features.
# random_state is fixed so the forest - and every score derived from it - is
# identical on each Restart & Run All.
random_forest_hidden = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=10, max_depth=5, n_jobs=-1, random_state=0)
).fit(X_train_hidden, Y_train_hidden)
random_forest_hidden_pred = random_forest_hidden.predict(X_test_hidden)

In [None]:
# Persist the four fitted models under "<data_directory>model/".
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes (the directory itself must already exist).
models_to_save = {
    '.random_forest_hidden_model': random_forest_hidden,
    '.random_forest_base_model': random_forest,
    '.logistic_hidden_model': logistic,
    '.logistic_basic_model': base_logistic,
}
for model_suffix, fitted_model in models_to_save.items():
    with open(data_directory+"model/"+dname+model_suffix, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

In [None]:
# Random permutation of all variable ids, passed to BTP as `order`
# (presumably the variable elimination order - confirm against the VEC docs).
# A seeded generator makes the permutation the same on every run.
order_rng = np.random.default_rng(0)
order = order_rng.permutation(data.nvars).astype(np.int32)

In [None]:
# running for only sub-sample of test
nTest = 10 #len(y_test_old)
mn = MN()
mn.read(data_directory+dname+'.uai')

# computeLogProb pairs column j of the labels with data.query_var_ids[j].
# Label cleaning removed the columns listed in col_to_remove from every label
# matrix, so the ids must be pruned the same way; otherwise values are set on
# the wrong query variables. A shallow copy keeps `data` itself untouched,
# which keeps this cell safe to re-run.
eval_data = copy.copy(data)
if data.query_var_ids.shape[0] != y_test_old.shape[1]:
    eval_data.query_var_ids = np.delete(data.query_var_ids, col_to_remove)
assert eval_data.query_var_ids.shape[0] == y_test_old.shape[1], (
    "query variable ids and label columns are out of sync; "
    "re-run the notebook from the data-loading cell"
)

lProb_true = eval_data.computeLogProb(mn, order, X_test_old[:nTest, :], y_test_old[:nTest, :])
lProb_trivial = eval_data.computeLogProb(mn, order, X_test_old[:nTest, :], random_forest_pred[:nTest, :])
lProb_base_lr = eval_data.computeLogProb(mn, order, X_test_old[:nTest, :], base_logistic_pred[:nTest, :])
# Only the leading evidence columns of X are read by computeLogProb, so the
# hidden-feature matrices can be passed as-is.
lProb_pred = eval_data.computeLogProb(mn, order, X_test_hidden[:nTest, :], logistic_pred[:nTest, :])
# lProb_nn_hidden = data.computeLogProb(mn, order, X_test_hidden[:nTest, :], random_forest_hidden_pred[:nTest, :])
lProb_hidden_trivial = eval_data.computeLogProb(mn, order, X_test_hidden[:nTest, :], random_forest_hidden_pred[:nTest, :])

In [None]:
# Error of each model = summed true log-probability minus summed predicted
# log-probability over the evaluated test rows.
hiddenLRErr = Data.computeErr(lProb_true, lProb_pred)
maxTrivialErr = Data.computeErr(lProb_true, lProb_trivial)
maxHiddenErr = Data.computeErr(lProb_true, lProb_hidden_trivial)
maxBaseLRErr = Data.computeErr(lProb_true, lProb_base_lr)
# maxNNErr = Data.computeErr(lProb_true, lProb_nn_hidden)

error_rows = (
    ("Error with Random Forest `with` Hidden Assignments: \t\t\t", maxHiddenErr),
    ("Error with Random Forest `without` Hidden Assignments: \t\t\t", maxTrivialErr),
    ("Error with Logistic Regression `with` Hidden Assignments: \t\t", hiddenLRErr),
    ("Error with Logistic Regression `without` Hidden Assignments: \t\t", maxBaseLRErr),
    # ("Error with NN `with` Hidden Assignments: \t\t\t", maxNNErr),
)
for row_label, row_err in error_rows:
    print(row_label, row_err)
print()

# Score = 100 * (1 - err / reference_err), floored at 0; each row compares a
# logistic-regression error against a random-forest reference error.
score_rows = (
    ("Score LR `without` hidden vs Random Forest `without` Hidden Assignments: ", maxBaseLRErr, maxTrivialErr),
    ("Score LR `with` hidden vs Random Forest `without` Hidden Assignments: \t", hiddenLRErr, maxTrivialErr),
    ("Score LR `with` hidden vs Random Forest `with` hidden Assignments: \t", hiddenLRErr, maxHiddenErr),
    # ("Score NN `with` hidden vs Random Forest `without` hidden Assignments: \t", maxNNErr, maxTrivialErr),
)
for row_label, model_err, reference_err in score_rows:
    print(row_label, Data.computeScore(model_err, reference_err))