# **UAI Competition 2022 - Example solver for the MLC task**

## **Part-1: Build solver**

We will load the data and use it to build our solver.

### Download data

### Import libraries

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

### Load data

---

The data consists of three sets of variables:

1.   Evidence (observed) variables - X
2.   Hidden variables - H
3.   Query variables - Y


The data loader class *Data* reads the data and partitions it accordingly. 



Helper functions:

```convertToXY()```: This function returns (X, Y) from the .data file

In [4]:
class Data:
  """Reader for a UAI-2022 MLC ``.data`` file.

  File layout, as consumed by ``__init__``:
    line 1     : number of variables
    line 2     : evidence (observed) variable ids; the first token is skipped
    line 3     : query variable ids; the first token is skipped
    line 4     : hidden variable ids; the first token is skipped
    line 5     : read and ignored
    line 6     : number of problems (samples)
    lines 7... : one sample per line. Evidence values are taken from token
                 positions 1, 3, 5, ...; query values from the odd positions
                 that follow; the last token is the weight.

  Attributes:
    nvars: Number of variables (line 1).
    evid_var_ids: Indices of the observed variables.
    query_var_ids: Indices of the query variables.
    hidden_var_ids: Indices of the hidden variables.
    nproblems: Number of samples in the file.
    evid_assignments: Assignments to the evidence variables, one row per sample.
    query_assignments: Assignments to the query variables, one row per sample.
    weights: Pr(e, q) for each sample (last token of each sample line).
  """

  def __init__(self, fpath):
    """Parse the .data file at ``fpath`` (file path of the .data file)."""
    # The context manager guarantees the handle is closed, even if parsing fails.
    with open(fpath, "r") as f:
      self.nvars = int(f.readline())  # line 1

      # Lines 2-4 each hold a leading token (skipped) followed by variable ids.
      self.evid_var_ids = np.asarray(f.readline().split(), dtype=np.int32)[1:]
      self.query_var_ids = np.asarray(f.readline().split(), dtype=np.int32)[1:]
      self.hidden_var_ids = np.asarray(f.readline().split(), dtype=np.int32)[1:]

      f.readline()  # line 5 is not used
      self.nproblems = int(f.readline())  # line 6

      n_evid = self.evid_var_ids.shape[0]
      n_query = self.query_var_ids.shape[0]
      # Values sit at the odd token positions: evidence first, then query.
      evid_indices = np.arange(1, 2 * n_evid, 2)
      query_indices = np.arange(2 * n_evid + 1, 2 * (n_evid + n_query), 2)

      evid_rows, query_rows, weights = [], [], []
      for _ in range(self.nproblems):
        tokens = np.asarray(f.readline().split(), dtype=float)
        evid_rows.append(np.asarray(tokens[evid_indices], dtype=np.int32))
        query_rows.append(np.asarray(tokens[query_indices], dtype=np.int32))
        weights.append(tokens[-1])

    self.evid_assignments = np.asarray(evid_rows)
    self.query_assignments = np.asarray(query_rows)
    self.weights = np.asarray(weights)

  def convertToXY(self):
    """Return ``(X, Y)``: the evidence and query assignment arrays."""
    return (self.evid_assignments, self.query_assignments)

  def convertResults(self, query_predictions):
    """Lay predictions out as rows of ``[n_query, id_0, pred_0, id_1, pred_1, ...]``.

    Args:
      query_predictions: 2-D array with one row per sample and one column per
        query variable, in the order of ``self.query_var_ids``.

    Returns:
      Integer array of shape ``(n_samples, 1 + 2 * n_query)``.
    """
    out = np.zeros((query_predictions.shape[0], 1+2*self.query_var_ids.shape[0]), dtype=int)
    out[:, 2::2] = query_predictions[:, :]
    out[:, 1::2] = self.query_var_ids
    out[:, 0] = self.query_var_ids.shape[0]
    return out

In [7]:
# Location and base name of the dataset. The notebook builds file names as
# data_directory + dname + '<extension>' (.data, .pred, .uai, .order).
# NOTE(review): absolute, environment-specific path -- adjust when running elsewhere.
data_directory = '/content/MLC/'
dname = 'Sample_3_MLC_2022'

In [8]:
# Peek at the first two lines of the .data file: the variable count and the
# evidence-variable line. The context manager closes the handle afterwards.
with open(data_directory+dname+'.data','r') as f:
  nvars = int(f.readline())
  line = np.asarray(f.readline().split(), dtype=np.int32)

In [9]:
# Read every line of the .data file into a list; the context manager closes
# the handle instead of leaving it open for the life of the kernel.
with open(data_directory+dname+'.data','r') as f:
  x=f.readlines()

In [10]:
# Total number of lines in the .data file (header lines plus one line per sample).
len(x)

10006

In [11]:
# Drop the first token of the second file line; the rest are the evidence
# variable ids (the same slicing Data.__init__ applies). Display how many there are.
evid_var_ids = line[1:]
evid_var_ids.shape

(358,)

In [12]:
# Parse the whole .data file into evidence/query assignments and weights.
data = Data(data_directory+dname+'.data')

In [14]:
#Getting Evidence and Query data into X, y
X, y = data.convertToXY()

# Fixed seed: without it the split is different on every run, so the
# predictions, errors and score below could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Number of evidence variables (features) per sample.
print(len(X_train[0]))

358


### Train solver: Logistic Regression

In [15]:
# Solver: MultiOutputClassifier wraps LogisticRegression so that a separate
# classifier is fitted for each column of y_train (i.e. each query variable).
clf = MultiOutputClassifier(LogisticRegression(max_iter=1000)).fit(X_train, y_train)

### Predict Query Assignments

In [16]:
# Predicted query assignments for the held-out evidence rows.
y_pred = clf.predict(X_test)

Store the query assignments in a file. **Note: this is the file to submit as the result.**

In [17]:
# Format each prediction row as [n_query, id_0, value_0, id_1, value_1, ...]
# and write the rows as space-separated integers to <data_directory><dname>.pred.
# Only the rows of X_test are written, because y_pred was computed from X_test.
results_in_format = data.convertResults(y_pred)
np.savetxt(X=results_in_format, delimiter=' ', fmt='%d', fname=data_directory+dname+'.pred')

$\;\;\;\;\;\;$

$\;\;\;\;\;\;$






## **Part-2: Test solver**

Once we have trained the solver, we want to test how good it is.

For a given evidence $E=e$, let $Q=\widehat{q}$ denote the solver prediction and $Q=q$ denote the ground truth value.

$$Err = \log \frac{\prod_{i\in Data} \Pr(e^{(i)}, q^{(i)})}{\prod_{i\in Data} \Pr(e^{(i)}, \widehat{q}^{(i)})}$$

Let $MaxErr$ denote the $Err$ for a trivial solver.
Then,
$$Score = \max\left(0,\; 100\left(1-\frac{Err}{MaxErr}\right)\right)$$

### Using Random Forests as the trivial solver

In [83]:
# Trivial baseline: a small, shallow random forest per query variable.
# random_state is fixed so MaxErr (and therefore the score) is reproducible.
# Note: this rebinds `clf`, replacing the logistic-regression solver fitted earlier.
clf = MultiOutputClassifier(RandomForestClassifier(n_estimators = 10, max_depth=2, random_state=0)).fit(X_train, y_train)

In [84]:
# Baseline predictions on the same held-out rows; used below to compute MaxErr.
y_trivial = clf.predict(X_test)

### Load Variable Elimination Code

In [23]:
# Fetch the VEC repository into ./VEC.
# NOTE(review): presumably this provides the MN and BTP modules imported later -- confirm.
!git clone https://github.com/vkomaragiri/VEC.git

Cloning into 'VEC'...
remote: Enumerating objects: 111, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 111 (delta 30), reused 8 (delta 2), pack-reused 39
Receiving objects: 100% (111/111), 82.04 MiB | 1.23 MiB/s, done.
Resolving deltas: 100% (37/37), done.


In [56]:
# Change the kernel's working directory into the cloned repository
# (relies on IPython automagic; equivalent to `%cd ./VEC/`).
cd ./VEC/

/home/abhinava/Multi-Class-Bayesian/VEC


In [25]:
!pip install igraph
!pip install Cython

Collecting igraph
  Downloading igraph-0.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.1 MB/s eta 0:00:01
Installing collected packages: igraph
Successfully installed igraph-0.10.2
Collecting Cython
  Downloading Cython-0.29.32-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 610 kB/s eta 0:00:01
[?25hInstalling collected packages: Cython
Successfully installed Cython-0.29.32


In [60]:
!pip install .
!cd ../../

### Read the Markov network

In [76]:
# MN and BTP are project modules (not part of the standard library).
# NOTE(review): presumably installed from the VEC repository above -- confirm.
from MN import MN 
from BTP import BTP

import os
# Show the working directory the notebook is currently running from.
print(os.getcwd())
# Load the Markov network definition from the dataset's .uai file.
mn = MN()
mn.read(data_directory+dname+'.uai')

/home/abhinava/Multi-Class-Bayesian


Uncomment and run the next two cells if you want to use min-fill ordering for Variable Elimination. Saving the computed order in the .order file helps speed up computation on later runs.

In [None]:
#temp = BTP(mn)
#temp.getOrder(2)
#np.savetxt(X=np.asarray(temp.order), fname=data_directory+dname+'.order', delimiter=' ', fmt='%d')

In [None]:
#order = np.loadtxt(data_directory+dname+'.order', delimiter=' ', dtype=np.int32)

In [81]:
# Default ordering: the variable indices 0 .. mn.nvars-1 as int32,
# shuffled in place into a random permutation.
order = np.arange(mn.nvars, dtype=np.int32)
np.random.shuffle(order)

### Compute $\log_{10} \Pr(X, y)$

In [77]:
def computeLogProb(X, y):
  """Return log10 of ``BTP(mn, order).getPR()`` for each (evidence, query) row pair.

  Uses the notebook-level ``mn``, ``data``, ``order`` and ``BTP``.

  Args:
    X: 2-D array of evidence assignments; column j belongs to ``data.evid_var_ids[j]``.
    y: 2-D array of query assignments; column j belongs to ``data.query_var_ids[j]``.

  Returns:
    1-D float array with one entry per row of ``X``.
  """
  n_rows = X.shape[0]
  log_probs = np.zeros(n_rows)
  for row in range(n_rows):
    # Every row sets the same evidence and query variables on the shared
    # network, overwriting the values set for the previous row.
    for col, value in enumerate(X[row]):
      mn.setEvidence(data.evid_var_ids[col], value)
    for col, value in enumerate(y[row]):
      mn.setEvidence(data.query_var_ids[col], value)
    # A fresh BTP is built for each row after the evidence is in place.
    log_probs[row] = np.log10(BTP(mn, order).getPR())
  return log_probs

### Compute error and score

In [78]:
def computeErr(true_ll, pred_ll):
  """Return sum(true_ll) - sum(pred_ll): the total log-probability gap.

  Args:
    true_ll: Log-probabilities of the ground-truth assignments.
    pred_ll: Log-probabilities of the predicted assignments.
  """
  total_true = np.sum(true_ll)
  total_pred = np.sum(pred_ll)
  return total_true - total_pred

In [79]:
def computeScore(err, max_err):
  """Return the score max(0, 100 * (1 - err / max_err)).

  Args:
    err: Error of the solver being scored.
    max_err: Error of the trivial (baseline) solver.

  Returns:
    The score as a float, never below 0. When ``max_err`` is 0 the ratio is
    undefined; the score is then 100 if ``err <= 0`` (the solver is at least
    as good as the baseline) and 0 otherwise.
  """
  if max_err == 0:
    # Avoid ZeroDivisionError (Python floats) or nan/inf (NumPy floats).
    return np.float64(100.0 if err <= 0 else 0.0)
  return np.max((0, 100*(1.0-err/max_err)))

In [85]:
# Reload the saved predictions. Each row is [n_query, id_0, value_0, id_1, value_1, ...],
# so the predicted values are columns 2, 4, 6, ...
saved_rows = np.loadtxt(data_directory+dname+'.pred', dtype=int, delimiter=' ')
y_pred = saved_rows[:, 2::2]

# Score only the first ntest held-out rows; computeLogProb builds a BTP for every row.
ntest = 10
X_eval = X_test[:ntest, :]
lprob_true = computeLogProb(X_eval, y_test[:ntest, :])
lprob_pred = computeLogProb(X_eval, y_pred[:ntest, :])
lprob_trivial = computeLogProb(X_eval, y_trivial[:ntest, :])

# err: solver vs ground truth; maxErr: trivial baseline vs ground truth.
err = computeErr(lprob_true, lprob_pred)
maxErr = computeErr(lprob_true, lprob_trivial)

In [86]:
# Error of the solver and error of the trivial baseline.
print(err, maxErr)

0.039193449850017714 44.60461697045889


In [87]:
# Final score: max(0, 100 * (1 - err / maxErr)).
print("Score:", computeScore(err, maxErr))

Score: 99.91213140586774


In [88]:
# NOTE(review): prints the same two values as the earlier err/maxErr cell; likely redundant.
print(err, maxErr)

0.039193449850017714 44.60461697045889


In [None]:
# List the installed package versions of the environment this notebook ran in.
!pip freeze

absl-py==1.3.0
aeppl==0.0.33
aesara==2.7.9
aiohttp==3.8.3
aiosignal==1.3.1
alabaster==0.7.12
albumentations==1.2.1
altair==4.2.0
appdirs==1.4.4
arviz==0.12.1
astor==0.8.1
astropy==4.3.1
astunparse==1.6.3
async-timeout==4.0.2
asynctest==0.13.0
atari-py==0.2.9
atomicwrites==1.4.1
attrs==22.1.0
audioread==3.0.0
autograd==1.5
Babel==2.11.0
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==5.0.1
blis==0.7.9
bokeh==2.3.3
branca==0.6.0
bs4==0.0.1
CacheControl==0.12.11
cached-property==1.5.2
cachetools==5.2.0
catalogue==2.0.8
certifi==2022.9.24
cffi==1.15.1
cftime==1.6.2
chardet==3.0.4
charset-normalizer==2.1.1
click==7.1.2
clikit==0.6.2
cloudpickle==1.5.0
cmake==3.22.6
cmdstanpy==1.0.8
colorcet==3.0.1
colorlover==0.3.0
community==1.0.0b1
confection==0.0.3
cons==0.4.5
contextlib2==0.5.5
convertdate==2.4.0
crashtest==0.3.1
crcmod==1.7
cufflinks==0.17.3
cvxopt==1.3.0
cvxpy==1.2.2
cycler==0.11.0
cymem==2.0.7
Cython==0.29.32
daft==0.0.4
dask==2022.2.0
datascience==0.17.5
db-dtypes==1.0.4
debugpy==1.0.