In [27]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import pandas as pd

import utils.luigi_wrapper as luigi
from utils.utils import *

# from preprocess.questions_label_extraction import QuestionsLabelExtractionTask
# from questions_model.create_predictions import QuestionsMakePredictionsTask
# from preprocess.train_test_split import TrainTestSplitTask
# from preprocess.data_tokenization import DataTokenizationTask
# from preprocess.feature_selection import FeatureSelectionTask
from preprocess.create_dataset import CreateDataSetTask
from preprocess.dataset import DataSet


from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, f1_score

from IPython.display import display

In [28]:
from scipy.sparse.csgraph import minimum_spanning_tree
import networkx as nx

In [29]:
# Use the fully-qualified key. A bare 'max_rows' is treated as a regex over all
# registered options and becomes ambiguous (OptionError) on pandas versions that
# also register 'styler.render.max_rows'.
pd.set_option('display.max_rows', 70)

In [30]:
def get_prior(y):
    """Estimate the pairwise label co-occurrence matrix from a label matrix.

    Args:
        y: 2-D array-like with one row per page and one column per class.
            Values are converted to float before use.

    Returns:
        Square float array with one row/column per class where entry [i, j]
        is the mean over rows of ``y[:, i] * y[:, j]``. For 0/1 labels the
        diagonal therefore holds each class frequency and the off-diagonal
        entries hold the joint frequency of both classes being set.
        A label matrix with zero rows yields an all-zero matrix.
    """
    # Cast first: with boolean input a matrix product would stay boolean
    # instead of counting co-occurrences.
    labels = np.asarray(y, dtype=float)
    num_of_pages, num_of_classes = labels.shape

    if num_of_pages == 0:
        # Avoid 0 / 0; an empty sample contributes nothing to any pair.
        return np.zeros((num_of_classes, num_of_classes))

    # One matrix product sums lst[i] * lst[j] over all rows at once, and a
    # single division avoids the rounding drift of adding 1/n per term.
    return (labels.T @ labels) / num_of_pages

def get_mutual_information(p_ij, nan=0, neginf=-5):
    """Return the negated term ``p_ij * log(p_ij / (p_ii * p_jj))`` per pair.

    Args:
        p_ij: Square matrix whose diagonal entries are used as the two
            marginals of each pair (e.g. the output of ``get_prior``).
        nan: Replacement for NaN entries (produced by ``0 * log(0)`` or
            ``0 / 0``), applied before the final negation.
        neginf: Replacement for ``-inf`` entries, applied before the final
            negation.

    Returns:
        Float array of the same shape as ``p_ij`` holding
        ``-nan_to_num(p_ij * log(p_ij / (p_ii * p_jj)))``.
    """
    p_ij = np.asarray(p_ij, dtype=float)
    marginals = np.diagonal(p_ij)

    # Zero probabilities legitimately produce 0/0 and log(0) here; the
    # resulting NaN / -inf values are replaced below, so silence the warnings
    # for this computation only.
    with np.errstate(divide="ignore", invalid="ignore"):
        E = p_ij * np.log(p_ij / np.outer(marginals, marginals))

    E_filled = -np.nan_to_num(E, nan=nan, neginf=neginf)
    return E_filled

def theta(p_ij, i, j, i_val, j_val):
    """Ratio of a pair's joint frequency to the product of its marginals.

    Reads ``p_ij[i, i]`` and ``p_ij[j, j]`` as the frequencies of class i and
    class j being 1, and ``p_ij[i, j]`` as the frequency of both being 1
    (the layout produced by ``get_prior`` for 0/1 labels). The joint and
    marginal terms for value 0 are derived from those by complement.

    Args:
        p_ij: Square co-occurrence matrix.
        i, j: Class indices.
        i_val, j_val: Requested values (0 or 1) for class i and class j.

    Returns:
        The ratio for the requested value combination, or None when either
        value is neither 0 nor 1.
    """
    i_on, i_off = i_val == 1, i_val == 0
    j_on, j_off = j_val == 1, j_val == 0

    # Unsupported value combinations fall through without touching p_ij.
    if not ((i_on or i_off) and (j_on or j_off)):
        return None

    both = p_ij[i,j]
    marg_i = p_ij[i,i]
    marg_j = p_ij[j,j]

    if i_on and j_on:
        return both / (marg_i * marg_j)
    if i_on:
        # class i set, class j not set
        return (marg_i - both) / (marg_i * (1 - marg_j))
    if j_on:
        # class j set, class i not set
        return (marg_j - both) / (marg_j * (1 - marg_i))
    # neither class set
    return (1 - marg_j - marg_i + both) / ((1 - marg_j) * (1 - marg_i))
    
def get_neg_conn_strength(p_ij):
    """Average the two "exactly one class set" theta ratios for every pair.

    For each (i, j) with both indices below ``p_ij.shape[0]`` the result is
    the mean of ``theta(p_ij, i, j, 1, 0)`` and ``theta(p_ij, i, j, 0, 1)``.

    Args:
        p_ij: Square co-occurrence matrix as consumed by ``theta``.

    Returns:
        Float array with the same shape as ``p_ij``.
    """
    size = p_ij.shape[0]
    strength = np.zeros(p_ij.shape)
    for row, col in np.ndindex(size, size):
        only_row_set = theta(p_ij, row, col, 1, 0)
        only_col_set = theta(p_ij, row, col, 0, 1)
        strength[row, col] = (only_row_set + only_col_set) / 2
    return strength

In [32]:
def get_y_by_thresh(y_proba, th= 0.5):
    """Binarize scores: 1 where the score is strictly above ``th``, else 0."""
    is_above = y_proba > th
    return is_above.astype(int)

def get_thresh(y_true, y_pred, n_thers = 500, plot=False):
    """Pick the cut-off that maximizes macro F1 on the given labels.

    Candidate thresholds are ``k / n_thers`` for ``k = 1 .. n_thers - 1``.
    Each one is applied to ``y_pred`` with ``get_y_by_thresh`` and scored
    against ``y_true`` with macro-averaged F1.

    Args:
        y_true: Reference labels.
        y_pred: Scores to be thresholded.
        n_thers: Number of grid divisions of the unit interval.
        plot: When true, plot F1 against threshold before returning.

    Returns:
        The first candidate threshold that attains the highest F1 score.
    """
    thresholds = []
    f1_scores = []
    for step in range(1, n_thers):
        thresholds.append(step / n_thers)

    for thresh in thresholds:
        y_hat = get_y_by_thresh(y_pred, thresh)
        f1_scores.append(f1_score(y_true, y_hat, average = 'macro'))

    if plot:
        plt.plot(thresholds, f1_scores)
        plt.show()

    best_idx = np.argmax(f1_scores)
    return thresholds[best_idx]

def normalize(y_proba, th= 0.5):
    """Piecewise-linearly rescale scores so that ``th`` maps to 0.5.

    Scores below ``th`` are scaled by ``0.5 / th``; scores at or above ``th``
    are mapped linearly from ``[th, 1]`` onto ``[0.5, 1]``. Entries matching
    neither comparison (NaN) are left at 0.

    Args:
        y_proba: Array of scores.
        th: Threshold that should land on 0.5 after rescaling.

    Returns:
        New float array with the same shape as ``y_proba``.
    """
    below = y_proba < th
    at_or_above = y_proba >= th

    rescaled = np.zeros(y_proba.shape)
    rescaled[below] = (y_proba[below] / th) * 0.5
    rescaled[at_or_above] = ((y_proba[at_or_above] - th) / (1-th)) * 0.5 + 0.5
    return rescaled

# def sigmoid_normalize(y_proba, th= 0.5):
#     y_proba_fixed = np.zeros(y_proba.shape)
#     y_proba_fixed = 1 / (1 + np.exp())
#     return y_proba_fixed

def get_y(y_test, y_proba, do_normalize=False):
    """Binarize per-class scores using a per-class F1-optimal threshold.

    For every column ``c`` of ``y_proba`` a threshold is chosen with
    ``get_thresh(y_test[:, c], y_proba[:, c])`` and applied with
    ``get_y_by_thresh``.

    NOTE(review): the thresholds are tuned against ``y_test`` itself. If that
    really is the held-out split, metrics computed from the returned labels
    on the same split are optimistic -- confirm this is intended.

    Args:
        y_test: 2-D array of reference labels, one column per class.
        y_proba: 2-D array of scores with the same column layout.
        do_normalize: When true, also rescale each column with ``normalize``
            so that its chosen threshold maps to 0.5.

    Returns:
        Tuple ``(y, scores)`` where ``y`` is the binarized label matrix and
        ``scores`` is the rescaled score matrix when ``do_normalize`` is true,
        otherwise the ``y_proba`` object that was passed in.
    """
    labels = []
    rescaled = []
    for c in range(y_proba.shape[1]):
        y_proba_c = y_proba[:, c]
        y_test_c = y_test[:, c]
        th = get_thresh(y_test_c, y_proba_c)
        labels.append(get_y_by_thresh(y_proba_c, th))
        # Only pay for the rescaling when the caller will actually receive it.
        if do_normalize:
            rescaled.append(normalize(y_proba_c, th))

    y = np.array(labels).T
    if do_normalize:
        return y, np.array(rescaled).T
    return y, y_proba

## Read Data

In [33]:
# Load the stored DataSet and expose its full matrices and train/test splits
# as notebook-level names used by the cells below.
data: DataSet = read_data(
    get_file_path('dataset.pickle', 'data')
)

X = data.X
y = data.y

X_train = data.X_train
X_test = data.X_test

y_train = data.y_train
y_test = data.y_test

## Get Prior

In [34]:
# Pairwise label co-occurrence matrix, estimated from the training labels only.
p_ij = get_prior(y_train)

In [50]:
import random

In [58]:
import random
def get_random_tree_no_zeros(E):
    """Build a spanning tree over E's edges using random edge weights.

    Every non-zero entry of ``E`` is replaced by an independent random weight
    while zero entries stay zero, so the tree can only use edges that exist
    in ``E``. The tree is then computed on that random matrix.

    Args:
        E: Square dense weight matrix; zero entries mean "no edge".

    Returns:
        The sparse matrix returned by ``minimum_spanning_tree`` for the
        randomly weighted graph.
    """
    E = np.asarray(E)
    E_rand = np.zeros(E.shape)

    # np.nonzero walks the matrix in row-major order, matching a nested
    # i/j loop, so draws are consumed in a predictable order under a seed.
    rows, cols = np.nonzero(E)
    # 1 - random() lies in (0, 1]: a kept edge can never draw exactly 0,
    # which a dense graph representation would treat as a missing edge.
    E_rand[rows, cols] = [1.0 - random.random() for _ in range(len(rows))]

    # Run the tree on the random weights (not on E itself), otherwise the
    # result is just E's deterministic minimum spanning tree.
    return minimum_spanning_tree(E_rand)

In [59]:
import random
def get_random_tree(graph=None):
    """Build a spanning tree of a fully connected graph with random weights.

    Args:
        graph: Optional square matrix (or anything with a 2-D shape) that
            only determines the number of nodes. When omitted, the
            notebook-level ``E`` is used, as the zero-argument call always did.

    Returns:
        The sparse matrix returned by ``minimum_spanning_tree`` for an
        n x n matrix of independent random weights in (0, 1].
    """
    template = E if graph is None else graph
    n = np.shape(template)[0]

    # Build a fresh float matrix instead of copying the template: copying
    # would inherit its dtype and truncate every weight to 0 for integer input.
    # 1 - random() lies in (0, 1], so no edge is lost to an exact zero draw.
    weights = np.array(
        [[1.0 - random.random() for _ in range(n)] for _ in range(n)]
    ).reshape(n, n)

    # Compute the tree on the random weights and hand it back to the caller.
    return minimum_spanning_tree(weights)