In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import pandas as pd

import utils.luigi_wrapper as luigi
from utils.utils import *

# from preprocess.questions_label_extraction import QuestionsLabelExtractionTask
# from questions_model.create_predictions import QuestionsMakePredictionsTask
# from preprocess.train_test_split import TrainTestSplitTask
# from preprocess.data_tokenization import DataTokenizationTask
# from preprocess.feature_selection import FeatureSelectionTask
from preprocess.create_dataset import CreateDataSetTask
from preprocess.dataset import DataSet


from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, f1_score

from IPython.display import display

In [2]:
from scipy.sparse.csgraph import minimum_spanning_tree
import networkx as nx

In [3]:
pd.set_option('max_rows', 70)

In [4]:
def get_prior(y):
    num_of_pages = y.shape[0]
    num_of_classes = y.shape[1]

    p_ij = np.zeros((num_of_classes, num_of_classes))
    for lst in y:
        for i in range(num_of_classes):
            for j in range(num_of_classes):
                p_ij[i, j] += (lst[i] * lst[j]) / num_of_pages
    return p_ij

def get_mutual_information(p_ij, nan=0, neginf=-5):
    E = np.zeros(p_ij.shape)
    for i in range(p_ij.shape[0]):
        for j in range(p_ij.shape[0]):
            i1_j1 = p_ij[i,j] * np.log(p_ij[i,j] / (p_ij[i, i] * p_ij[j, j]))
            i1_j0 = (p_ij[i,i] - p_ij[i,j]) * np.log((p_ij[i,i] - p_ij[i,j]) / (p_ij[i,i] * (1 - p_ij[j,j])))
            i0_j1 = (p_ij[j,j] - p_ij[i,j]) * np.log((p_ij[j,j] - p_ij[i,j]) / (p_ij[j,j] * (1 - p_ij[i,i])))
            i0_j0 = (1 - p_ij[j,j] - p_ij[i,i] + p_ij[i,j]) * np.log((1 - p_ij[j,j] - p_ij[i,i] + p_ij[i,j]) / (1 - p_ij[j,j]) * (1 - p_ij[i,i]))
            E[i, j] = i1_j1 + i1_j0 + i0_j1 + i0_j0
    E_filled = - np.nan_to_num(E, nan=nan, neginf=neginf)
    return E_filled

def theta(p_ij, i, j, i_val, j_val):
    if i_val == 1 and j_val == 1:
        return p_ij[i,j] / (p_ij[i,i] * p_ij[j,j])
    elif i_val == 1 and j_val == 0:
        return (p_ij[i,i] - p_ij[i,j]) / (p_ij[i,i] * (1 - p_ij[j,j]))
    elif i_val == 0 and j_val == 1:
        return (p_ij[j,j] - p_ij[i,j]) / (p_ij[j,j] * (1 - p_ij[i,i]))
    elif i_val == 0 and j_val == 0:
        return (1 - p_ij[j,j] - p_ij[i,i] + p_ij[i,j]) / ((1 - p_ij[j,j]) * (1 - p_ij[i,i]))
    
def get_neg_conn_strength(p_ij):
    conn_strength = np.zeros(p_ij.shape)
    for i in range(p_ij.shape[0]):
        for j in range(p_ij.shape[0]):
            conn_strength[i, j] = (theta(p_ij, i, j, 1, 0) + theta(p_ij, i, j, 0, 1)) / 2
    return conn_strength

In [5]:
def get_neg_mutual_information(p_ij, nan=0, neginf=-5):
    E = np.zeros(p_ij.shape)
    for i in range(p_ij.shape[0]):
        for j in range(p_ij.shape[0]):
            i1_j0 = (p_ij[i,i] - p_ij[i,j]) * np.log((p_ij[i,i] - p_ij[i,j]) / (p_ij[i,i] * (1 - p_ij[j,j])))
            i0_j1 = (p_ij[j,j] - p_ij[i,j]) * np.log((p_ij[j,j] - p_ij[i,j]) / (p_ij[j,j] * (1 - p_ij[i,i])))
            E[i, j] = i1_j0 + i0_j1
    E_filled = - np.nan_to_num(E, nan=nan, neginf=neginf)
    return E_filled

In [6]:
def get_y_by_thresh(y_proba, th= 0.5):
    return (y_proba > th).astype(int)

def get_thresh(y_true, y_pred, n_thers = 500, plot=False):
    thresholds = [n / n_thers for n in list(range(1, n_thers, 1))]
    f1_scores = [f1_score(y_true, get_y_by_thresh(y_pred, thresh), average = 'macro') for thresh in thresholds]
    
    if plot:
        plt.plot(thresholds, f1_scores)
        plt.show()
    
    return thresholds[np.argmax(f1_scores)]

def normalize(y_proba, th= 0.5):
    y_proba_fixed = np.zeros(y_proba.shape)
    y_proba_fixed[y_proba<th] = (y_proba[y_proba<th] / th) * 0.5
    y_proba_fixed[y_proba>=th] = ((y_proba[y_proba>=th] - th) / (1-th)) * 0.5 + 0.5
    return y_proba_fixed

# def sigmoid_normalize(y_proba, th= 0.5):
#     y_proba_fixed = np.zeros(y_proba.shape)
#     y_proba_fixed = 1 / (1 + np.exp())
#     return y_proba_fixed

def get_y(y_test, y_proba, do_normalize=False):
    y = []
    y_proba_fixed = []
    for c in range(y_proba.shape[1]):
        y_proba_c = y_proba[:, c]
        y_test_c = y_test[:, c]
        th = get_thresh(y_test_c, y_proba_c)
        y.append(get_y_by_thresh(y_proba_c, th))
        y_proba_fixed.append(normalize(y_proba_c, th))
    
    if do_normalize:
        return np.array(y).T, np.array(y_proba_fixed).T
    else:
        return np.array(y).T, y_proba    

## Read Data

In [7]:
data: DataSet = read_data(get_file_path('dataset.pickle', 'data'))
X = data.X
y = data.y
X_train, X_test = data.X_train, data.X_test
y_train, y_test = data.y_train, data.y_test



AttributeError: 'NoneType' object has no attribute 'X'

## Get Prior

In [None]:
p_ij = get_prior(y_train)

## Train Model

In [None]:
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier
model = OneVsRestClassifier(LGBMClassifier(class_weight = 'balanced'), n_jobs=-2)
#model.fit(X_train, y_train)

In [None]:
#pickle.dump(model, open('temp_model.pickle', 'wb'))

In [None]:
model = pickle.load(open('temp_model.pickle', 'rb'))

In [None]:
y_pred_proba = model.predict_proba(X_test)

In [None]:
#print(classification_report(y_test, np.round(y_pred_proba)))

## Priored Predictions

In [None]:
global messages_dict
messages_dict = {}

In [None]:
def message(p_ij, T, prediction, i, parent, parent_val):
    N = []
    #print(f'child: {i}, parent: {parent}')
    for j in range(len(prediction)):
        if (T[i,j] != 0 or T[j,i] != 0) and j != parent and j != i:
            N.append(j)
    if f'{i}_{parent}_{parent_val}' in messages_dict.keys():
        m1 = messages_dict[f'{i}_{parent}_{parent_val}'][1]
        m0 = messages_dict[f'{i}_{parent}_{parent_val}'][0]
    else:
        #print('calculate')
        if len(N) > 0 :  
            m1 = theta(p_ij, i, parent, 1, parent_val) * prediction[i] * np.prod([message(p_ij, T, prediction, n, i, 1) for n in N])
            #print('theta1: ' + str(theta(p_ij, i, parent, 1, parent_val)))
            m0 = theta(p_ij, i, parent, 0, parent_val) * (1 - prediction[i]) * np.prod([message(p_ij, T, prediction, n, i, 0) for n in N])
            #print('theta0: ' + str(theta(p_ij, i, parent, 0, parent_val)))

        else:
            m1 = theta(p_ij, i, parent, 1, parent_val) * prediction[i]
            m0 = theta(p_ij, i, parent, 0, parent_val) * (1 - prediction[i])
        
        messages_dict[f'{i}_{parent}_{parent_val}'] = (m0, m1)
    #  m1 + m0 - ?
    return m1 + m0     

In [None]:
def run_prior_on_prediction(p_ij, T, prediction):
    
    priored_prediction = []
    global messages_dict
    messages_dict = {}
    
    for i in range(len(prediction)):
        p_i_new = 0
        N = []
        for j in range(len(prediction)):
            if T[i,j] != 0 or T[j,i] != 0:
                N.append(j)
        
        messages1 = [message(p_ij, T, prediction, n, i, 1) for n in N]
        messages0 = [message(p_ij, T, prediction, n, i, 0) for n in N]
            
        p_i1 = prediction[i] * np.prod(messages1) #* p_ij[i,i]
        p_i0 = (1 - prediction[i]) * np.prod(messages0) * (1- p_ij[i,i])
            
        p_i_new += p_i1 / (p_i1 + p_i0)
            

        priored_prediction.append(p_i_new)
    return priored_prediction

### y_pred w/o prior

In [None]:
y_pred, y_pred_proba = get_y(y_test, y_pred_proba)

### y_pred with pos prior

In [None]:
E = get_mutual_information(p_ij)
T = minimum_spanning_tree(E)

In [None]:
#y_pred_after_prior_proba_pos = np.array([run_prior_on_prediction(p_ij, T, p) for p in y_pred_proba])

In [None]:
#pickle.dump(y_pred_after_prior_proba_pos, open('y_pred_after_prior_proba_pos.pickle', 'wb'))

In [None]:
y_pred_after_prior_proba_pos = pickle.load(open('y_pred_after_prior_proba_pos.pickle', 'rb'))

In [None]:
y_pred_after_prior, y_pred_after_prior_proba_pos = get_y(y_test, y_pred_after_prior_proba_pos)

### y_pred with  neg prior

In [None]:
E_neg = get_neg_mutual_information(p_ij)
T_neg = minimum_spanning_tree(E_neg)

In [None]:
#y_pred_after_prior_proba_neg = np.array([run_prior_on_prediction(p_ij, T_neg, p) for p in y_pred_proba])

In [None]:
#pickle.dump(y_pred_after_prior_proba_neg, open('y_pred_after_prior_proba_neg.pickle', 'wb'))

In [None]:
y_pred_after_prior_proba_neg = pickle.load(open('y_pred_after_prior_proba_neg.pickle', 'rb'))

In [None]:
y_pred_after_prior_neg, y_pred_after_prior_neg_proba = get_y(y_test, y_pred_after_prior_proba_neg)

## Neg After Pos

In [None]:
#y_pred_after_prior_proba_neg_after_pos = np.array([run_prior_on_prediction(p_ij, T_neg, p) for p in y_pred_after_prior_proba_pos])

In [None]:
#pickle.dump(y_pred_after_prior_proba_neg_after_pos, open('y_pred_after_prior_proba_neg_after_pos.pickle', 'wb'))

In [None]:
y_pred_after_prior_proba_neg_after_pos = pickle.load(open('y_pred_after_prior_proba_neg_after_pos.pickle', 'rb'))

In [None]:
y_pred_after_prior_neg_after_pos, y_pred_after_prior_proba_neg_after_pos = get_y(y_test, y_pred_after_prior_proba_neg_after_pos)

## Pos After Neg 

In [None]:
#y_pred_after_prior_proba_pos_after_neg = np.array([run_prior_on_prediction(p_ij, T, p) for p in y_pred_after_prior_proba_neg])

In [None]:
#pickle.dump(y_pred_after_prior_proba_pos_after_neg, open('y_pred_after_prior_proba_pos_after_neg.pickle', 'wb'))

In [None]:
y_pred_after_prior_proba_pos_after_neg = pickle.load(open('y_pred_after_prior_proba_pos_after_neg.pickle', 'rb'))

In [None]:
y_pred_after_prior_pos_after_neg, y_pred_after_prior_proba_pos_after_neg = get_y(y_test, y_pred_after_prior_proba_pos_after_neg)

### normalize pos

In [None]:
y_pred_after_prior, y_pred_norm_proba_pos = get_y(y_test, y_pred_after_prior_proba_pos, do_normalize=True)

In [None]:
#y_pred_norm_proba_neg_after_pos = np.array([run_prior_on_prediction(p_ij, T_neg, p) for p in y_pred_norm_proba_pos])

In [None]:
#pickle.dump(y_pred_norm_proba_neg_after_pos, open('y_pred_norm_proba_neg_after_pos.pickle', 'wb'))

In [None]:
y_pred_norm_proba_neg_after_pos = pickle.load(open('y_pred_norm_proba_neg_after_pos.pickle', 'rb'))

In [None]:
y_pred_norm_neg_after_pos, y_pred_norm_proba_neg_after_pos = get_y(y_test, y_pred_norm_proba_neg_after_pos)

### normalize neg

In [None]:
y_pred_after_prior_neg, y_pred_norm_proba_neg = get_y(y_test, y_pred_after_prior_proba_neg, do_normalize=True)

In [None]:
#y_pred_norm_proba_pos_after_neg = np.array([run_prior_on_prediction(p_ij, T_neg, p) for p in y_pred_norm_proba_neg])

In [None]:
#pickle.dump(y_pred_norm_proba_pos_after_neg, open('y_pred_norm_proba_pos_after_neg.pickle', 'wb'))

In [None]:
y_pred_norm_proba_pos_after_neg = pickle.load(open('y_pred_norm_proba_pos_after_neg.pickle', 'rb'))

In [None]:
y_pred_norm_pos_after_neg, y_pred_norm_proba_pos_after_neg = get_y(y_test, y_pred_norm_proba_pos_after_neg)

## Scores Comparisons

In [None]:
print(f1_score(y_test, y_pred, average = 'macro'))
print(f1_score(y_test, y_pred_after_prior, average = 'macro'))
print(f1_score(y_test, y_pred_after_prior_neg, average = 'macro'))
print(f1_score(y_test, y_pred_after_prior_neg_after_pos, average = 'macro'))
print(f1_score(y_test, y_pred_after_prior_pos_after_neg, average = 'macro'))
print(f1_score(y_test, y_pred_norm_neg_after_pos, average = 'macro'))
print(f1_score(y_test, y_pred_norm_pos_after_neg, average = 'macro'))

In [None]:
print(roc_auc_score(y_test, y_pred_proba, average = 'macro'))
print(roc_auc_score(y_test, y_pred_after_prior_proba_pos, average = 'macro'))
print(roc_auc_score(y_test, y_pred_after_prior_neg_proba, average = 'macro'))
print(roc_auc_score(y_test, y_pred_after_prior_proba_neg_after_pos, average = 'macro'))
print(roc_auc_score(y_test, y_pred_after_prior_proba_pos_after_neg, average = 'macro'))
print(roc_auc_score(y_test, y_pred_norm_proba_neg_after_pos, average = 'macro'))
print(roc_auc_score(y_test, y_pred_norm_proba_pos_after_neg, average = 'macro'))

## Research probas distributions

In [None]:
def normalize_func(y_proba, th):
    y_proba_fixed = np.zeros(y_proba.shape)
    y_proba_fixed[y_proba<th] = ((y_proba[y_proba<th] / th)) * 0.5
    y_proba_fixed[y_proba>=th] = (((y_proba[y_proba>=th] - th) / (1-th)) * 0.5 + 0.5)
    return y_proba_fixed

In [None]:
def try_normalization(y_test, y_proba, normalize_func=None):

    y = []
    y_proba_fixed = []
    threshs = []
    for c in range(y_proba.shape[1]):
        y_proba_c = y_proba[:, c]
        y_test_c = y_test[:, c]
        th = get_thresh(y_test_c, y_proba_c)
        threshs.append(th)
        y.append(get_y_by_thresh(y_proba_c, th))
        y_proba_fixed.append(normalize_func(y_proba_c, th))
    
    for i in range(59):
        print(i)
        #plt.plot(sorted(y_pred_proba[:, i]), label='no prior')
        plt.plot(sorted(y_pred_after_prior_proba_pos[:, i]), label='pos prior')
        #plt.plot(sorted(y_pred_after_prior_neg_proba[:, i]), label='neg prior')
        #plt.plot(sorted(y_pred_norm_proba_neg_after_pos[:, i]), label='neg_after_pos_normed')
        plt.plot(sorted(np.array(y_proba_fixed).T[:, i]), label='pos_normed')
        plt.plot([0, 1200], [threshs[i],threshs[i]])
        plt.legend()
        plt.show()
    
    return np.array(y).T, np.array(y_proba_fixed).T, threshs

In [None]:
2, 10, 11, 12, 18, 19, 23, 27, 28, 32, 44, 53

In [None]:
x = try_normalization(y_test, y_pred_after_prior_proba_pos, normalize_func)

In [None]:
plt.hist(x[2])

## Other Visualizations and Statistics

In [None]:
#colors_dict

In [None]:
#len(pos_index), len(neg_index), len(set(pos_index).intersection(set(neg_index)))

In [None]:
#pos_index

In [None]:
#neg_index

In [None]:
def get_classification_df(y_test, y_pred):
    
    df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True, target_names=data._y_cols)).T.reset_index()
    return df

global colors_dict
colors_dict = {'red':0, 'green':0, 'blue':0}

global y_index
y_index = []

def highlight_greater(row):
    global colors_dict

    if row['f1-score'] > row['f1-score_prior']:
        color = 'red'
    elif row['f1-score'] < row['f1-score_prior']:
        color = 'green'
    else:
        color= 'blue'
        
    if not 'avg' in row['index']:
        colors_dict[color] += 1
        if color == 'red':
            y_index.append(row['index'])
        
    background = ['background-color: {}'.format(color) for _ in row]
    return background



df = get_classification_df(y_test, y_pred).join(get_classification_df(y_test, y_pred_after_prior), rsuffix = '_prior')
df.style.apply(highlight_greater, axis=1)


17 red
11 blue
31 green

In [None]:
get_classification_df(y_pred_after_prior, y_pred_norm_neg_after_pos)

In [None]:
for c in range(y.shape[1]):
    plt.scatter(y_pred_proba[:, c], y_pred_after_prior_proba[:, c])
plt.show()

In [None]:
for c in range(y.shape[1]):
    plt.plot(y_pred_proba[:, c], '.')
    plt.plot(y_pred_after_prior_proba[:, c], '.')
    plt.show()

In [None]:
label_count = len(y.columns)

In [None]:
# T = minimum_spanning_tree(get_mutual_information(p_ij)).todense()
# G = nx.Graph(T, nodes=y.columns)
# G = nx.relabel_nodes(g, {i: c for i,c in zip(range(y.shape[1]), y.columns)})
# nx.draw_networkx(G, cmap=plt.get_cmap('jet'), with_labels=True,font_size=16, pos=nx.fruchterman_reingold_layout(G))
# fig = plt.gcf()
# fig.set_size_inches(18.5, 10.5)

In [None]:
plt.imshow(p_ij)
plt.colorbar()

In [None]:
plt.imshow(np.array((T != 0) * 1.) * p_ij)
plt.colorbar()

In [None]:
E = get_mutual_information(p_ij)
plt.imshow(E)
plt.colorbar()

In [None]:
plt.imshow(np.array((T != 0) * 1.) * E)
plt.colorbar()