In [1]:
import os
import pandas as pd
from autograd import grad
import autograd.numpy as np
import numpy as np1
import numpy.matlib
from numpy.linalg import inv
from numpy import linalg as LA
from numpy.matlib import repmat

In [2]:
import sys
sys.path.append('../../Python/')
from Preprocessing import standardize
from Evaluation import split_train_test_ma, train_model, test_model
from Databases import get_databases_path, get_database

In [3]:
databases_path = '../../Databases/Sinteticas'
paths = get_databases_path(databases_path)

In [4]:
# Obtain the attributes and labels
attr_df = get_database(paths[0], 'database_attr.csv')
labels_df = get_database(paths[0], 'database_labels.csv')

In [5]:
# Preprocessing the attributes
standardized_attr_df = standardize(attr_df)

In [6]:
# Split in Training and Test Sets
X_train, X_test, y_train, y_test = split_train_test_ma(standardized_attr_df, labels_df)

In [7]:
def sigmoid(a):
    return 1./(1 + np.exp(-a))

def grad_cost_function(w, X, Y, mui):
    R = Y.shape[1] # Number of annotators.
    D = X.shape[1] # Input space dimension
    N = X.shape[0] # Number of samples.
    
    sizew1 = D
    sizew2 = D*R;
    sizeW = sizew1 + sizew2
    
    W1 = w[0,0:sizew1].reshape(sizew1,1)
    W2 = w[0,sizew1:sizeW].reshape(D,R)
    pi = sigmoid(np.dot(X, W1))
    nt = sigmoid(np.dot(X,W2))
    
    Gr1 = np.dot(X.transpose(), (pi - mui))
    Gr2 = np.zeros((D, R))
    
    for j in range(R):
        aux = np.multiply(np.array(Y[:,j]).reshape(nt.shape[0],1), (2*mui-1)) + (1- mui) - np.array(nt[:,j]).reshape(nt.shape[0],1) 
        aux2 = np.sum((np.multiply(np1.matlib.repmat(aux, 1, D), X)), axis=0)
        Gr2[:,j] = -aux2
        g = np.concatenate((Gr1.flatten().reshape(1, sizew1), Gr2.flatten().reshape(1, sizew2)), axis=None)
    return g

           
def optimizer(X, Y, w_0, mui):
    params = w_0
    epsilon = 0.001
    normGrad = 300
    i=0
    while normGrad > 0.1 and i<300:
        gr = grad_cost_function(params, X, Y, mui)
        params = params - epsilon * gr
        normGrad = LA.norm(gr)
        i+=1
    return params, normGrad
    
    
def traing_mae(X, Y):
    X.loc[:,'b'] = np.zeros((X.shape[0],1))
    X = X.as_matrix()
    Y = Y.as_matrix()
    R = Y.shape[1] # Number of annotators.
    D = X.shape[1] # Input space dimension
    N = X.shape[0] # Number of samples.
    
    sizew1 = D
    sizew2 = D*R;
    sizeW = sizew1 + sizew2
    
    w = np.random.randn(1, sizeW)
    mui = np.array(Y.mean(axis=1)).reshape(Y.shape[0],1)
    
    w = repmat(np.dot(inv(np.dot(X.T,X) + 1e-20*np.eye(D)), np.dot(X.T,mui)),1,R+1)
    w = w.flatten().reshape(1, sizeW)
    
    w_0 = w
    loglik1 = 10000000
    difloglik = 100
    difloglik1 = -1
    i = 0
    
    while abs(difloglik) > 0.2 and i<100:
        # M-step     
        w, Ngr = optimizer(X, Y, w_0, mui)
        W1 = w[0,0:sizew1].reshape(sizew1,1)
        W2 = w[0,sizew1:sizeW].reshape(D,R)

        #E-step
        pi = sigmoid(np.dot(X, W1))

        aux = sigmoid(np.dot(X,W2))

        ai = np.multiply(np.power((1-aux),np.absolute(Y-1)),np.power(aux, (1-np.absolute(Y-1))))
        ai = np.array(np.prod(ai, axis=1)).reshape(ai.shape[0],1) 

        bi = np.multiply(np.power((1-aux),np.absolute(Y)),np.power(aux, (1-np.absolute(Y))))
        bi = np.array(np.prod(bi, axis=1)).reshape(bi.shape[0],1) 

        nummui = np.multiply(ai, pi)
        denmui = (np.multiply(ai, pi) + np.multiply(bi, (1-pi))) 
        mui =  np.divide(nummui, denmui)

        aux_0 = 1-mui 
        aux_1 = 1-pi 
        aux_a = np.multiply(mui, np.log(np.multiply(pi, ai))) 
        aux_b = np.multiply(aux_0, np.log(np.multiply(aux_1, bi))) 

        loglik2 = -(aux_a + aux_b).sum()
        difloglik = (loglik2 - loglik1);
        loglik1 = loglik2
        
        i+=1
        w_0 = w
    return w[0,0:sizew1].reshape(sizew1,1) 

In [8]:
X_test.loc[:,'b'] = np.ones((X_test.shape[0],1))
W1 = traing_mae(X_train, y_train)
X_test = X_test.as_matrix()
pred = sigmoid(np.dot(X_test, W1))
y_pred = [True if item >= 0.5 else False for item in pred]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print("Global Precision:")
print(accuracy_score(y_test, y_pred))
print("\n")

print("General Report:")
print(classification_report(y_test, y_pred, target_names=['yes','no']))
print("\n")

print("Confusion Matrix: ")
matriz_confusion = confusion_matrix(y_test, y_pred)
table = pd.DataFrame(matriz_confusion)
print(table)

Global Precision:
0.5029239766081871


General Report:
              precision    recall  f1-score   support

         yes       0.63      0.53      0.57       108
          no       0.36      0.46      0.41        63

    accuracy                           0.50       171
   macro avg       0.49      0.49      0.49       171
weighted avg       0.53      0.50      0.51       171



Confusion Matrix: 
    0   1
0  57  51
1  34  29


In [11]:
pred

array([[7.94202526e-01],
       [4.37799386e-01],
       [9.02204897e-01],
       [9.77788884e-01],
       [7.99250198e-01],
       [1.18989985e-01],
       [9.94883080e-01],
       [3.36619517e-01],
       [9.52723337e-01],
       [1.29622498e-01],
       [3.29413479e-01],
       [1.40619362e-01],
       [8.90496980e-01],
       [8.20472682e-02],
       [1.16992673e-02],
       [8.60641969e-01],
       [9.95008586e-01],
       [7.18767205e-01],
       [2.86227817e-01],
       [3.01427431e-01],
       [1.42128321e-02],
       [9.98954988e-01],
       [5.34663192e-02],
       [5.81706648e-01],
       [7.75874302e-01],
       [9.02401847e-01],
       [9.89525616e-01],
       [9.98985882e-01],
       [8.07143700e-01],
       [1.75822966e-02],
       [7.76977092e-01],
       [5.86347801e-01],
       [9.31990387e-01],
       [2.65109620e-01],
       [9.14533988e-01],
       [9.30100546e-01],
       [6.62602642e-01],
       [2.53900517e-02],
       [1.36610246e-01],
       [5.78607012e-01],
