In [1]:
import os
import numpy as np
import pandas as pd
from numpy.linalg import inv
from numpy.matlib import repmat

In [2]:
import sys
sys.path.append('../../Python/')
from Preprocessing import standardize
from Evaluation import split_train_test_ma, train_model, eval_model
from Databases import get_databases_path, get_database

In [3]:
databases_path = '../../Databases/Sinteticas'
paths = get_databases_path(databases_path)

In [4]:
# Obtain the attributes and labels
attr_df = get_database(paths[1], 'database_attr.csv')
labels_df = get_database(paths[1], 'database_labels.csv')

In [5]:
# Preprocessing the attributes
standardized_attr_df = standardize(attr_df)

In [6]:
# Split in Training and Test Sets
X_train, X_test, y_train, y_test = split_train_test_ma(standardized_attr_df, labels_df)

In [7]:
R = y_train.columns.size
Ntr = y_train.shape[0]

In [8]:
def sigmoid(a):
    return 1./(1 + np.exp(-a))

def training_LFCmodel(X, Y):
    X.loc[:,'b'] = np.zeros((X.shape[0],1))
    R = Y.shape[1] # Number of annotators.
    D = X.shape[1] # Input space dimension
    N = X.shape[0] # Number of samples.
    
    # Parameters initialization
    loglik1 = 10000000
    difloglik = 1
    difloglik1 = -1
    mui = np.array(Y.mean(axis=1)).reshape(Y.shape[0],1)

    w = np.dot(inv(np.dot(X.T,X) + 1e-20*np.eye(D)), np.dot(X.T,mui))
    i = 0
    
    while abs(difloglik) > 0.2 and i<100: 
        # M-step
        # We initialize the optimization parameters w.
        pi = sigmoid(np.dot(X,w))   
        gr = np.dot(X.T, (mui - pi))

        diag_aux = np.multiply(pi,(1-pi))
        diag = diag_aux*np.identity(len(diag_aux))

        H  = np.dot(np.dot(-X.T,diag),X)
        w  = w - np.dot(inv(H + 1e-20*np.eye(D)),gr)

        alpha = np.multiply(repmat(mui,1,R),Y).sum()/mui.sum()
        beta =  np.multiply(repmat((1-mui),1,R),(1-Y)).sum()/(1-mui).sum()

        # E-step
        pi = sigmoid(np.dot(X,w))  
        ai = np.multiply(np.power(repmat(alpha,N,1),Y),np.power(repmat((1-alpha),N,1),(1-Y)))
        ai = np.array(np.prod(ai, axis=1)).reshape(ai.shape[0],1)

        bi = np.multiply(np.power(repmat(beta,N,1),(1-Y)), np.power(repmat((1-beta),N,1),Y))
        bi = np.array(np.prod(bi, axis=1)).reshape(ai.shape[0],1)

        mui = np.multiply(ai, pi)/(np.multiply(ai,pi) + np.multiply(bi,(1 - pi)))

        loglik2 = -(np.log(np.multiply(ai, pi) + np.multiply((1-pi),bi)).sum())
        difloglik = abs(loglik2 - loglik1)
        difloglik1 = (loglik2 - loglik1)
        loglik1 = loglik2
        print('Verosimilitud:', abs(difloglik))

        i+=1    
    return w, alpha, beta 

In [9]:
w, alpha, beta = training_LFCmodel(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Verosimilitud: 9998638.115005765
Verosimilitud: 13.628329134145588
Verosimilitud: 31.104994377176354
Verosimilitud: 56.159227338884875
Verosimilitud: 52.320428014990966
Verosimilitud: 30.32196352383835
Verosimilitud: 16.66017736551339
Verosimilitud: 8.502573441662662
Verosimilitud: 5.01178724309375
Verosimilitud: 3.076003630393643
Verosimilitud: 2.1168256440464575
Verosimilitud: 1.845699420199253
Verosimilitud: 1.8244322915011253
Verosimilitud: 1.8893136480705834
Verosimilitud: 2.3496160699342
Verosimilitud: 2.5289299210394347
Verosimilitud: 2.622556209738832
Verosimilitud: 2.2653312756456216
Verosimilitud: 0.7808425710099982
Verosimilitud: 0.3021274681407249
Verosimilitud: 0.12589109695250045


In [10]:
X_test.loc[:,'b'] = 0.0
predtest = sigmoid(np.dot(X_test,w))
y_pred = [True if item >= 0.5 else False for item in predtest]

In [11]:
eval_model(y_test, y_pred)

Global Precision:
0.9298245614035088


General Report:
              precision    recall  f1-score   support

         yes       0.94      0.94      0.94       108
          no       0.90      0.90      0.90        63

    accuracy                           0.93       171
   macro avg       0.92      0.92      0.92       171
weighted avg       0.93      0.93      0.93       171



Confusion Matrix: 
     0   1
0  102   6
1    6  57
