# Learning from the crowd

In [2]:
# Modules importés

import numpy as np
import matplotlib.pyplot as plt
from tools import *
from scipy.optimize import minimize

In [3]:
#bloc génération de données:
def gen_arti(centerx=1,centery=1,sigma=0.1,nbex=1000,data_type=0,epsilon=0.02):
    """ Generateur de donnees,
        :param centerx: centre des gaussiennes
        :param centery:
        :param sigma: des gaussiennes
        :param nbex: nombre d'exemples
        :param data_type: 0: melange 2 gaussiennes, 1: melange 4 gaussiennes, 2:echequier
        :param epsilon: bruit dans les donnees
        :return: data matrice 2d des donnnes,y etiquette des donnnees
    """
    if data_type==0:
         #melange de 2 gaussiennes
         xpos=np.random.multivariate_normal([centerx,centerx],np.diag([sigma,sigma]),int(nbex//2))
         xneg=np.random.multivariate_normal([-centerx,-centerx],np.diag([sigma,sigma]),int(nbex//2))
         data=np.vstack((xpos,xneg))
         y=np.hstack((np.ones(nbex//2),-np.ones(nbex//2)))
    if data_type==1:
        #melange de 4 gaussiennes
        xpos=np.vstack((np.random.multivariate_normal([centerx,centerx],np.diag([sigma,sigma]),int(nbex//4)),np.random.multivariate_normal([-centerx,-centerx],np.diag([sigma,sigma]),int(nbex/4))))
        xneg=np.vstack((np.random.multivariate_normal([-centerx,centerx],np.diag([sigma,sigma]),int(nbex//4)),np.random.multivariate_normal([centerx,-centerx],np.diag([sigma,sigma]),int(nbex/4))))
        data=np.vstack((xpos,xneg))
        y=np.hstack((np.ones(nbex//2),-np.ones(int(nbex//2))))

    if data_type==2:
        #echiquier
        data=np.reshape(np.random.uniform(-4,4,2*nbex),(nbex,2))
        y=np.ceil(data[:,0])+np.ceil(data[:,1])
        y=2*(y % 2)-1
    # un peu de bruit
    data[:,0]+=np.random.normal(0,epsilon,nbex)
    data[:,1]+=np.random.normal(0,epsilon,nbex)
    # on mélange les données
    idx = np.random.permutation((range(y.size)))
    data=data[idx,:]
    y=y[idx]
    return data,y


In [5]:
#Fonction de générations de données 

#Proba que l'annotateur est raison

liste_annotateur_bernouilli=[0.9,0.9,0.9]


def fonction_modification_annotateur(label,proba):
    valeur_proba=np.random.uniform(0,1)
    label_res=label
    if(valeur_proba>=proba):
        label_res=-label
        
    return label_res
        
        

def generation_bernouilli(nombre_exemple,datatype):
    xtrain,ytrain = gen_arti(nbex=nombre_exemple,data_type=0,epsilon=0.2)
    #changement des labels 
    y_annote=np.zeros((nombre_exemple,len(liste_annotateur_bernouilli)))
    
    for i in range(len(liste_annotateur_bernouilli)):
        fonction_annotation=lambda x:fonction_modification_annotateur(x,liste_annotateur_bernouilli[i])
        fonction_annotation=np.vectorize(fonction_annotation)
        y_annote[:,i]=fonction_annotation(ytrain)
    return xtrain,y_annote,ytrain

        
xtrain,y_annote,ytrain=generation_bernouilli(4,0)
print(y_annote)
print(ytrain)




[[-1.  1.  1.]
 [-1. -1. -1.]
 [-1.  1.  1.]
 [ 1.  1.  1.]]
[-1. -1.  1.  1.]


In [5]:
class LearnCrowd:
    def __init__(self, T, N, d):
        self.alpha = np.zeros((1,d)) # Poids des dimensions
        self.beta = 0
        self.w = np.zeros((d,T)) # Poids des labelleurs
        self.gamma = np.zeros((1,T))
        
    def likelihoodBernouilli(self, X, Y):
        #proba cond du label Yt du labelleur t pour la donnée i sachant le vrai label 0 ou 1 (Bernouilli)
        y_cond_z = np.zeros((N,T,2))
        rlog=lambda i,t: 1/(1+exp(-np.dot(self.w(:,t).T,X[i,:])-self.gamma[1,t]))   #indice de la donnée, indice du labelleur
        rlog=np.vectorize(rlog) #rlog c'est neta
        for (t in range(T)):
            y_cond_z[:,t,0] = pow(1-rlog(:,t),np.abs(Y[:,t]))*pow(rlog(:,t),1-np.abs(Y[:,t]))
            y_cond_z[:,t,1] = pow(1-rlog(:,t),np.abs(Y[:,t]-1))*pow(rlog(:,t),1-np.abs(Y[:,t]-1))
        #hyp de base que l'on pourra prendre pour simplifier neta[i,t]=rlog(i,t)=neta[t]
        #cet hyp revient à donner une proba constante de se tromper pour le labelleur t quelque soit la donnée
        #il faudrait alors rajouter un self.neta=np.zeros(1,T) au init pour le modèle de Bernouilli
        
        #proba que le label soit 0 ou 1 sachant la donnée (Rlog)
        z_cond_x = np.zeros((N,2))
        sigm = lambda x:  1/(1+exp(-x))
        sigm=np.vectorize(sigm)
        z_cond_x[:,0] = sigm(np.dot(self.alpha.T,X)-self.beta)
        z_cond_x[:,1] = 1-z_cond_x[:,0]
        
        #z_cond_x(i,1) = 1/(1+exp(-np.dot(self.alpha.T,X(i,:))-self.beta))
        #z_cond_x(i,2) = 1-z_cond_x(i,1)
        
        return np.multiply(np.prod(y_cond_z,axis=1),z_cond_x)
        
    
    def likelihoodGaussian(self, X, Y):
        #proba cond du label Yt du labelleur t pour la donnée i sachant le vrai label 0 ou 1 (Bernouilli)
        y_cond_z_cond_x = np.zeros((N,T,2))
        rlog=lambda i,t: 1/(1+exp(-np.dot(self.w(:,t).T,X[i,:])-self.gamma[1,t]))   #indice de la donnée, indice du labelleur
        rlog=np.vectorize(rlog)
        norm=lambda x,mu,sigma:1/(sqrt(2*np.pi)*sigma)*exp(-pow((x-mu),2)/pow(sigma,2))
        norm=np.vectorize(norm)
        for (t in range(T)):
            y_cond_z_cond_x[:,t,0] = norm(Y[:,t],0,rlog(:,t))
            y_cond_z_cond_x[:,t,1] = norm(Y[:,t],1,rlog(:,t))
        
        #proba que le label soit 0 ou 1 sachant la donnée (Rlog)
        z_cond_x = np.zeros((N,2))
        sigm = lambda x:  1/(1+exp(-x))
        sigm=np.vectorize(sigm)
        z_cond_x[:,0] = sigm(np.dot(self.alpha.T,X)-self.beta)
        z_cond_x[:,1] = 1-z_cond_x[:,0]
        
        return np.multiply(np.prod(y_cond_z_cond_x,axis=1),z_cond_x)
        
    def fit(self, X, Y, model=likelihoodBernouilli,  eps = 10**(-6)):
          
    def sigma(self, X):
        # A FAIRE
        return 0
    def eta(self, X):
        # A FAIRE
        return 0
    
    def likelihoodBernoulli(self, X, Y, gamma, w):
        P = np.zeros((X.shape[0],Y.shape[1]))
        return P      

    def likelihoodGaussian(self, X, Y, gamma, w):
        P = np.zeros((X.shape[0],Y.shape[1]))
        return P
        
    def Pz(self, z, X, alpha, beta):
        res = 1/(1+np.exp(-alpha.dot(X.T)-beta))
        if z == 1:
            return res
        else:
            return 1-res
        
    def Ptilde(self, X, Y, model, alpha, beta, gamma, w):
        Pt = np.zeros((X.shape[0],1))
        py = self.model(X,Y, gamma, w) # Taille N,T
        pz = self.Pz(1, X, alpha, beta) # Taille : N,1
        return np.multiplty(np.prod(py,axis=1),pz) # Taille : N,1
    
    def likelihood(self, X, Y, model, alpha, beta, gamma, w):
        Pt = self.Ptilde(X, Y, model, alpha, beta, gamma, w)
        return Pt.T.dot(np.log(Pt))
    
    def grad_likelihood(self, X, Y, model, alpha, beta, gamma, w):
        """Returns the partial derivatives of likelihood according to
        alpha, beta, gamma and w"""
        tmp_exp = np.exp(-X.dot(alpha.T)-beta)
        deltaPt = self.Pz(1,X)-self.Pz(0,X)
        grad_lh_alpha = np.sum(deltaPt*np.multiplty(np.multiplty(X,tmp_exp),1/(1+tmp_exp)**2))
        grad_lh_beta = np.sum(deltaPt*np.multiplty(tmp_exp,1/(1+tmp_exp)**2))
        tmp_exp = np.exp(-X.dot(w)-gamma) # Taille : N,T
        grad_etasigma_gamma = tmp_exp/(1+tmp_exp)**2 # Taille : N,T
        grad_etasigma_w = np.multiplty(X,tmp_exp)/(1+tmp_exp)**2 # Taille : N,T
        if ("Bernoulli" in model):
            grad_lh_eta = (-1)**Y *(-deltaPt) # Taille : N,T
            grad_lh_gamma = np.sum(np.multiply(grad_lh_eta,grad_etasigma_gamma)) 
            grad_lh_w = np.sum(np.multiply(grad_lh_eta, grad_etasigma,w))
        elif ("Gaussian" in model):
            s = self.sigma(X)
            grad_lh_sigma = (Y**2-self.Pz(1,X)*(2*Y-1))/s**3 - 1/s # Taille : N,T
            grad_lh_gamma = np.sum(np.multiply(grad_lh_sigma,grad_etasigma_gamma)) 
            grad_lh_w = np.sum(np.multiply(grad_lh_sigma, grad_etasigma,w))
        return np.array([[-grad_lh_alpha, -grad_lh_beta, -grad_lh_gamma, -grad_lh_w]])
    
    def BFGS_func(self, X, Y, model, alpha, beta, gamma, w):
        return [self.likelihood(X, Y, model, alpha, beta, gamma, w), \
                self.grad_likelihood(X, Y, model, alpha, beta, gamma, w)]
    
    def fit(self, X, Y, model=likelihoodBernoulli, eps = 10**(-6)):
        self.alpha = np.zeros((1,d))
        self.beta = 0
        alphaNew = np.ones((1,d))
        betaNew = 1
        wNew = np.random.rand(d,T)
        gammaNew = np.random.rand(1,T)
        while (np.linalg.norm(self.alpha-alphaNew)**2 + (self.beta-betaNew)**2 >= eps):
            tmpAlpha = alphaNew
            tmpBeta = betaNew
            tmpGamma = gammaNew
            tmpW = wNew
            # Expectation (E-step)
            Pt = self.Ptilde(X, Y, Z, model)
            # Maximization
            lh = - self.likelihood(X, Y, model, alpha, beta, gamma, w)
            BFGSfunc = lambda alpha,beta,gamma,w : likelihood(self, X, Y, model, alpha, beta, gamma, w)
            BFGSJac = lambda alpha,beta,gamma,w : grad_likelihood(self, X, Y, model, alpha, beta, gamma, w)
            result = minimize(BFGSfunc, method='BFGS', jac = BFGSJac, \
                              options={'gtol': 1e-6, 'disp': True, 'maxiter': 1000})
            print(result.message)
            print("Optimal solution :")
            print(result.x)
            # Updating new vectors :
            alphaNew = result.x[0]
            betaNew = result.x[1]
            gammaNew = result.x[2]
            wNew = result.x[3]
            self.alpha = tmpAlpha
            self.beta = tmpBeta
            self.gamma = gammaNew
            self.w = wNew
            
        self.alpha = alphaNew
        self.beta = betaNew
        self.w = wNew
        self.gamma = gammaNew
        
    def predict(self, X):
        
    def predictV2(self, Y):
        
    def score(self, X, Z):
        # On connaît la vérité terrain
        return np.sum(predict(X)==Z)
    
    def get_eps(self):
        
    def loss(self,data,y):
        
    def loss_g(self,data,y):


IndentationError: expected an indented block (<ipython-input-5-2db244050362>, line 82)