In [44]:
import utils
import pandas as pd
import importlib
import expert_factory
importlib.reload(expert_factory)
import numpy as np

In [45]:
def calcula_verossimilhanca(num_experts, y_true, output_experts, output_gating):
    py = np.zeros(shape=(num_experts,len(y_true)))
    for expert in range(0,num_experts):
        for index,y in enumerate(y_true):
            #calcula a diff entre o real e o que o cada expert previu
            #diff = Ytr(j,:)-Yaux(j,:);
            diff = y - output_experts[index,expert]
            #Py(j,i)=exp(-diff*diff'/(2*var(i)));   
            py[expert,index] = np.exp(np.dot(-diff, diff.T) / (2))
    # Likelihood= sum(log(sum(Yg.*Py,2)));
    #haux = Yg.*Py;
    haux = np.multiply(output_gating, py.T)
    likelihood = np.sum(np.log(np.sum(haux,axis=0)))
    return likelihood,haux       

In [46]:
def output_final(X, gating_network, all_experts):
    #Calcula saida do gating
    gating_output = gating_network.feedforward(X)[-1]
    experts_outputs = []
    #calcula saida do experts
    for exp_net in all_experts:
        experts_outputs.append(exp_net.feedforward(X)[-1].tolist())
    #retorna o melhor expert como saida
    final_output = []
    for index_gate, result in enumerate(np.argmax(gating_output,axis=1)):
        final_output.append(experts_outputs[result][index_gate])
    return final_output
        

def maximiza_gating(gating_network,max_epocas_gating, alpha_gating, X_train, h, X_val, y_val):
    gating_network.train(max_epocas_gating, alpha_gating, X_train, h, X_val, y_val)
    
def maximiza_expert(expert_network,max_epocas_expert,alpha_expert, X_train, h, X_val, y_val):
    expert_network.train(max_epocas_expert, alpha_expert, X_train, h, X_val, y_val)

In [47]:
#Lendo o dadoa
df = pd.read_csv('data/treinamento-1.txt', header=None)
num_lags = 10

#criando Lag
lagged_data = utils.create_lag(df, num_lags)
lagged_data = lagged_data.reset_index(drop=True)

X = lagged_data.drop(['y'],axis=1)
y = lagged_data['y']

#Criando conjunto de dados
fracao_dados_para_treino = 0.7
fracao_dados_para_teste = 0.2
X_train,y_train,X_test,y_test,X_val,y_val = utils.treino_teste_validacao(X,y, frac_train=fracao_dados_para_treino, frac_test=fracao_dados_para_teste)

Tamanho total 990
Tamanho treino 693
Tamanho teste 198
Tamanho validacao 99


In [48]:
#vamos comecar com experts iguais
exp_ne= X_train.shape[1]
exp_nh= 3
exp_ns= 1
num_experts = 2
all_experts = []
for _ in range(num_experts):
    exp = expert_factory.Expert(exp_ne,exp_nh,exp_ns,g_h='sigmoid',g_o='sigmoid')
    all_experts.append(exp)
    
gating_ne = X_train.shape[1]
gating_nh = 3
gating_ns = num_experts
gating_network = expert_factory.Expert(gating_ne,gating_nh,gating_ns, g_h='sigmoid', g_o='softmax')

In [61]:
likelihood = 0
old_likelihood = -np.inf
iters = 0
max_iters = 1000
max_epocas_gating = 1
alpha_gating = 0.5
while abs(likelihood-old_likelihood) > 1e-3 and iters < max_iters:
    iters += 1
    #calcula a saida para cada rede
    #A funcao retorna 4 varaiveis, queremos apenas a ultima que representa o output (por isso o [-1])
    output_gating = gating_network.feedforward(X_train)[-1]
    output_experts = np.matrix([np.ravel(expert.feedforward(X_train)[-1]).tolist() for expert in all_experts]).T
    #Agora que temos a saida comecamos com a funcao de EM
    old_likelihood = likelihood
    #Passo E (Expectation)
    #Com os parametros atuais calculamos calculamos a 'expectation' posterior para cada expert
    likelihood,haux_train = calcula_verossimilhanca(num_experts, y_train, output_experts, output_gating)
    #likelihood_val,haux_val = calcula_verossimilhanca(num_experts, y_train, output_experts, output_gating, matriz_covariancia)
    #h = haux./(sum(haux,2)*ones(1,m));
    h = np.divide(haux_train, np.sum(haux_train,axis=0))
    #Passo M (Maximizacao)
    maximiza_gating(gating_network,max_epocas_gating, alpha_gating, X_train, h, X_val, y_val)
    #Itera por cada expert o treinando com seu respectivo h
    for exp_index, expert in enumerate(all_experts):
        output_individual_exp = []
        for item in h[:,0]:
            output_individual_exp.append([item])
        maximiza_expert(expert,max_epocas_gating,alpha_gating, X_train, output_individual_exp, X_val, y_val)
    saida_final = output_final(X_train, gating_network, all_experts)
    erro = saida_final - y_train 
    loss = np.square(erro).mean()


0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726
0.43421965487081726


0.4343498269865729
0.43434917091213204
0.4343485179531086
0.43434786827731475
0.4343472219594233
0.43434657902668583
0.4343459394821304
0.4343453033163314
0.43434467051338077
0.43434404105392027
0.4343434149166825
0.4343427920792753
0.43434217251858476
0.4343472136332685
0.4343465814819963
0.4343459524318525
0.4343453264602997
0.4343458049391446
0.43434517561168845
0.4343445492576078
0.4343439258537912
0.434343305376947
0.4343426878036172
0.4343420731101915
0.4343414612729221
0.4343408522679378
0.43434024607125976
0.43433964265881647
0.4343390420064597
0.4343383241854446
0.4343377252832258
0.43433750279637795
0.4343369071564869
0.43433631417979307
0.43433572384236
0.43433513612027375
0.43433455098966084
0.4343339684267055
0.4343333884076686
0.4343336548214128
0.43433307582567615
0.43433249927768874
0.434331925154507
0.4343313534333733
0.4343305660744719
0.4343299998560839
0.434329435975658
0.43432887441122797
0.4343283151411017
0.434327758143879
0.43432720339846737
0.43432665088409994


0.43424428448420266
0.4342442686932323
0.4342442531379346
0.43424423781622823
0.4342442227260519
0.4342442078653647
0.4342441932321452
0.4342441788243917
0.4342441646401219
0.4342441506773727
0.4342441369341999
0.43424412340867796
0.43424411009890035
0.43424394960811763
0.43424393667570765
0.43424392395419104
0.43424391144172053
0.4342438991364668
0.434243887036618
0.43424387514037954
0.4342430141664302
0.43424300221645534
0.43424299046669856
0.43424297891541963
0.4342429675608951
0.4342429564014179
0.4342429454352974
0.43424293466085867
0.43424292407644316
0.4342429136804077
0.43424290347112493
0.43424289344698275
0.4342428836063847
0.43424287394774885
0.4342428644695089
0.4342428551701127
0.4342428460480232
0.4342428371017176
0.43424282832968764
0.4342428197304392
0.43424281130249204
0.43424280304438007
0.4342427949546509
0.43424278703186553
0.4342427792745988
0.43424277168143877
0.43424276425098646
0.43424275698185627
0.43424274987267564
0.43424126436942706
0.4342412579298145
0.4342

In [51]:
teste = output_final(X_train, gating_network, all_experts)

In [125]:
print(type(y_train))

<class 'numpy.ndarray'>


In [128]:
print(np.array(h[:,0]).T)


[0.01156333 0.01042883 0.00938287 0.00813877 0.00709063 0.00956672
 0.00672117 0.0080347  0.00572201 0.00948199 0.00769875 0.01884049
 0.01390172 0.02154663 0.01157301 0.01491996 0.01071675 0.01509907
 0.01033117 0.01835214 0.01107813 0.02250479 0.01534226 0.02272517
 0.01751616 0.01918316 0.01437816 0.01984835 0.01598412 0.02016663
 0.01729386 0.01934895 0.01332784 0.01959735 0.0113897  0.01996717
 0.01217667 0.0185082  0.01135169 0.01939163 0.01446376 0.02311653
 0.0156179  0.02507302 0.01956827 0.02406632 0.02158657 0.01886385
 0.02262253 0.01986753 0.02420045 0.02143355 0.01881815 0.02151411
 0.01867279 0.01819353 0.01514594 0.02153682 0.01460541 0.01976641
 0.01136766 0.01970821]


In [6]:
import numpy as np
import pandas as pd
import math
import time
import matplotlib.pyplot as plt
import itertools
import sys

#import Perceptron_Regressor
#import Utilities

import importlib
importlib.reload(Perceptron_Regressor)
# importlib.reload(Utilities)


In [119]:
def calc_likelihood(d, y_experts, y_g, covar_matrix):
    #list with the probablities of all errors from each expert to each line of input
    py_matrix = []
    #iterate over the experts
    for exp in range(y_experts[0].shape[1]):
        py_row = []
        #for each expert, calculate the individual py (sum of logs of final output - the gating output multiplied by the corresponding expert output)
        for inst, value in enumerate(d):
            #diff is the error of the final output
            diff = value-y_experts[inst,exp]
            py = np.exp(-np.dot(diff, diff.T) / np.multiply(2, covar_matrix[exp,exp]))
            py_row += [py[0,0]]
        py_matrix += [py_row]
    
    py_matrix = np.matrix(py_matrix).T
    h_aux = np.multiply(y_g, py_matrix)
    likelihood = np.sum(np.log(np.sum(h_aux, axis=1)))
    
    return likelihood, h_aux

In [None]:
#util = Utilities.Utilities()

# def calc_likelihood(d, y_experts, y_g, covar_matrix):
#     #create the matrix with the errors of each expert to each instance
#     diff = d-y_experts
    
#     #create the auxiliar matrix that computes the squared error for each expert,
#     # multiplied by the covariance matrix,
#     # Note: in this case we use the Identity for the covariance matrix
#     expert_error = np.sum(np.multiply(covar_matrix, np.dot(diff.T, diff)), axis=0)

#     # calculates py as the prior probability multiplied by the exp 
#     # of the expert_error multiplied by -0.5
#     py = np.multiply(np.exp(np.multiply(expert_error, -0.5)), y_g)

#     #calculates the final likelihood, computing the sum for each instance of the logs of py
#     ll = -np.sum(np.log(py))
    
#     return ll, py

def calc_2norm(m):
    return np.sqrt(np.sum(np.multiply(m,m)))

def maximize_gating(gating_net, X_train, h, max_it = 1e4, min_norm = 1e-5, X_valid=None, d_valid=None, use_fit=False):
    if use_fit:
        gating_net.fit(X=X_train, d=h, valid_data=X_valid, valid_d=d_valid, verbose=False)
    else:
        norm_grad = float("inf")
        it = 0

        while norm_grad > min_norm and it < max_it:
            print(it, norm_grad)
            #calculate the descent gradient for h
            djdw1, djdw2 = gating_net.calculate_gradient(X=X_train, d=h)
            #compute the right leraning rate
            learn_rate = gating_net.calculate_bisection(X=X_train, d=h, djdw1=djdw1, djdw2=djdw2)
            #learn_rate=0.1
            #update the gating network wheights
            gating_net.w1, gating_net.w2 = gating_net.update_weights(learning_rate=learn_rate, djdw1=djdw1, djdw2=djdw2,
                                                                     w1=gating_net.w1, w2=gating_net.w2)
            it+=1
            norm_grad = calc_2norm(np.append(djdw1.ravel(), djdw2.ravel(), axis=1))

def maximize_expert(expert_net, X_train, h, d_train, covar_matrix=None, max_it = 1e4, min_norm = 1e-5, X_valid=None, d_valid=None, use_fit=False):
    if use_fit:
        expert_net.fit(X=X_train, d=d_train, valid_data=X_valid, valid_d=d_valid, side_factor=h, verbose=False)
    else:
        norm_grad = float("inf")
        it = 0

        #print("expert start new")
        while norm_grad > min_norm and it < max_it:
            #calculate the descent gradient for h
            #print("expert start")
            djdw1, djdw2 = gating_net.calculate_gradient(X=X_train, d=d_train, side_factor=h)
            #print("grad")
            #compute the right leraning rate
            learn_rate = gating_net.calculate_bisection(X=X_train, d=d_train, djdw1=djdw1, djdw2=djdw2, side_factor=h)
            #learn_rate=0.1
            #print("alfa")
            #update the gating network wheights
            gating_net.w1, gating_net.w2 = gating_net.update_weights(learning_rate=learn_rate, djdw1=djdw1, djdw2=djdw2,
                                                                     w1=gating_net.w1, w2=gating_net.w2)

            it+=1
            norm_grad = calc_2norm(np.append(djdw1.ravel(), djdw2.ravel(), axis=1))

            #print("weights")

def calc_final_pred(X, gating_net, experts_list):
    y_g = gating_net.forward(X)

    #y_e = []
    #for exp in experts_list:
    #    y_e +=[exp.forward(X).T[0]]
    #    print()
    #print(y_e)
    #y_e =np.matrix(y_e).T
    y_e = np.matrix([np.array(exp.forward(X).T.tolist()[0]) for exp in experts_list]).T

    return np.sum(np.multiply(y_e, y_g), axis=1)


def main():

    # In[ ]:
    input_path = r"input\treinamento-1.txt"

    input_data=pd.read_csv(open(input_path, "r"), header=None)
    input_data.columns = ["time_series"]
    input_data.head()


    # In[17]:


    lagged_data = util.create_lags(input_data, n=20)

    #util.to_excel(lagged_data, "time_series_with_lag.xlsx")


    # In[18]:


    correlations = lagged_data.corr()

    #util.to_excel(correlations, "correlations.xlsx")


    # In[19]:


    # CRIA OS INPUTS
    max_lags = 5

    lagged_data_ = lagged_data[["time_series"] + ["time_series_lag_{}".format(i) for i in range(1, max_lags+1)]].dropna()
    test, train = util.get_simple_sample(lagged_data_, 0.7)
    valid, test = util.get_simple_sample(test, 0.5)

    X_train= train[["time_series_lag_{}".format(i) for i in range(1, max_lags+1)]]
    X_train["bias"] = 1 # adicionando bias
    d_train = train[["time_series"]]

    X_test = test[["time_series_lag_{}".format(i) for i in range(1, max_lags+1)]]
    X_test["bias"] = 1 # adicionando bias
    d_test = test[["time_series"]]

    X_valid = valid[["time_series_lag_{}".format(i) for i in range(1, max_lags+1)]]
    X_valid["bias"] = 1 # adicionando bias
    d_valid = valid[["time_series"]]

    nInp = len(X_train.loc[0])
    nOut = len(d_train.loc[0])
    nHid_gat = 3
    nHid_exp = 3
    nExperts = 4

    #creates the gating network
    gating_net = Perceptron_Regressor.MLP(nInp=nInp, nHid=nHid_gat, nOut=nExperts,
                                          fFunc="sigmoid", gFunc="softmax",
                                          cost_func="entropy")

    #creates the expert networks list
    experts_list = []
    for i in range(nExperts):
        expert = Perceptron_Regressor.MLP(nInp=nInp, nHid=nHid_gat, nOut=nOut,
                                          fFunc="sigmoid", gFunc="sigmoid",
                                          cost_func="mse")
        
        experts_list+=[expert]


    likelihood = 0
    likelihood_prev = -float("inf")
    max_iterations = 1000
    min_ll_gain = 1e-3
    covar_matrix = np.identity(nExperts)

    #loop to execute the Expectation Maximization algorithm
    it = 0
    while it < max_iterations and abs(likelihood-likelihood_prev) > min_ll_gain:
    #     st_time = time.time()
        it+=1
        #calculates the outputs of each network
        y_g = gating_net.forward(X_train)
        #y_experts = np.matrix([exp.forward(X_train).T[0] for exp in experts_list]).T
        y_experts = np.matrix([np.array(exp.forward(X_train).T.tolist()[0]) for exp in experts_list]).T
        
        y_g_valid = gating_net.forward(X_valid)
        #y_experts_valid = np.matrix([exp.forward(X_valid).T[0] for exp in experts_list]).T
        y_experts_valid = np.matrix([np.array(exp.forward(X_valid).T.tolist()[0]) for exp in experts_list]).T
        
        #E step - Expectation
        # calculates the matrix h of posterior expectations for each expert
        likelihood_prev = likelihood
        likelihood, h_aux = calc_likelihood(d=np.matrix(d_train), y_experts=y_experts,
                                            y_g=y_g, covar_matrix=covar_matrix)
        likelihood_valid, h_aux_valid = calc_likelihood(d=np.matrix(d_valid), y_experts=y_experts_valid,
                                            y_g=y_g_valid, covar_matrix=covar_matrix)
    #     print("Likelihood time =", time.time()-st_time)
        
    #     st_time = time.time()
        #computes the h (posteriori likelihood) dividing elementwise the h_aux
        # by the sum of all elements of the matrix 
        h = np.divide(h_aux, np.sum(h_aux, axis=1))
        h_valid = np.divide(h_aux_valid, np.sum(h_aux_valid, axis=1))
        
        #M step - Maximization
        # minimize the cost function for gating and expert networks (maximize the ouputs)
        #First - maximize gating network (calulate the descend gradient for the error to h)
        maximize_gating(gating_net=gating_net, X_train=X_train, h=h,
                        X_valid=X_valid, d_valid=d_valid,use_fit=True)
    #     print("Maximixing Gating time =", time.time()-st_time)
        
        #then maximize each of the experts
        for k, expert in enumerate(experts_list):
    #         st_time = time.time()
            #compute the expert responsability in the error for each instance
            #expert_responsability = np.multiply(d_train, np.divide(np.sum(h, axis=0)[0,k], covar_matrix[k,k]))        
            maximize_expert(expert_net=expert, X_train=X_train, h=h[:,k],
                            d_train=d_train, covar_matrix=covar_matrix,
                            X_valid=X_valid, d_valid=d_valid,use_fit=True)
    #         print("\t\tMaximazing Expert",k,"time =", time.time()-st_time)
            
        y_pred = calc_final_pred(X=X_train, gating_net=gating_net, experts_list=experts_list)
        mse = util.get_mse(y_pred, d_train)
        print(it, "\t", likelihood, "\t", mse)
    print("y_pred:\n", y_pred)

if __name__ == "__main__":
    main()