In [1]:
import math
import json
import numpy as np
import pandas as pd

In [2]:
def Preprocessing(DocList, BGLM):
    """Build the document-by-word count matrix from the files under ``Document/``.

    Parameters
    ----------
    DocList : dict
        Maps a document index to a file name; each file is read from
        ``"Document/" + name``. Row ``i`` of the result corresponds to the
        ``i``-th value in the dict's iteration order.
    BGLM : sized collection
        Only its length is used: it fixes the vocabulary size ``M``.

    Returns
    -------
    (N, M, A) : tuple
        ``N`` is the number of documents, ``M`` the vocabulary size and ``A``
        an ``N x M`` integer array where ``A[i, w]`` counts occurrences of
        word id ``w`` in document ``i``.

    Notes
    -----
    The first three lines of every file are skipped and the last
    whitespace-separated token of each remaining line is dropped before
    counting (presumably a header and an end-of-line marker -- confirm against
    the corpus format).
    """
    N = len(DocList)
    M = len(BGLM)
    A = np.zeros([N, M], int)

    for index, doc in enumerate(DocList.values()):
        # Use a context manager so the handle is closed even if parsing fails;
        # with thousands of documents, leaked handles can exhaust descriptors.
        with open("Document/" + doc, "r") as handle:
            for line in handle.readlines()[3:]:
                for word in line.split()[:-1]:
                    A[index, int(word)] += 1

    return N, M, A

def InitParam(M,N,K):
    """Draw random starting parameters with every row normalised to sum to 1.

    Parameters
    ----------
    M : int
        Vocabulary size (number of columns of ``THETA``).
    N : int
        Number of documents (number of rows of ``LAMBDA``).
    K : int
        Number of topics.

    Returns
    -------
    (LAMBDA, THETA) : tuple of ndarray
        ``LAMBDA`` has shape ``(N, K)`` and ``THETA`` has shape ``(K, M)``;
        both are drawn from ``np.random.random`` and then row-normalised.
    """
    LAMBDA = np.random.random([N,K])
    THETA = np.random.random([K,M])
    # One vectorised normalisation per matrix; normalising an already
    # normalised row again (as a nested loop would) is wasted work.
    LAMBDA /= LAMBDA.sum(axis=1, keepdims=True)
    THETA /= THETA.sum(axis=1, keepdims=True)
    return LAMBDA,THETA

def EStep(P,M,N,K,LAMBDA=None,THETA=None):
    """E-step: fill ``P[i, j, k]`` with ``THETA[k, j] * LAMBDA[i, k]`` normalised over ``k``.

    Parameters
    ----------
    P : ndarray, shape (N, M, K)
        Output buffer; it is overwritten in place and also returned.
    M, N, K : int
        Vocabulary size, number of documents and number of topics.
    LAMBDA : ndarray, shape (N, K), optional
        Document-topic weights. When omitted, the module-level ``LAMBDA`` is
        used so that existing ``EStep(P, M, N, K)`` calls keep working.
    THETA : ndarray, shape (K, M), optional
        Topic-word weights. When omitted, the module-level ``THETA`` is used.

    Returns
    -------
    ndarray
        The same ``P`` object. For any ``(i, j)`` whose unnormalised values sum
        to zero, ``P[i, j, :]`` is set to all zeros.
    """
    # Prefer explicitly passed parameters; fall back to notebook globals only
    # for backward compatibility with the four-argument call.
    if LAMBDA is None:
        LAMBDA = globals()["LAMBDA"]
    if THETA is None:
        THETA = globals()["THETA"]

    word_by_topic = THETA[:K, :M].T  # (M, K) view, shared by all documents
    for i in range(N):
        # One (M, K) temporary per document keeps peak memory well below a
        # second full (N, M, K) array.
        joint = word_by_topic * LAMBDA[i, :K]
        totals = joint.sum(axis=1, keepdims=True)
        # Entries whose total is zero stay at the zero fill value of `out`.
        P[i, :M, :K] = np.divide(joint, totals, out=np.zeros_like(joint), where=totals != 0)
    return P

def MStep(A,P,LAMBDA,Theta,M,N,K):
    """M-step: re-estimate topic-word and document-topic weights in place.

    Parameters
    ----------
    A : ndarray, shape (N, M)
        Word counts per document.
    P : ndarray, shape (N, M, K)
        Responsibilities produced by the E-step.
    LAMBDA : ndarray, shape (N, K)
        Document-topic weights; overwritten in place.
    Theta : ndarray, shape (K, M)
        Topic-word weights; overwritten in place.
    M, N, K : int
        Vocabulary size, number of documents and number of topics.

    Returns
    -------
    (LAMBDA, Theta) : tuple of ndarray
        The same two arrays that were passed in, after the update:

        * ``Theta[k, j]`` is ``sum_i A[i, j] * P[i, j, k]`` normalised over
          ``j``; a topic whose total is zero becomes uniform ``1 / M``.
        * ``LAMBDA[i, k]`` is ``sum_j A[i, j] * P[i, j, k]`` divided by the
          document length ``sum_j A[i, j]``; an empty document becomes
          uniform ``1 / K``.
    """
    uniform_topic = 1.0 / K if K else 0.0
    uniform_word = 1.0 / M if M else 0.0

    topic_word = np.zeros([K, M], float)
    for i in range(N):
        counts = A[i, :M]
        # weighted[j, k] = A[i, j] * P[i, j, k]
        weighted = counts[:, None] * P[i, :M, :K]
        topic_word += weighted.T

        doc_length = counts.sum()
        if doc_length == 0:
            LAMBDA[i, :K] = uniform_topic
        else:
            # Accumulate over ALL words of the document; keeping only the last
            # word's term would make the update depend on vocabulary order.
            LAMBDA[i, :K] = weighted.sum(axis=0) / doc_length

    totals = topic_word.sum(axis=1, keepdims=True)
    empty_topics = totals[:, 0] == 0
    totals[empty_topics] = 1.0  # avoid 0/0; these rows are overwritten just below
    topic_word /= totals
    topic_word[empty_topics] = uniform_word

    # Write through the parameter that was actually passed in rather than a
    # module-level name, so the function works on whatever arrays the caller owns.
    Theta[:K, :M] = topic_word
    return LAMBDA, Theta

def CurrentLogLikelihood(A,LAMBDA,THETA,M,N,K):
    """Return the corpus log-likelihood under the current parameters.

    Computes ``sum_{i,j} A[i, j] * log(sum_k THETA[k, j] * LAMBDA[i, k])``,
    skipping every ``(i, j)`` whose mixture probability is not strictly
    positive.

    Parameters
    ----------
    A : ndarray, shape (N, M)
        Word counts per document.
    LAMBDA : ndarray, shape (N, K)
        Document-topic weights.
    THETA : ndarray, shape (K, M)
        Topic-word weights.
    M, N, K : int
        Vocabulary size, number of documents and number of topics.

    Returns
    -------
    float
        The accumulated log-likelihood (``0.0`` when no term qualifies).
    """
    LogLikelihood = 0.0
    for i in range(N):
        # Mixture probability of every word in document i: shape (M,).
        mixture = LAMBDA[i, :K] @ THETA[:K, :M]
        counts = A[i, :M]
        # Zero-count words contribute nothing, so they can be skipped along
        # with the words whose probability is not positive.
        usable = (mixture > 0) & (counts != 0)
        LogLikelihood += float(np.dot(counts[usable], np.log(mixture[usable])))
    return LogLikelihood

In [None]:
# Load the index files. Context managers make sure each handle is closed as
# soon as the corresponding dict has been built.
with open("query_list.txt","r") as query_file:
    QueryList = {index : queries.strip('\n') for index, queries in enumerate(query_file)}

with open("doc_list.txt","r") as doc_file:
    DocList = {index : docs.strip('\n') for index, docs in enumerate(doc_file)}

# BGLM[w] is the second whitespace-separated field of line w of BGLM.txt.
with open("BGLM.txt","r") as bglm_file:
    BGLM = {index:float(lines.split()[1]) for index,lines in enumerate(bglm_file)}

N, M, A = Preprocessing(DocList, BGLM)

K = 10            # number of topics
MaxIter = 30      # upper bound on EM iterations
Threshold = 10.0  # stop once the log-likelihood gain drops below this value
Seed = 0          # fixed so that Restart & Run All reproduces the same model

# LAMBDA[i,k] = p(Tk|Di)
# THETA[i,j] = p(Wj|Ti)
np.random.seed(Seed)
LAMBDA,THETA = InitParam(M,N,K)

# P[i,j,k] = p(Tk|Di,Wj)
P = np.zeros([N,M,K])

# None marks "no previous iteration yet", so the first pass never triggers the
# convergence test.
OldLogLikelihood = None
NewLogLikelihood = None
for i in range(MaxIter):
    print(i)
    P = EStep(P,M,N,K)
    LAMBDA, THETA = MStep(A,P,LAMBDA,THETA,M,N,K)
    NewLogLikelihood = CurrentLogLikelihood(A,LAMBDA,THETA,M,N,K)
    if OldLogLikelihood is not None and (NewLogLikelihood - OldLogLikelihood) < Threshold:
        break
    OldLogLikelihood = NewLogLikelihood

0:00:00.591561
0:33:11.090269
7:09:33.561506
0:06:55.016381
0
8:04:18.757840


In [None]:
# Interpolation weights: Param_Alpha for the document unigram model,
# Param_Beta for the topic mixture, and the remainder for the background model.
Param_Alpha = 0.4
Param_Beta = 0.4
Background_Weight = 1 - Param_Alpha - Param_Beta

# Row-normalised counts; documents with no counted words keep an all-zero row
# instead of producing NaN from a 0/0 division.
DocLengths = A.sum(axis=1, keepdims=True)
A_Normalize = np.divide(A, DocLengths, out=np.zeros([N,M],float), where=DocLengths > 0)

# Row i of A / LAMBDA corresponds to the i-th value of DocList.
DocNames = list(DocList.values())

with open("submission.txt", "w") as f:
    f.write("Query,RetrievedDocuments\r\n")

    # .items() is required: iterating the dict directly yields only its keys.
    for index, query in QueryList.items():
        # Parse the query once instead of re-reading the file for every
        # document. The last token of each line is dropped, as in Document files.
        with open("Query/"+query,"r") as query_file:
            QueryWords = [int(word) for line in query_file for word in line.split()[:-1]]

        # Scores[i] is the product over query words of the interpolated
        # probability of that word under document i.
        Scores = np.ones(N)
        for word in QueryWords:
            a1 = Param_Alpha * A_Normalize[:, word]
            a2 = Param_Beta * (LAMBDA[:, :K] @ THETA[:K, word])
            a3 = Background_Weight * math.exp(BGLM[word])
            Scores *= a1 + a2 + a3

        # Highest score first; sorted() is stable, so ties keep document order.
        Score_Sort = sorted(zip(DocNames, Scores), key=lambda item: item[1], reverse=True)

        f.write(query + ",")
        for item in Score_Sort:
            f.write(item[0] + " ")
        f.write("\r\n")
    