In [1]:
from datetime import *
import math
import json
import numpy as np
import pandas as pd

In [2]:
def Preprocessing(BGLM):
    """Build the document/word count matrix from ``Collection.txt``.

    Every line of ``Collection.txt`` is treated as one document made of
    whitespace-separated integer word ids.

    Parameters
    ----------
    BGLM : sized container
        Background language model; only ``len(BGLM)`` is used, as the
        vocabulary size ``M``.

    Returns
    -------
    N : int
        Number of lines (documents) in ``Collection.txt``.
    M : int
        Vocabulary size.
    A : ndarray of int, shape (N, M)
        ``A[i, j]`` is the number of occurrences of word ``j`` in document ``i``.
    A_Normalize : ndarray of float, shape (N, M)
        Each row of ``A`` divided by its row sum. Rows of documents without
        any word stay all-zero instead of becoming NaN.
    """
    # Read the collection once and close the handle deterministically.
    with open("Collection.txt","r") as collection:
        documents = collection.readlines()

    N = len(documents)
    M = len(BGLM)
    A = np.zeros([N, M], int)
    A_Normalize = np.zeros([N,M], float)

    for index, doc in enumerate(documents):
        for word in doc.split():
            A[index,int(word)] += 1

    # Only divide rows that contain at least one word; empty rows keep the
    # zeros they were initialised with (avoids 0/0 -> NaN).
    row_totals = A.sum(axis=1, keepdims=True)
    np.divide(A, row_totals, out=A_Normalize, where=row_totals != 0)

    return N, M, A, A_Normalize

def InitParam(M,N,K):
    """Draw random, row-normalised starting values for the two PLSA matrices.

    Parameters
    ----------
    M : int
        Number of columns of the returned ``THETA``.
    N : int
        Number of rows of the returned ``LAMBDA``.
    K : int
        Number of columns of ``LAMBDA`` and rows of ``THETA``.

    Returns
    -------
    (LAMBDA, THETA) : tuple of ndarray
        ``LAMBDA`` has shape (N, K) and ``THETA`` has shape (K, M); every row
        of both matrices sums to one.
    """
    # Draw order matters for reproducibility under a fixed seed:
    # LAMBDA first, THETA second.
    doc_topic = np.random.random([N,K])
    topic_word = np.random.random([K,M])

    # Iterating a 2-D array yields row views, so the in-place division
    # rescales the matrices themselves.
    for matrix in (doc_topic, topic_word):
        for row in matrix:
            row /= np.sum(row)

    return doc_topic, topic_word

def EStep(P,M,N,K,LAMBDA=None,THETA=None):
    """E-step: fill ``P[i, j, k]`` with ``THETA[k, j] * LAMBDA[i, k]``, normalised over ``k``.

    Parameters
    ----------
    P : ndarray, shape (N, M, K)
        Output buffer; overwritten in place and also returned.
    M, N, K : int
        Number of words, documents and topics to process.
    LAMBDA : ndarray, shape (N, K), optional
        Document/topic weights. When omitted, the module-level ``LAMBDA`` is
        used, which is what the original four-argument call relied on.
    THETA : ndarray, shape (K, M), optional
        Topic/word weights. When omitted, the module-level ``THETA`` is used.

    Returns
    -------
    ndarray
        The same ``P`` object. For every (i, j) the K entries sum to one,
        except when their unnormalised sum is zero, in which case they are
        all set to zero.
    """
    # Backward compatibility: older callers pass only (P, M, N, K) and expect
    # the parameters to be picked up from the notebook's global namespace.
    if LAMBDA is None:
        LAMBDA = globals()["LAMBDA"]
    if THETA is None:
        THETA = globals()["THETA"]

    topic_word = THETA[:K, :M].T  # (M, K) view, shared by every document

    # One document at a time keeps the temporaries at M x K instead of
    # materialising a second N x M x K tensor.
    for i in range(N):
        joint = topic_word * LAMBDA[i, :K]
        norm = joint.sum(axis=1, keepdims=True)
        has_mass = norm != 0
        # Substitute 1 for zero denominators so the division never sees 0/0;
        # those cells are forced to 0 by the outer np.where.
        P[i, :M, :K] = np.where(has_mass, joint / np.where(has_mass, norm, 1), 0)

    return P

def MStep(A,P,LAMBDA,Theta,M,N,K):
    """M-step: re-estimate the topic/word and document/topic matrices from ``P``.

    Parameters
    ----------
    A : ndarray, shape (N, M)
        Word counts per document.
    P : ndarray, shape (N, M, K)
        Per document/word topic weights produced by the E-step.
    LAMBDA : ndarray, shape (N, K)
        Document/topic matrix; overwritten in place.
    Theta : ndarray, shape (K, M)
        Topic/word matrix; overwritten in place. (Previously the body wrote
        to a global named ``THETA`` and ignored this argument.)
    M, N, K : int
        Number of words, documents and topics.

    Returns
    -------
    (LAMBDA, Theta) : tuple of ndarray
        The two updated input arrays. A topic whose total weight is zero gets
        the uniform row ``1 / M``; a document without words gets ``1 / K``.
    """
    t = datetime.now()
    counts = np.asarray(A[:N, :M], dtype=float)
    posterior = P[:N, :M, :K]

    # topic_word[k, j] = sum_i A[i, j] * P[i, j, k]
    topic_word = np.einsum('ij,ijk->kj', counts, posterior)
    topic_mass = topic_word.sum(axis=1, keepdims=True)
    empty_topic = topic_mass == 0
    Theta[:K, :M] = np.where(
        empty_topic,
        1.0 / M,
        topic_word / np.where(empty_topic, 1, topic_mass),
    )
    # Timing of the topic/word update, kept from the original implementation.
    print(datetime.now()-t)

    # doc_topic[i, k] = sum_j A[i, j] * P[i, j, k]
    doc_topic = np.einsum('ij,ijk->ik', counts, posterior)
    doc_length = counts.sum(axis=1, keepdims=True)
    empty_doc = doc_length == 0
    LAMBDA[:N, :K] = np.where(
        empty_doc,
        1.0 / K,
        doc_topic / np.where(empty_doc, 1, doc_length),
    )

    return LAMBDA, Theta

def CurrentLogLikelihood(A,LAMBDA,THETA,M,N,K):
    """Log-likelihood of the counts ``A`` under the current mixture.

    Computes ``sum_ij A[i, j] * log(sum_k LAMBDA[i, k] * THETA[k, j])``.
    Cells whose mixture value is not strictly positive are skipped.

    Parameters
    ----------
    A : ndarray, shape (N, M)
        Word counts per document.
    LAMBDA : ndarray, shape (N, K)
        Document/topic matrix.
    THETA : ndarray, shape (K, M)
        Topic/word matrix.
    M, N, K : int
        Number of words, documents and topics.

    Returns
    -------
    float
        The log-likelihood; ``0.0`` when no cell has a positive mixture value.
    """
    # One matrix product replaces the explicit loop over topics.
    mixture = np.dot(LAMBDA[:N, :K], THETA[:K, :M])
    counts = np.asarray(A)[:N, :M]

    # Select the positive cells first so log() never sees zero.
    positive = mixture > 0
    return float(np.sum(counts[positive] * np.log(mixture[positive])))


def Fold_In(LAMBDA,THETA,Words,A,M,N,K):
    """Estimate document/topic weights for all ``N`` rows of ``A`` with ``THETA`` fixed.

    Starts from a fresh random, row-normalised (N, K) matrix and applies one
    E-step followed by one document/topic update.

    Parameters
    ----------
    LAMBDA : ignored
        The incoming value is discarded and replaced by a random matrix.
        NOTE(review): this also discards weights learnt during training;
        confirm that is intended.
    THETA : ndarray, shape (K, M)
        Topic/word matrix; read only.
    Words : ignored
        Not used by the computation; kept for interface compatibility.
    A : ndarray, shape (N, M)
        Word counts per document.
    M, N, K : int
        Number of words, documents and topics.

    Returns
    -------
    ndarray, shape (N, K)
        New document/topic matrix. Rows of documents without any word are
        uniform (``1 / K``).
    """
    LAMBDA = np.random.random([N,K])
    for i in range(N):
        LAMBDA[i,] /= np.sum(LAMBDA[i,])

    topic_word = THETA[:K, :M].T  # (M, K)

    # Each document only depends on its own row, so the E-step and the update
    # are fused per document. This avoids allocating an N x M x K tensor.
    for i in range(N):
        joint = topic_word * LAMBDA[i]
        norm = joint.sum(axis=1, keepdims=True)
        has_mass = norm != 0
        posterior = np.where(has_mass, joint / np.where(has_mass, norm, 1), 0)

        doc_length = np.sum(A[i,:])
        if doc_length == 0:
            LAMBDA[i] = 1.0 / K
        else:
            LAMBDA[i] = np.dot(A[i,:], posterior) / doc_length

    return LAMBDA

In [3]:
# Line number -> query file name.
with open("query_list.txt","r") as query_list_file:
    QueryList = {index : queries.strip('\n') for index, queries in enumerate(query_list_file)}

# Line number -> document file name.
with open("doc_list.txt","r") as doc_list_file:
    DocList = {index : docs.strip('\n') for index, docs in enumerate(doc_list_file)}

# Line number -> second whitespace-separated field of that line, as float.
with open("BGLM.txt","r") as bglm_file:
    BGLM = {index:float(lines.split()[1]) for index,lines in enumerate(bglm_file)}

N, M, A, A_Normalize = Preprocessing(BGLM)

K = 128
MaxIter = 100
Stop_Threshold = 10

# LAMBDA[i,k] = p(Tk|Di)
# THETA[i,j] = p(Wj|Ti)
LAMBDA,THETA = InitParam(M,N,K)
# P[i,j,k] = p(Tk|Di,Wj)
# NOTE(review): this dense float64 tensor needs N * M * K * 8 bytes and is the
# most likely source of the MemoryError recorded for this cell - confirm, and
# consider a smaller K or a formulation that does not materialise P.
P = np.zeros([N,M,K])


MemoryError: 

In [None]:
# EM training loop. The value 1 marks "no previous likelihood yet"; the
# convergence test is skipped while that marker is still in place.
NewLogLikelihood = OldLogLikelihood = 1
for iteration in range(MaxIter):
    started = datetime.now()

    P = EStep(P,M,N,K)
    LAMBDA, THETA = MStep(A,P,LAMBDA,THETA,M,N,K)
    NewLogLikelihood = CurrentLogLikelihood(A,LAMBDA,THETA,M,N,K)

    # Stop (without printing) once the gain drops below the threshold.
    has_previous = OldLogLikelihood != 1
    if has_previous and (NewLogLikelihood - OldLogLikelihood) < Stop_Threshold:
        break

    # Progress line: iteration, likelihood, gain over the previous value, elapsed time.
    fields = (iteration, NewLogLikelihood, NewLogLikelihood - OldLogLikelihood, datetime.now() - started)
    print(" ".join(str(field) for field in fields))
    OldLogLikelihood = NewLogLikelihood

In [None]:
# Append one count row per listed document to A / A_Normalize, then fold the
# enlarged matrix into the trained model.
Orig_N = N

folded_rows = []
for index, doc in DocList.items():
    Words = np.zeros(M)
    # NOTE(review): documents are read from "Query/" here, the same directory
    # the scoring cell reads queries from - confirm this path is intended.
    with open("Query/"+doc,"r") as doc_file:
        # The first three lines and the last token of every line are skipped.
        for line in doc_file.readlines()[3:]:
            for word in line.split()[:-1]:
                Words[int(word)] += 1
    folded_rows.append(Words)

if folded_rows:
    folded_counts = np.vstack(folded_rows)
    folded_totals = folded_counts.sum(axis=1, keepdims=True)
    # Rows without any word stay all-zero instead of becoming NaN.
    folded_normalized = np.divide(
        folded_counts,
        folded_totals,
        out=np.zeros_like(folded_counts),
        where=folded_totals != 0,
    )

    # Stack once instead of growing the arrays inside the loop. A_Normalize
    # has to grow together with A: writing to A_Normalize[N] on the
    # un-extended array is out of bounds.
    A = np.vstack([A, folded_counts])
    A_Normalize = np.vstack([A_Normalize, folded_normalized])
    N += len(folded_rows)

    # Fold_In recomputes every row from scratch on each call, so a single call
    # after all rows are appended is sufficient. Arguments follow the
    # signature Fold_In(LAMBDA, THETA, Words, A, M, N, K).
    LAMBDA = Fold_In(LAMBDA,THETA,Words,A,M,N,K)

In [None]:
# Interpolation weights: Param_Alpha for the document's own word frequencies,
# Param_Beta for the topic mixture, the remainder for the background model.
Param_Alpha = 0
Param_Beta = 0


def _safe_log(value):
    """Natural log that returns -inf for non-positive input instead of raising."""
    return math.log(value) if value > 0 else -np.inf


# Weight terms do not depend on the query or document, so compute them once.
log_alpha = _safe_log(Param_Alpha)
log_beta = _safe_log(Param_Beta)
log_background = _safe_log(1 - Param_Alpha - Param_Beta)

with open("submission.txt", "w") as f:
    f.write("Query,RetrievedDocuments\r\n")

    for index, query in QueryList.items():
        # Read the query once instead of once per document. Tokens are parsed
        # as integer word ids, the same way the other cells parse their input.
        # NOTE(review): the original indexed through a `Word2ID` mapping that
        # is not defined anywhere in this notebook - confirm ids are integers.
        with open("Query/"+query,"r") as query_file:
            query_word_ids = [int(word) for line in query_file for word in line.split()[:-1]]

        f.write(query + ",")
        Score = {}
        for ind,doc in DocList.items():
            s = 0
            i = ind + Orig_N  # row of this document in the extended matrices
            for word_id in query_word_ids:
                # Three components combined in log space; a zero weight or
                # zero probability contributes -inf, which logaddexp ignores.
                a1 = log_alpha + _safe_log(A_Normalize[i,word_id])
                a2 = _safe_log(np.sum(LAMBDA[i,:] * THETA[:,word_id])) + log_beta
                # BGLM values are added without taking a log, as in the original.
                a3 = log_background + BGLM[word_id]
                s += np.logaddexp(np.logaddexp(a1,a2),a3)
            Score.update({doc : s})
        Score_Sort = sorted(Score.items(), key=lambda entry: entry[1],reverse=True)

        for item in Score_Sort:
            f.write(item[0] + " ")
        f.write("\r\n")
    