In [1]:
from datetime import *
import math
import json
import numpy as np
import pandas as pd

In [2]:
def GetWords(QueryList):
    """Build the vocabulary shared by the document collection and the queries.

    Every whitespace-separated token of "Collection.txt" is registered first,
    followed by the tokens of each query file "Query/<name>" for the names in
    ``QueryList.values()``. IDs are consecutive integers starting at 0, handed
    out in order of first appearance.

    Parameters
    ----------
    QueryList : dict
        Mapping whose values are query file names relative to "Query/".

    Returns
    -------
    (ID2Word, Word2ID) : tuple of dict
        ``ID2Word`` maps id -> token and ``Word2ID`` maps token -> id; the two
        dictionaries are inverses of each other.
    """
    ID2Word = {}
    Word2ID = {}

    def register(word):
        # The next free id always equals the current vocabulary size.
        if word not in Word2ID:
            new_id = len(Word2ID)
            ID2Word[new_id] = word
            Word2ID[word] = new_id

    # `with` guarantees the handles are closed even if parsing fails.
    with open("Collection.txt", "r") as collection_file:
        for doc in collection_file:
            for word in doc.split():
                register(word)

    for query in QueryList.values():
        with open("Query/" + query, "r") as query_file:
            for line in query_file:
                # The last token of every query line is skipped
                # (presumably an end-of-line marker -- confirm against the data).
                for word in line.split()[:-1]:
                    register(word)

    return ID2Word, Word2ID
    
def Preprocessing(DocList, ID2Word):
    """Build the document-by-word count matrix from "Collection.txt".

    Parameters
    ----------
    DocList : sequence
        Only its length is used; it fixes the number of rows N.
    ID2Word : dict
        Mapping id -> token; its size fixes the number of columns M and its
        inverse gives the column of every token.

    Returns
    -------
    (N, M, A) : tuple
        ``A`` is an integer array of shape [N, M] where ``A[i, j]`` is the
        number of times token ``j`` occurs on line ``i`` of "Collection.txt".

    Raises
    ------
    KeyError
        If the collection contains a token that is not in ``ID2Word``.
    IndexError
        If the collection has more lines than ``len(DocList)``.
    """
    N = len(DocList)
    M = len(ID2Word)
    A = np.zeros([N, M], int)

    # Derive the token -> id lookup from the argument rather than from a
    # notebook-level global, so the function does not rely on hidden state.
    word_to_id = {word: word_id for word_id, word in ID2Word.items()}

    with open("Collection.txt", "r") as collection_file:
        for index, doc in enumerate(collection_file):
            for word in doc.split():
                A[index, word_to_id[word]] += 1

    return N, M, A

def InitParam(M,N,K,seed=None):
    """Draw random, row-normalised starting parameters.

    Parameters
    ----------
    M, N, K : int
        Number of columns of THETA, number of rows of LAMBDA, and the shared
        inner dimension (rows of THETA / columns of LAMBDA).
    seed : int or None, optional
        When given, the draws come from a private ``RandomState(seed)`` so the
        initialisation is reproducible. When ``None`` (default) the global
        NumPy random state is used.

    Returns
    -------
    (LAMBDA, THETA) : tuple of ndarray
        ``LAMBDA`` has shape [N, K] and ``THETA`` has shape [K, M]; every row
        of both arrays sums to 1.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # LAMBDA is drawn before THETA so the random stream is consumed in a
    # fixed order.
    LAMBDA = rng.random_sample([N, K])
    THETA = rng.random_sample([K, M])

    # Normalise each row into a probability distribution.
    LAMBDA /= LAMBDA.sum(axis=1, keepdims=True)
    THETA /= THETA.sum(axis=1, keepdims=True)

    return LAMBDA, THETA

def EStep(P,M,N,K,LAMBDA=None,THETA=None):
    """E-step: fill ``P[i, j, k]`` with the normalised product THETA[k, j] * LAMBDA[i, k].

    For every (i, j) the K products are divided by their sum; when that sum is
    zero the K entries are set to 0 instead.

    Parameters
    ----------
    P : ndarray, shape [N, M, K]
        Output buffer, overwritten in place.
    M, N, K : int
        Extents of the word, document and topic axes.
    LAMBDA : ndarray, shape [N, K], optional
    THETA : ndarray, shape [K, M], optional
        Current parameters. When omitted, the module-level ``LAMBDA`` /
        ``THETA`` are used, which keeps existing ``EStep(P, M, N, K)`` calls
        working.

    Returns
    -------
    ndarray
        The same ``P`` object that was passed in.
    """
    if LAMBDA is None:
        LAMBDA = globals()["LAMBDA"]
    if THETA is None:
        THETA = globals()["THETA"]

    # Transposed once so each document only needs a broadcasted multiply.
    word_topic = THETA[:K, :M].T  # shape [M, K]

    # One document at a time keeps the temporary at M*K values instead of N*M*K.
    for i in range(N):
        joint = word_topic * LAMBDA[i, :K]
        totals = joint.sum(axis=1, keepdims=True)
        no_mass = totals == 0
        # Divide by 1 where the total is zero to avoid a 0/0 warning; those
        # rows are then forced to 0.
        P[i, :M, :K] = np.where(no_mass, 0.0, joint / np.where(no_mass, 1.0, totals))

    return P

def MStep(A,P,LAMBDA,Theta,M,N,K):
    """M-step: re-estimate ``Theta`` and ``LAMBDA`` from counts and posteriors.

    * ``Theta[k, j]`` becomes ``sum_i A[i, j] * P[i, j, k]``, normalised over j.
      A row whose total is zero is set to the uniform value ``1 / M``.
    * ``LAMBDA[i, k]`` becomes ``sum_j A[i, j] * P[i, j, k]`` divided by
      ``sum_j A[i, j]``. A row of ``A`` that sums to zero gets ``1 / K``.

    Both parameter arrays are updated in place. The time spent on the
    ``Theta`` update is printed.

    Parameters
    ----------
    A : ndarray, shape [N, M]
        Count matrix.
    P : ndarray, shape [N, M, K]
        Posteriors produced by the E-step.
    LAMBDA : ndarray, shape [N, K]
    Theta : ndarray, shape [K, M]
    M, N, K : int
        Extents of the word, document and topic axes.

    Returns
    -------
    (LAMBDA, Theta) : tuple of ndarray
        The same array objects that were passed in.
    """
    t = datetime.now()
    counts = A[:N, :M]
    posteriors = P[:N, :M, :K]

    # The array passed as `Theta` is the one that is updated and returned;
    # no notebook-level global is read or written.
    topic_word = np.einsum("ij,ijk->kj", counts, posteriors)
    topic_mass = topic_word.sum(axis=1, keepdims=True)
    empty_topic = topic_mass == 0
    uniform_word = 1.0 / M if M else 0.0
    Theta[:K, :M] = np.where(
        empty_topic,
        uniform_word,
        topic_word / np.where(empty_topic, 1.0, topic_mass),
    )
    print(datetime.now()-t)

    doc_topic = np.einsum("ij,ijk->ik", counts, posteriors)
    # The row total does not depend on the topic, so it is computed once per
    # row rather than once per (row, topic) pair.
    doc_length = counts.sum(axis=1, keepdims=True)
    empty_doc = doc_length == 0
    uniform_topic = 1.0 / K if K else 0.0
    LAMBDA[:N, :K] = np.where(
        empty_doc,
        uniform_topic,
        doc_topic / np.where(empty_doc, 1.0, doc_length),
    )

    return LAMBDA, Theta

def CurrentLogLikelihood(A,LAMBDA,THETA,M,N,K):
    """Return ``sum_ij A[i, j] * log(sum_k THETA[k, j] * LAMBDA[i, k])``.

    Cells whose mixture value is not strictly positive are skipped, i.e. they
    contribute nothing to the total.

    Parameters
    ----------
    A : ndarray, shape [N, M]
        Count matrix.
    LAMBDA : ndarray, shape [N, K]
    THETA : ndarray, shape [K, M]
    M, N, K : int
        Extents of the word, document and topic axes.

    Returns
    -------
    float
        The accumulated log-likelihood.
    """
    # mixture[i, j] = sum_k LAMBDA[i, k] * THETA[k, j], as one matrix product.
    mixture = np.dot(LAMBDA[:N, :K], THETA[:K, :M])
    usable = mixture > 0
    # Non-positive cells are replaced by 1 so log() is defined; they are
    # masked out again below and therefore add nothing.
    log_mixture = np.where(usable, np.log(np.where(usable, mixture, 1.0)), 0.0)
    return float(np.sum(A[:N, :M] * log_mixture))

In [6]:
# Query file names keyed by their line position in query_list.txt.
with open("query_list.txt", "r") as query_list_file:
    QueryList = {index : queries.strip('\n') for index, queries in enumerate(query_list_file)}

# One entry per line of Collection.txt; each entry is that line's index.
with open("Collection.txt", "r") as collection_file:
    DocList = [index for index, words in enumerate(collection_file)]

# Second whitespace-separated field of every BGLM.txt line, keyed by line index.
# NOTE(review): later code passes these values to math.exp, so they are
# presumably log-probabilities -- confirm against the data file.
with open("BGLM.txt", "r") as bglm_file:
    BGLM = {index:float(lines.split()[1]) for index,lines in enumerate(bglm_file)}

ID2Word, Word2ID = GetWords(QueryList)

N, M, A = Preprocessing(DocList, ID2Word)

K = 128
MaxIter = 50
Stop_Threshold = 10.0

# LAMBDA[i,k] = p(Tk|Di)
# THETA[k,j] = p(Wj|Tk)
LAMBDA,THETA = InitParam(M,N,K)
# P[i,j,k] = p(Tk|Di,Wj)
# Dense float64 buffer of N*M*K values (8*N*M*K bytes).
P = np.zeros([N,M,K])

81615933312


In [5]:
# EM training: alternate E- and M-steps for at most MaxIter rounds and stop
# as soon as the log-likelihood gain drops below Stop_Threshold.
# The value 1 marks "no previous log-likelihood yet".
OldLogLikelihood = NewLogLikelihood = 1
for i in range(MaxIter):
    t = datetime.now()
    P = EStep(P, M, N, K)
    LAMBDA, THETA = MStep(A, P, LAMBDA, THETA, M, N, K)
    print(LAMBDA)
    print(THETA)
    NewLogLikelihood = CurrentLogLikelihood(A, LAMBDA, THETA, M, N, K)
    if OldLogLikelihood == 1 or not (NewLogLikelihood - OldLogLikelihood < Stop_Threshold):
        # Progress line: iteration, log-likelihood, gain, elapsed time.
        print(i, NewLogLikelihood, NewLogLikelihood - OldLogLikelihood, datetime.now() - t)
        OldLogLikelihood = NewLogLikelihood
    else:
        break

MemoryError: 

In [7]:
# Sanity check: total of the first row of THETA.
THETA[0].sum()

1.0

In [5]:
# Interpolation weights: document unigram model, topic model, and the
# remainder (1 - alpha - beta) for the background model.
Param_Alpha = 0.4
Param_Beta = 0.4

# Row-normalised counts. Rows that sum to zero stay all-zero instead of
# producing a division by zero.
doc_lengths = A.sum(axis=1, keepdims=True)
A_Normalize = A.astype(float) / np.where(doc_lengths == 0, 1, doc_lengths)

# `with` closes the submission file even if scoring raises part-way through.
with open("submission.txt", "w") as f:
    f.write("Query,RetrievedDocuments\r\n")

    for query in QueryList.values():
        # Read each query file once, not once per document. The last token of
        # every line is skipped, as in GetWords.
        with open("Query/" + query, "r") as query_file:
            query_word_ids = [
                Word2ID[word]
                for line in query_file
                for word in line.split()[:-1]
            ]

        # Scores are accumulated as log-probabilities for all documents at
        # once. A running product of many small probabilities can underflow
        # to 0 and destroy the ranking; the logarithm is monotonic, so the
        # document order is otherwise unchanged.
        log_scores = np.zeros(N)
        for word_id in query_word_ids:
            unigram_part = Param_Alpha * A_Normalize[:, word_id]
            topic_part = Param_Beta * np.dot(LAMBDA, THETA[:, word_id])
            # NOTE(review): BGLM is keyed by line index of BGLM.txt while
            # word_id is the first-appearance id from GetWords -- confirm
            # that the two numbering schemes agree.
            background_part = (1 - Param_Alpha - Param_Beta) * math.exp(BGLM[word_id])
            log_scores += np.log(unigram_part + topic_part + background_part)

        # Highest score first; the stable sort keeps the original document
        # order among ties.
        ranking = np.argsort(-log_scores, kind="stable")

        f.write(query + ",")
        for doc_index in ranking:
            # DocList is a list, so it is indexed by position; its entries are
            # converted with str() before being concatenated.
            f.write(str(DocList[doc_index]) + " ")
        f.write("\r\n")
    