In [1]:
from datetime import *
import math
import json
import numpy as np
import pandas as pd

In [2]:
def GetWords(QueryList, DocList):
    """Build the vocabulary shared by every document and query file.

    Parameters
    ----------
    QueryList : dict
        Maps an index to a file name under ``Query/``.
    DocList : dict
        Maps an index to a file name under ``Document/``.

    Returns
    -------
    (ID2Word, Word2ID) : tuple of dict
        ``ID2Word`` maps consecutive integer IDs (starting at 0) to tokens and
        ``Word2ID`` is its inverse. IDs are handed out in first-seen order:
        all documents first (in ``DocList`` order), then all queries.
    """
    ID2Word = {}
    Word2ID = {}

    def register(lines):
        # Give the next free ID to every token that has not been seen yet.
        for line in lines:
            # The last token of each line is dropped, as in the original
            # code (presumably an end-of-line marker -- TODO confirm).
            for word in line.split()[:-1]:
                if word not in Word2ID:
                    new_id = len(Word2ID)
                    ID2Word[new_id] = word
                    Word2ID[word] = new_id

    for doc in DocList.values():
        # `with` closes each handle promptly instead of leaking one per file.
        with open("Document/" + doc, "r") as handle:
            # The first three lines of a document are skipped
            # (presumably a header -- TODO confirm).
            register(handle.readlines()[3:])

    for query in QueryList.values():
        with open("Query/" + query, "r") as handle:
            register(handle)

    return ID2Word, Word2ID
    
def Preprocessing(DocList, ID2Word):
    """Count how often each vocabulary word occurs in each document.

    Parameters
    ----------
    DocList : dict
        Maps an index to a file name under ``Document/``. Rows of the result
        follow the iteration order of this dict.
    ID2Word : dict
        Maps integer word IDs to tokens; the IDs are used as column indices.

    Returns
    -------
    (N, M, A) : tuple
        ``N`` is the number of documents, ``M`` the vocabulary size and ``A``
        an ``N x M`` integer array with ``A[i, j]`` the count of word ``j`` in
        document ``i``.

    Raises
    ------
    KeyError
        If a document contains a token that is not a value of ``ID2Word``.
    """
    N = len(DocList)
    M = len(ID2Word)
    A = np.zeros([N, M], int)

    # Derive the token -> column lookup from the argument itself so the
    # function does not depend on a module-level Word2ID being defined.
    word_to_id = {word: word_id for word_id, word in ID2Word.items()}

    for index, doc in enumerate(DocList.values()):
        with open("Document/" + doc, "r") as handle:
            # Skip the first three lines and the last token of every line,
            # mirroring how the vocabulary is collected in GetWords.
            for line in handle.readlines()[3:]:
                for word in line.split()[:-1]:
                    A[index, word_to_id[word]] += 1

    return N, M, A

def InitParam(M,N,K):
    """Draw random starting parameters and normalise every row to sum to 1.

    Returns ``(LAMBDA, THETA)`` where ``LAMBDA`` has shape ``(N, K)`` and
    ``THETA`` has shape ``(K, M)``. Values come from ``np.random.random``,
    so the result depends on NumPy's global random state.
    """
    doc_topic = np.random.random([N, K])
    topic_word = np.random.random([K, M])

    # Iterating a 2-D array yields row views, so the in-place division
    # below rescales the rows of the underlying matrix.
    for row in doc_topic:
        row /= np.sum(row)
    for row in topic_word:
        row /= np.sum(row)

    return doc_topic, topic_word

def EStep(P,M,N,K,LAMBDA=None,THETA=None):
    """E-step: fill ``P[i, j, k]`` with the normalised ``LAMBDA[i, k] * THETA[k, j]``.

    Parameters
    ----------
    P : ndarray, shape (N, M, K), float
        Overwritten in place and also returned.
    M, N, K : int
        Number of words, documents and topics to process.
    LAMBDA : ndarray, shape (N, K), optional
    THETA : ndarray, shape (K, M), optional
        When omitted, the module-level ``LAMBDA`` / ``THETA`` are used, so the
        existing call ``EStep(P, M, N, K)`` keeps working. Passing them
        explicitly avoids the hidden dependency on notebook state.

    Returns
    -------
    ndarray
        The same ``P`` object. For each (i, j) the K entries are divided by
        their sum; if that sum is 0 the entries are set to 0.

    Raises
    ------
    KeyError
        If a parameter is omitted and no module-level array of that name exists.
    """
    if LAMBDA is None:
        LAMBDA = globals()["LAMBDA"]
    if THETA is None:
        THETA = globals()["THETA"]
    LAMBDA = np.asarray(LAMBDA)
    THETA = np.asarray(THETA)

    # View on the region the original loops touched; writes go straight to P.
    posterior = P[:N, :M, :K]

    # Broadcast (N, 1, K) * (1, M, K) directly into P, so no second
    # N x M x K array is allocated.
    np.multiply(LAMBDA[:N, np.newaxis, :K],
                THETA[:K, :M].T[np.newaxis, :, :],
                out=posterior)

    totals = posterior.sum(axis=2, keepdims=True)
    # Normalise only where the denominator is non-zero ...
    np.divide(posterior, totals, out=posterior, where=totals != 0)
    # ... and zero the remaining cells explicitly.
    posterior[totals[..., 0] == 0] = 0

    return P

def MStep(A,P,LAMBDA,Theta,M,N,K):
    """M-step: re-estimate ``Theta`` and ``LAMBDA`` from counts and posteriors.

    Parameters
    ----------
    A : ndarray, shape (N, M)
        Word counts per document.
    P : ndarray, shape (N, M, K)
        Posteriors produced by the E-step.
    LAMBDA : ndarray, shape (N, K)
        Updated in place.
    Theta : ndarray, shape (K, M)
        Updated in place.
    M, N, K : int
        Number of words, documents and topics.

    Returns
    -------
    (LAMBDA, Theta) : the two arrays that were passed in, after the update.

    Notes
    -----
    * ``Theta[k, j]`` becomes ``sum_i A[i, j] * P[i, j, k]`` normalised over
      ``j``; a topic whose row sums to 0 is reset to the uniform ``1 / M``.
    * ``LAMBDA[i, k]`` becomes ``sum_j A[i, j] * P[i, j, k]`` divided by the
      length of document ``i``; an empty document gets the uniform ``1 / K``.
    * The time spent on the ``Theta`` update is printed, as before.
    """
    t = datetime.now()

    counts = np.asarray(A, dtype=float)[:N, :M]
    posterior = P[:N, :M, :K]

    # einsum contracts over documents without building the N x M x K product.
    topic_word = np.einsum("ij,ijk->kj", counts, posterior)
    topic_mass = topic_word.sum(axis=1, keepdims=True)
    np.divide(topic_word, topic_mass, out=topic_word, where=topic_mass != 0)
    topic_word[topic_mass[:, 0] == 0, :] = 1.0 / M
    # Write into the array that was passed in, not a module-level THETA.
    Theta[:K, :M] = topic_word
    print(datetime.now()-t)

    doc_topic = np.einsum("ij,ijk->ik", counts, posterior)
    # The document length does not depend on the topic, so compute it once.
    doc_length = counts.sum(axis=1, keepdims=True)
    np.divide(doc_topic, doc_length, out=doc_topic, where=doc_length != 0)
    doc_topic[doc_length[:, 0] == 0, :] = 1.0 / K
    LAMBDA[:N, :K] = doc_topic

    return LAMBDA, Theta

def CurrentLogLikelihood(A,LAMBDA,THETA,M,N,K):
    """Return ``sum_ij A[i, j] * log(sum_k LAMBDA[i, k] * THETA[k, j])``.

    Parameters
    ----------
    A : ndarray, shape (N, M)
        Word counts per document.
    LAMBDA : ndarray, shape (N, K)
    THETA : ndarray, shape (K, M)
    M, N, K : int
        Number of words, documents and topics to include.

    Returns
    -------
    float
        The log-likelihood. Cells whose mixture value is not strictly
        positive contribute nothing, exactly as in the loop version.
    """
    doc_topic = np.asarray(LAMBDA, dtype=float)[:N, :K]
    topic_word = np.asarray(THETA, dtype=float)[:K, :M]

    # One matrix product replaces the Python loop over (i, j, k).
    mixture = np.dot(doc_topic, topic_word)

    # Take the log only where it is defined; other cells keep the 0 fill,
    # so they add nothing to the sum and no log(0) warning is raised.
    log_mixture = np.zeros_like(mixture)
    np.log(mixture, out=log_mixture, where=mixture > 0)

    return np.sum(np.asarray(A)[:N, :M] * log_mixture)

# Disabled draft of a Fold_In step. It is wrapped in a bare string literal, so
# nothing in it is executed or defined. The text is not valid Python as written
# (the loop over Words.items() has no body), and the second half reads `A`,
# which is not one of the draft's parameters. Because the string is the last
# expression of the cell, the notebook echoes it back as the cell output.
"""
def Fold_In(P,LAMBDA,THETA,Words,M,N,K):
    s = 0
    for word, cnt in Words.items():
        
    
    for i in range(N):
        for j in range(M):
            for k in range(K):
                P[i,j,k] = THETA[k,j] * LAMBDA[i,k]
            s = np.sum(P[i,j,:])
            if s == 0:
                for k in range(K):
                    P[i,j,k] = 0
            else:
                for k in range(K):
                    P[i,j,k] /= s
    
    for i in range(N):
        for k in range(K):
            LAMBDA[i,k] = np.sum(A[i,:] * P[i,:,k])
            s = np.sum(A[i,:])
            if s == 0:
                LAMBDA[i,k] = 1.0 / K
            else:
                LAMBDA[i,k] /= s
    return P, LAMBDA
"""

'\ndef Fold_In(P,LAMBDA,THETA,Words,M,N,K):\n    s = 0\n    for word, cnt in Words.items():\n        \n    \n    for i in range(N):\n        for j in range(M):\n            for k in range(K):\n                P[i,j,k] = THETA[k,j] * LAMBDA[i,k]\n            s = np.sum(P[i,j,:])\n            if s == 0:\n                for k in range(K):\n                    P[i,j,k] = 0\n            else:\n                for k in range(K):\n                    P[i,j,k] /= s\n    \n    for i in range(N):\n        for k in range(K):\n            LAMBDA[i,k] = np.sum(A[i,:] * P[i,:,k])\n            s = np.sum(A[i,:])\n            if s == 0:\n                LAMBDA[i,k] = 1.0 / K\n            else:\n                LAMBDA[i,k] /= s\n    return P, LAMBDA\n'

In [9]:
# File names of queries and documents, keyed by their line position in the
# list files. `with` closes each list file as soon as it has been read.
with open("query_list.txt","r") as handle:
    QueryList = {index : queries.strip('\n') for index, queries in enumerate(handle)}

with open("doc_list.txt","r") as handle:
    DocList = {index : docs.strip('\n') for index, docs in enumerate(handle)}

# Second whitespace-separated column of every BGLM.txt line, keyed by line
# position (presumably a background log-probability per word -- TODO confirm).
with open("BGLM.txt","r") as handle:
    BGLM = {index:float(lines.split()[1]) for index,lines in enumerate(handle)}

ID2Word, Word2ID = GetWords(QueryList, DocList)

N, M, A = Preprocessing(DocList, ID2Word)

K = 10
MaxIter = 100
Stop_Threshold = 10
Random_Seed = 0

# Fix NumPy's global random state so that a restarted kernel reproduces the
# same random initialisation (and therefore the same training run).
np.random.seed(Random_Seed)

# LAMBDA[i,k] = p(Tk|Di)   -- shape (N, K)
# THETA[k,j]  = p(Wj|Tk)   -- shape (K, M)
LAMBDA,THETA = InitParam(M,N,K)
LAMBDA

array([[0.08265687, 0.12389915, 0.14431716, ..., 0.03747926, 0.04744107,
        0.0675116 ],
       [0.06455302, 0.10233558, 0.14369174, ..., 0.08801251, 0.14222262,
        0.05674945],
       [0.10797379, 0.11510747, 0.06133635, ..., 0.13700476, 0.12341014,
        0.13449628],
       ...,
       [0.03526934, 0.03817748, 0.05058374, ..., 0.00218736, 0.02939629,
        0.10860892],
       [0.12396263, 0.11380203, 0.13292816, ..., 0.13468656, 0.07642923,
        0.13944311],
       [0.11103276, 0.11488765, 0.10807695, ..., 0.01470436, 0.10691114,
        0.12396762]])

In [10]:
# Posterior table: P[i,j,k] = p(Tk|Di,Wj)
P = np.zeros((N, M, K))

# The value 1 marks "no previous likelihood yet"; the convergence test is
# skipped while OldLogLikelihood still holds it.
OldLogLikelihood = NewLogLikelihood = 1

for iteration in range(MaxIter):
    iteration_start = datetime.now()

    P = EStep(P,M,N,K)
    LAMBDA, THETA = MStep(A,P,LAMBDA,THETA,M,N,K)
    NewLogLikelihood = CurrentLogLikelihood(A,LAMBDA,THETA,M,N,K)

    improvement = NewLogLikelihood - OldLogLikelihood
    have_previous = OldLogLikelihood != 1
    # Stop once the gain over the previous iteration falls below the threshold.
    if have_previous and improvement < Stop_Threshold:
        break

    # Progress line: iteration, log-likelihood, gain, elapsed time.
    print(" ".join([
        str(iteration),
        str(NewLogLikelihood),
        str(improvement),
        str(datetime.now() - iteration_start),
    ]))
    OldLogLikelihood = NewLogLikelihood

KeyboardInterrupt: 

In [None]:
# Disabled cell: the whole body is a string literal, so none of it runs.
# As written it would count the words of every line in Collection.txt, pass the
# counts to Fold_In, and then save LAMBDA and THETA as CSV files. Fold_In itself
# only appears in this notebook as a disabled draft, so enabling this cell
# requires a working Fold_In first.
"""
for doc in open("Collection.txt","r").readlines():
    Words = {}
    for word in doc.split():
        if word in Words:
            Words[word] += 1
        else:
            Words[word] = 1
    P, LAMBDA = Fold_In(P,LAMBDA,THETA,Words,M,N,K)
np.savetxt("lambda_K10.csv",LAMBDA,delimiter=",")
np.savetxt("theta_K10.csv",THETA,delimiter=",")
"""

In [None]:
# Interpolation weights: Param_Alpha for the document unigram model,
# Param_Beta for the topic mixture, and the remainder for the BGLM term.
Param_Alpha = 0.45
Param_Beta = 0.35

# Per-document relative word frequencies. Rows of empty documents stay
# all-zero instead of becoming NaN through a 0/0 division.
doc_lengths = A.sum(axis=1, keepdims=True)
A_Normalize = np.zeros([N,M],float)
np.divide(A, doc_lengths, out=A_Normalize, where=doc_lengths > 0)

# DocList keys are used as row indices into A_Normalize / LAMBDA.
doc_rows = list(DocList.keys())
doc_names = list(DocList.values())

log_alpha = np.log(Param_Alpha)
log_beta = np.log(Param_Beta)
log_rest = np.log(1 - Param_Alpha - Param_Beta)

# `with` guarantees the submission file is flushed and closed even if scoring fails.
with open("submission.txt", "w") as f:
    f.write("Query,RetrievedDocuments\r\n")

    for query in QueryList.values():
        # Read the query once, not once per document. The last token of
        # every line is dropped, as everywhere else in this notebook.
        with open("Query/"+query,"r") as query_file:
            query_words = [word for line in query_file for word in line.split()[:-1]]

        # One running log-score per document, updated for all documents at once.
        doc_scores = np.zeros(len(doc_rows))
        for word in query_words:
            word_id = Word2ID[word]
            # log(0) = -inf is intended here (the word is absent from a
            # document or topic); logaddexp handles -inf, so silence the warning.
            with np.errstate(divide="ignore"):
                a1 = log_alpha + np.log(A_Normalize[doc_rows, word_id])
                a2 = np.log(np.sum(LAMBDA[doc_rows, :] * THETA[:, word_id], axis=1)) + log_beta
            # NOTE(review): BGLM is keyed by line position in BGLM.txt, while
            # Word2ID ids are assigned by GetWords in first-seen order. Confirm
            # the two numberings agree; otherwise this looks up the wrong word.
            a3 = log_rest + BGLM[word_id]
            doc_scores += np.logaddexp(np.logaddexp(a1, a2), a3)

        # Highest score first; ties keep DocList order because the sort is stable.
        Score = dict(zip(doc_names, doc_scores))
        Score_Sort = sorted(Score.items(), key=lambda item: item[1], reverse=True)

        f.write(query + "," + "".join(name + " " for name, _ in Score_Sort) + "\r\n")
    