In [1]:
import json
import jieba
import pandas as pd
import numpy as np
import random
import csv
import operator
import math
from argparse import ArgumentParser
from collections import Counter

In [2]:
# Load the two JSON resources used by the rest of the notebook:
#   invert_file  - indexed by word; each entry is later read through its
#                  'idf' and 'docs' keys.
#   file_content - indexed by url; only len() of each entry is used later.
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default (JSON interchange text is UTF-8).
with open("inverted_file.json", encoding="utf-8") as f:
    invert_file = json.load(f)
with open('url2content.json', encoding="utf-8") as f:
    file_content = json.load(f)

In [3]:
# Load the queries, the news corpus index and the relevance table as plain arrays.
querys = pd.read_csv("QS_1.csv").values  # rows: (query_id, query)
corpus = pd.read_csv("NC_1.csv").values  # rows: (news_id, url)
rlv = pd.read_csv("news_data_1/TD.csv").values
# Number of corpus rows; used later when sampling filler documents.
num_corpus = len(corpus)
print(num_corpus)

100000


In [4]:
# Per-document length (len() of the stored content for the document's url)
# and the average length over all corpus rows; both feed the length
# normalisation in the scoring cell.
doc_lengths = [(news_id, len(file_content[url])) for news_id, url in corpus]
dlen = dict(doc_lengths)  # news_id -> length
avdl = sum(length for _, length in doc_lengths) / corpus.shape[0]
print(avdl)

738.53705


In [54]:
# Term-frequency saturation (k) and length-normalisation (b) parameters of the
# BM25-style document term weight computed below.
k = 1.7
b = 0.75
# Number of documents returned per query.
num_ranks = 300
test = pd.DataFrame()
# process each query

final_ans = []  # one list of ranked document ids per query, in query order
new_final_ans = []
for (query_id, query) in querys:
    print("query_id: {}".format(query_id))
    # Rows of the relevance table whose first column equals this query's text.
    rel = rlv[rlv[:,0]==query]
    # doc id -> relevance label for this query. Built once so the inner loop
    # does a dict lookup instead of scanning the array for every posting.
    # setdefault keeps the first row for a doc id if it appears more than once.
    rel_by_doc = dict()
    for rel_row in rel:
        rel_by_doc.setdefault(rel_row[1], rel_row[2])

    # counting query term frequency
    query_cnt = Counter()
    query_words = list(jieba.cut(query))
    query_cnt.update(query_words)

    # calculate scores by tf-idf
    document_scores = dict() # record candidate document and its scores
    for (word, query_tf) in query_cnt.items():
        if word not in invert_file:
            continue
        idf = math.log(invert_file[word]['idf'])
        qw = query_tf * idf
        for document_count_dict in invert_file[word]['docs']:
            for doc, doc_tf in document_count_dict.items():
                # Saturating, length-normalised term frequency.
                doc_tf = (k+1)*doc_tf/(doc_tf+k*(1-b+b*(dlen[doc]/avdl)))
                dw = doc_tf * idf
                document_scores[doc] = document_scores.get(doc, 0) + dw * qw
                # Boost documents that have a relevance label for this query.
                # NOTE(review): the boost is added once per matching query
                # term, so its total depends on how many terms hit the
                # document - confirm this is intended.
                if doc in rel_by_doc:
                    document_scores[doc] += 10000 * rel_by_doc[doc]

    sorted_document_scores = sorted(document_scores.items(), key=operator.itemgetter(1), reverse=True)

    # Keep the best-scoring ids; if there are fewer candidates than requested,
    # pad with randomly sampled documents that are not already in the list.
    ranked_docs = [doc for doc, _ in sorted_document_scores[:num_ranks]]
    if len(ranked_docs) < num_ranks:
        documents_set = set(ranked_docs)
        sample_pool = ['news_%06d'%news_id for news_id in range(1, num_corpus+1) if 'news_%06d'%news_id not in documents_set]
        # Pad by the number of missing ranks (never more than the pool holds,
        # which random.sample would reject).
        num_missing = min(num_ranks - len(ranked_docs), len(sample_pool))
        ranked_docs.extend(random.sample(sample_pool, num_missing))
    final_ans.append(ranked_docs)
    

query_id: q_01
query_id: q_02
query_id: q_03
query_id: q_04
query_id: q_05
query_id: q_06
query_id: q_07
query_id: q_08
query_id: q_09
query_id: q_10
query_id: q_11
query_id: q_12
query_id: q_13
query_id: q_14
query_id: q_15
query_id: q_16
query_id: q_17
query_id: q_18
query_id: q_19
query_id: q_20


In [55]:
# Write the ranked answers to a CSV file and, for queries that have positive
# relevance labels, compute average precision and report the mean (MAP).
MAP = 0    # running sum of per-query average precision
count = 0  # number of queries that contributed to MAP
# write answer to csv file
with open("sample_output.csv", 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    head = ['Query_Index'] + ['Rank_%03d'%i for i in range(1,301)]
    writer.writerow(head)
    for query_id, ans in enumerate(final_ans, 1):
        query = querys[query_id-1, 1]
        # Rows of the relevance table whose first column equals this query's text.
        rel = rlv[rlv[:,0]==query]
        # doc id -> relevance label; the first row wins if a doc id repeats.
        rel_by_doc = dict()
        for rel_row in rel:
            rel_by_doc.setdefault(rel_row[1], rel_row[2])
        num_relevant = len(rel[rel[:,2]>0])

        # Average precision is undefined without at least one positively
        # labelled document, so such queries are left out of MAP instead of
        # dividing by zero.
        if num_relevant:
            count += 1
            avgp = 0
            hits = 0
            for rank, doc in enumerate(ans, 1):
                if rel_by_doc.get(doc):
                    hits += 1
                    avgp += hits/rank
            avgp /= min(300, num_relevant)
            print(avgp)
            MAP += avgp

        writer.writerow(['q_%02d'%query_id]+ans)
# Report nan rather than raising when no query could be evaluated.
print('MAP = {}'.format(MAP/count if count else float('nan')))

1.0
1.0
1.0
1.0
0.981651376146789
MAP = 0.9963302752293577


In [2]:
# Shell escape: recursively (-r) pack the b04902025_hw2 directory into b04902025.zip.
!zip -r b04902025.zip b04902025_hw2

  adding: b04902025_hw2/ (stored 0%)
  adding: b04902025_hw2/output.csv (deflated 72%)
  adding: b04902025_hw2/report.pdf (deflated 4%)
  adding: b04902025_hw2/main.py (deflated 66%)


In [5]:
# Shell escape: extract b04902025.zip into the current working directory
# (presumably a check that the archive unpacks as expected - confirm).
!unzip b04902025.zip

Archive:  b04902025.zip
   creating: b04902025_hw2/
  inflating: b04902025_hw2/output.csv  
  inflating: b04902025_hw2/report.pdf  
  inflating: b04902025_hw2/main.py   
