In [33]:
# data
import os

from nltk.corpus import stopwords

with open('config', mode='r', encoding='utf-8') as cfile:
    conf = cfile.readlines()
    path_root = conf[0][len('path_root='):].strip()
    dir_source = conf[1][len('dir_source='):].strip()
    dir_temp = conf[2][len('dir_temp='):].strip()  # folder name for the processed files
    fn_indexer = conf[3][len('fn_indexer='):].strip()    # new file to store the indexer
    fn_doc_id = conf[4][len('fn_doc_id='):].strip()
    fn_doc_list = conf[5][len('fn_doc_list='):].strip()
    fn_doc_len = conf[6][len('fn_doc_len='):].strip()

path_source = os.path.join(path_root, dir_source)
path_temp = os.path.join(path_root, dir_temp)
doc_id = {}
doc_list = []
doc_len = {}
indexer = {}

stop_words = {'i', 'me', 'my', 'myself', 'we',
              'our', 'ours', 'ourselves', 'you',
              "you're", "you've", "you'll", "you'd",
              'your', 'yours', 'yourself', 'yourselves',
              'he', 'him', 'his', 'himself', 'she',
              "she's", 'her', 'hers', 'herself', 'it',
              "it's", 'its', 'itself', 'they', 'them',
              'their', 'theirs', 'themselves', 'what',
              'which', 'who', 'whom', 'this', 'that',
              "that'll", 'these', 'those', 'am', 'is',
              'are', 'was', 'were', 'be', 'been',
              'being', 'have', 'has', 'had', 'having',
              'do', 'does', 'did', 'doing', 'a', 'an',
              'the', 'and', 'but', 'if', 'or', 'because',
              'as', 'until', 'while', 'of', 'at', 'by',
              'for', 'with', 'about', 'against', 'between',
              'into', 'through', 'during', 'before', 'after',
              'above', 'below', 'to', 'from', 'up', 'down',
              'in', 'out', 'on', 'off', 'over', 'under',
              'again', 'further', 'then', 'once', 'here',
              'there', 'when', 'where', 'why', 'how',
              'all', 'any', 'both', 'each', 'few',
              'more', 'most', 'other', 'some', 'such',
              'no', 'nor', 'not', 'only', 'own',
              'same', 'so', 'than', 'too', 'very',
              's', 't', 'can', 'will', 'just', 'don',
              "don't", 'should', "should've", 'now', 'd',
              'll', 'm', 'o', 're', 've', 'y', 'ain',
              'aren', "aren't", 'couldn', "couldn't",
              'didn', "didn't", 'doesn', "doesn't", 'hadn',
              "hadn't", 'hasn', "hasn't", 'haven', "haven't",
              'isn', "isn't", 'ma', 'mightn', "mightn't",
              'mustn', "mustn't", 'needn', "needn't", 'shan',
              "shan't", 'shouldn', "shouldn't", 'wasn',
              "wasn't", 'weren', "weren't", 'won', "won't",
              'wouldn', "wouldn't"}
# stop_words = set(stopwords.words('english'))
stop_words.add('s')


In [40]:
# Some functions
import csv
from nltk import RegexpTokenizer
from math import log

import sys
# tokenize

def get_term_freq(textlines):
    title = textlines[0]
    text = textlines[7]
    title = title.lower()
    text = text.lower()
    # word_tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
    word_tokenizer = RegexpTokenizer('[A-Za-z]+')
    words_title = word_tokenizer.tokenize(title)
    term_set = set(words_title).difference(stop_words)
    term_freq = {}
    for w in words_title:
        if w in term_set:
            if w in term_freq:
                term_freq[w] = term_freq[w] + 1
            else:
                term_freq[w] = 1

    words_text = word_tokenizer.tokenize(text)
    term_set = set(words_text).difference(stop_words)
    for w in words_text:
        if w in term_set:
            if w in term_freq:
                term_freq[w] = term_freq[w] + 5
            else:
                term_freq[w] = 10
    return term_freq, len(words_title) + len(words_text) * 5


# todo: load data
def load():
    indexer.clear()
    doc_list.clear()
    doc_id.clear()
    doc_len.clear()

    with open(os.path.join(path_temp, fn_indexer), mode='r', encoding='utf-8') as f:
        f_reader = csv.reader(f)
        for row in f_reader:
            if len(row) > 1:
                rlst = []
                for tu in row[1:]:
                    rlst.append(tuple(eval(tu)))
                indexer[row[0]] = rlst

    with open(os.path.join(path_temp, fn_doc_id), mode='r', encoding='utf-8') as f:
        f_reader = csv.reader(f)
        for row in f_reader:
            doc_id[row[0]] = int(row[1])

    with open(os.path.join(path_temp, fn_doc_len), mode='r', encoding='utf-8') as f:
        f_reader = csv.reader(f)
        for row in f_reader:
            doc_len[int(row[0])] = int(row[1])

    with open(os.path.join(path_temp, fn_doc_list), mode='r', encoding='utf-8') as f:
        f_reader = csv.reader(f)
        for i in f_reader:
            doc_list.extend(i)


def weighted_tf(tf, d_len, avg_len):
    return 2 * tf / (0.25 + 0.75 * d_len / avg_len + tf)


def idf(df, n):
    if df == 0:
        return 0
    return log(n / df, 10)


def get_query_list(query):
    query = query.lower()
    # word_tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
    word_tokenizer = RegexpTokenizer('[A-Za-z]+')
    words = word_tokenizer.tokenize(query)
    inv_list = []
    for i in range(len(words)):
        if words[i] == 'not' and i + 1 < len(words):
            inv_list.append(words[i + 1])
            i = i + 1
    query = set(words).difference(stop_words)
    inv_list = set(inv_list).difference(stop_words)
    return query, inv_list


def search(query_list, inv_list):
    avg_doc_len = 0
    doc_num = len(doc_len)
    for dl in doc_len:
        avg_doc_len = avg_doc_len + dl / doc_num

    candidate = {}
    for k in query_list:
        if k not in indexer.keys():
            continue
        df = len(indexer[k])
        coe = 0
        if k in inv_list:
            coe = -0.5
        else:
            coe = 1

        for did, tf in indexer[k]:
            if did not in candidate.keys():
                candidate[did] = 0
            candidate[did] = candidate[did] + coe * weighted_tf(tf, doc_len[did], avg_doc_len) * idf(df, doc_num)

    candidate = sorted(candidate.items(), key= lambda x: (x[1]), reverse= True)
    return candidate[:10]

def output_info(doc, query_list):
    print('title: ' + doc[0][:-1])
    print('date : ' + doc[3])
#     todo: get paragraph


In [35]:
load()

In [41]:
# Query
# for v in indexer['russia']:
#     print(doc_list[v[0]][len(path_root):], v[1])
q = 'china and russia not japan'
#

ql, il = get_query_list(q)
answer = search(ql, il)
for fid, weight in answer:
    print('======================')
    print(str(fid) + ';' + doc_list[fid] + ';' + str(weight))
    with open(doc_list[fid], mode='r', encoding='utf-8') as f:
        text = f.readlines()
    output_info(text, ql)




111;C:\BCSpace\study\ThisSemester\IR\project\workspace\test_data\articles1\17412.csv;3.889987360256452
title: Russia Requires Apple and Google to Remove LinkedIn From Local App Stores - The New York Times
date : 2017-01-07

7992;C:\BCSpace\study\ThisSemester\IR\project\workspace\test_data\articles1\26728.csv;3.8864725395949575
title: Russia Vetoes U.N. Draft Resolution Condemning Chemical Attack in Syria, China Abstains
date : 2017-04-13

5690;C:\BCSpace\study\ThisSemester\IR\project\workspace\test_data\articles1\23699.csv;3.857743843547781
title: U.S. Women Jump, Spin and Soar to Gymnastics Gold - The New York Times
date : 2016-08-15

4370;C:\BCSpace\study\ThisSemester\IR\project\workspace\test_data\articles1\22132.csv;3.8530513540202884
title: China Launches 2 Astronauts on Its Longest Space Mission - The New York Times
date : 2016-10-17

7010;C:\BCSpace\study\ThisSemester\IR\project\workspace\test_data\articles1\25471.csv;3.851988254068541
title: Russia Prepares to Block LinkedIn Af