# Import Packages

In [1]:
import re
import json
import collections
import xml.dom.minidom
from stemming.porter2 import stem

# Read the XML File

In [2]:
# Parse the sample collection and keep a handle on its root element
dom = xml.dom.minidom.parse('trec.sample.xml')
rootdata = dom.documentElement

# Collect the per-document nodes; the three lists are paired up by index below
# e.g. DOCNO[0] -> 1, TEXT[0] -> "He likes to wink, he likes to drink"
docno_list = rootdata.getElementsByTagName('DOCNO')
headline_list = rootdata.getElementsByTagName('HEADLINE')
text_list = rootdata.getElementsByTagName('TEXT')

# num_text_dict: document number (int) -> "<headline> <text>", each part stripped
num_text_dict = {}
for i, text_node in enumerate(text_list):
    headline = str(headline_list[i].firstChild.data).strip()
    body = str(text_node.firstChild.data).strip()
    num_text_dict[int(docno_list[i].firstChild.data)] = headline + " " + body

num_text_dict

{1: "FT  14 MAY 91 / (CORRECTED) Jubilee of a jet that did what it was designed\nto do Correction (published 16th May 1991) appended to this article.\n'FRANK, it flies]' shouted someone at Sir Frank Whittle during the maiden\nflight of a British jet. 'Of course it does,' replied Sir Frank, who\npatented the first aircraft gas turbine. 'That's what it was bloody well\ndesigned to do, wasn't it?'\nExactly 50 years ago yesterday, the first British jet made a brief 17-minute\nflight from RAF Cranwell in Lincolnshire. To celebrate the event, Mr Eric\n'Winkle' Brown, a 72-year-old test pilot of the prototype Gloster Whittle\njet, Mr Geoffrey Bone, a 73-year-old engineer, and Mr Charles McClure, a\n75-year-old pilot, returned to RAF Cranwell. They are seen in front of a\nrestored Meteor NF 11. Sir Frank was unable to attend because of ill-health.\nThe Gloster Whittle was not the first jet to fly: a Heinkel 178 had its\nmaiden flight in August 1939, 21 months before the British aircraft.\nCorr

# Store the Word Position in every Document

In [3]:
# Build word_position_dict: document number -> [[position, stemmed term], ...]
# Positions are 1-based and are assigned AFTER stop words have been removed.
word_position_dict = {}

with open('englishST.txt', 'r') as eng:
    eng_str = eng.read()
eng_list = eng_str.split()  # stop words, in file order
eng_set = set(eng_list)     # same words as a set: O(1) membership test per token

# Raw string, so `\]` and `\n` reach the regex engine as written instead of
# relying on Python keeping an unrecognised string escape (`\]`) untouched.
r = r"""[0-9!#$%&'"()*+,-./:;<=>?@[\]^_`{|}~\n]"""
for key, value in num_text_dict.items():
    # Replace digits/punctuation/newlines by spaces, case-fold, tokenise
    no_list = re.sub(r, ' ', value).lower().split()

    # Stopping, then normalisation and stemming
    norm_list = [stem(token) for token in no_list if token not in eng_set]

    # Number the surviving terms from 1
    word_position_dict[key] = [[position, term] for position, term in enumerate(norm_list, start=1)]

word_position_dict

{1: [[1, 'ft'],
  [2, 'correct'],
  [3, 'jubile'],
  [4, 'jet'],
  [5, 'design'],
  [6, 'correct'],
  [7, 'publish'],
  [8, 'append'],
  [9, 'articl'],
  [10, 'frank'],
  [11, 'fli'],
  [12, 'shout'],
  [13, 'sir'],
  [14, 'frank'],
  [15, 'whittl'],
  [16, 'maiden'],
  [17, 'flight'],
  [18, 'british'],
  [19, 'jet'],
  [20, 'repli'],
  [21, 'sir'],
  [22, 'frank'],
  [23, 'patent'],
  [24, 'aircraft'],
  [25, 'gas'],
  [26, 'turbin'],
  [27, 'bloodi'],
  [28, 'design'],
  [29, 'wasn'],
  [30, 'year'],
  [31, 'ago'],
  [32, 'yesterday'],
  [33, 'british'],
  [34, 'jet'],
  [35, 'made'],
  [36, 'minut'],
  [37, 'flight'],
  [38, 'raf'],
  [39, 'cranwel'],
  [40, 'lincolnshir'],
  [41, 'celebr'],
  [42, 'event'],
  [43, 'mr'],
  [44, 'eric'],
  [45, 'winkl'],
  [46, 'brown'],
  [47, 'year'],
  [48, 'test'],
  [49, 'pilot'],
  [50, 'prototyp'],
  [51, 'gloster'],
  [52, 'whittl'],
  [53, 'jet'],
  [54, 'mr'],
  [55, 'geoffrey'],
  [56, 'bone'],
  [57, 'year'],
  [58, 'engin'],
  [59, 'mr

In [27]:
# Persist the per-document token positions as JSON.
# Mode 'w' already truncates an existing file, so no separate "clear" step is
# needed; the `with` block closes the handle even if the write fails.
# Note: json.dumps turns the int document numbers used as keys into strings.
js = json.dumps(word_position_dict)
with open('processed_doc.txt', 'w') as file:
    file.write(js)

# Count the Total Number of Documents (N)

In [4]:
# One key of word_position_dict per document, so the key count is N
book_number_list = list(word_position_dict)
print("N={}".format(len(book_number_list)))

N=1000


# 1 Generate Special Words List

In [5]:
# Vocabulary ("special words"): every distinct stemmed term in the collection,
# in sorted order. Collecting into a set first makes duplicate detection O(1)
# per token instead of scanning the growing list each time.
specialwords_list = sorted(
    {term for tokens in word_position_dict.values() for _, term in tokens}
)
specialwords_list

['aa',
 'aaf',
 'ab',
 'abalkin',
 'abandon',
 'abat',
 'abb',
 'abbado',
 'abbott',
 'abbrevi',
 'abc',
 'abci',
 'abduct',
 'abdul',
 'abel',
 'abela',
 'aberdeen',
 'abhor',
 'abid',
 'abidin',
 'abil',
 'abingdon',
 'abingworth',
 'abitibi',
 'abnorm',
 'aboard',
 'abol',
 'abolish',
 'abolit',
 'abolitionist',
 'abort',
 'abound',
 'abraham',
 'abram',
 'abreast',
 'abridg',
 'abroad',
 'abrupt',
 'absenc',
 'absent',
 'absente',
 'absentia',
 'absolut',
 'absorb',
 'absorpt',
 'abstain',
 'abstent',
 'abstract',
 'absurd',
 'abta',
 'abtex',
 'abund',
 'abus',
 'ac',
 'academ',
 'academi',
 'academia',
 'acceler',
 'accent',
 'accentu',
 'accept',
 'access',
 'accessori',
 'accid',
 'accident',
 'acclaim',
 'acco',
 'accommod',
 'accompani',
 'accomplic',
 'accomplish',
 'accor',
 'accord',
 'accordion',
 'account',
 'accret',
 'accru',
 'accumul',
 'accur',
 'accuraci',
 'accus',
 'accustom',
 'ace',
 'achiev',
 'achill',
 'acid',
 'ackner',
 'acknowledg',
 'acm',
 'acor',
 'aco

# 2 Term -> Position

In [6]:
# Invert the per-document listing into per-term postings:
# {'Term1': [[doc1, pos1], [doc2, pos2], [doc3, pos3]],
#  'Term2': [[doc1, pos1], [doc2, pos2], [doc3, pos3]]}
# Terms are inserted in vocabulary (sorted) order; postings follow the
# iteration order of word_position_dict.
term_position_dict = {term: [] for term in specialwords_list}

for doc_no, tokens in word_position_dict.items():
    for position, term in tokens:
        term_position_dict[term].append([doc_no, position])

term_position_dict

{'aa': [[245, 50]],
 'aaf': [[351, 828], [351, 1296]],
 'ab': [[19, 40]],
 'abalkin': [[112, 102]],
 'abandon': [[12, 145],
  [31, 90],
  [48, 116],
  [74, 7],
  [140, 568],
  [151, 99],
  [223, 139],
  [307, 300],
  [323, 180],
  [354, 82],
  [3363, 75],
  [3381, 121],
  [3407, 476],
  [3410, 73],
  [3448, 455],
  [3459, 375],
  [3504, 146],
  [3597, 94],
  [3637, 213],
  [3718, 65],
  [3783, 159],
  [3867, 284],
  [3932, 254]],
 'abat': [[3926, 184]],
 'abb': [[129, 141],
  [129, 314],
  [129, 324],
  [129, 343],
  [129, 357],
  [129, 370],
  [129, 408],
  [129, 413],
  [129, 433],
  [129, 530],
  [129, 561]],
 'abbado': [[263, 133]],
 'abbott': [[17, 65],
  [17, 116],
  [17, 142],
  [348, 75],
  [351, 1054],
  [3693, 864]],
 'abbrevi': [[3442, 53]],
 'abc': [[3337, 114],
  [3708, 8],
  [3708, 24],
  [3708, 41],
  [3708, 88],
  [3818, 9],
  [3818, 22],
  [3818, 40],
  [3818, 88],
  [3818, 110],
  [3818, 131],
  [3818, 168]],
 'abci': [[3369, 494]],
 'abduct': [[272, 142]],
 'abdul': 

# 2 Output Dictionary to .txt File

In [7]:
# Write the term -> [[doc, pos], ...] postings as JSON.
# Opening with 'w' truncates any previous content, and `with` guarantees the
# handle is closed even if the write raises.
js = json.dumps(term_position_dict)
with open('index_ts_pos.txt', 'w') as file:
    file.write(js)

# 2 Term -> (Document -> Position)

In [8]:
# Regroup each term's flat postings by document:
# {'Term1': {doc1: [position1, position2], doc2: [position1, position2]},
#  'Term2': {doc1: [position1, position2, position3], doc2: [position1, position2]}}
term_doc_pos_dict = {}

for key, value in term_position_dict.items():
    each_doc_pos_dict = {}
    for doc_no, position in value:
        # setdefault keeps one list per document, so this works for an empty
        # posting list and does not depend on a document's entries being
        # adjacent in `value`.
        each_doc_pos_dict.setdefault(doc_no, []).append(position)
    term_doc_pos_dict[key] = each_doc_pos_dict

term_doc_pos_dict

{'aa': {245: [50]},
 'aaf': {351: [828, 1296]},
 'ab': {19: [40]},
 'abalkin': {112: [102]},
 'abandon': {12: [145],
  31: [90],
  48: [116],
  74: [7],
  140: [568],
  151: [99],
  223: [139],
  307: [300],
  323: [180],
  354: [82],
  3363: [75],
  3381: [121],
  3407: [476],
  3410: [73],
  3448: [455],
  3459: [375],
  3504: [146],
  3597: [94],
  3637: [213],
  3718: [65],
  3783: [159],
  3867: [284],
  3932: [254]},
 'abat': {3926: [184]},
 'abb': {129: [141, 314, 324, 343, 357, 370, 408, 413, 433, 530, 561]},
 'abbado': {263: [133]},
 'abbott': {17: [65, 116, 142], 348: [75], 351: [1054], 3693: [864]},
 'abbrevi': {3442: [53]},
 'abc': {3337: [114],
  3708: [8, 24, 41, 88],
  3818: [9, 22, 40, 88, 110, 131, 168]},
 'abci': {3369: [494]},
 'abduct': {272: [142]},
 'abdul': {261: [42]},
 'abel': {264: [343]},
 'abela': {3401: [208]},
 'aberdeen': {141: [839], 3327: [97], 3330: [44, 63], 3828: [90]},
 'abhor': {48: [101]},
 'abid': {3407: [177], 3643: [301]},
 'abidin': {3939: [41

# 2 Output Dictionary to .txt File

In [9]:
# Write the term -> {doc: [positions]} index as JSON.
# 'w' truncates on open; `with` closes the handle even if the write fails.
# Note: json.dumps converts the int document numbers used as keys to strings.
js = json.dumps(term_doc_pos_dict)
with open('index_ts.txt', 'w') as file:
    file.write(js)

# 3 Input from .txt File into dictionary

In [10]:
# Term -> (Document -> Position)
# Reload the index from disk. JSON object keys are always strings, so the
# document numbers come back as str here; word_search casts them with int().
# `with` closes the file even if decoding fails.
with open('index_ts.txt', 'r') as file:
    js = file.read()
term_doc_pos_dict = json.loads(js)

# term_doc_pos_dict

In [11]:
# Term -> Position
# Reload the flat postings from disk. Each posting is a JSON array
# [doc, pos], so both numbers are decoded as ints.
# `with` closes the file even if decoding fails.
with open('index_ts_pos.txt', 'r') as file:
    js = file.read()
term_position_dict = json.loads(js)

# term_position_dict

# 4 Output index.txt

In [12]:
# input term_position_dict
# {'Term1': [[doc1, pos1], [doc2, pos2], [doc3, pos3]],
#  'Term2': [[doc1, pos1], [doc2, pos2], [doc3, pos3]]}
# input term_doc_pos_dict (only its per-term document count is used)
# {'Term1': {doc1: [position1, position2], doc2: [position1, position2]},
#  'Term2': {doc1: [position1, position2, position3], doc2: [position1, position2]}}
# output to index.txt, one entry per term:
#   term:document_frequency
#   <TAB>doc: pos, pos, ...
#
# 'w' truncates the file on open; `with` closes it even if a write fails.
with open('index.txt', 'w') as doc:
    for key, value in term_position_dict.items():
        print(key + ":" + str(len(term_doc_pos_dict[key])), file=doc)

        # Gather the positions of each document onto a single output line
        positions_by_doc = {}
        for doc_no, position in value:
            positions_by_doc.setdefault(doc_no, []).append(str(position))

        for doc_no, positions in positions_by_doc.items():
            print("\t" + str(doc_no) + ": " + ", ".join(positions), file=doc)

# 5 Search Modules

In [13]:
# term_doc_pos_dict:
# {'Term1': {doc1: [position1, position2], doc2: [position1, position2]},
#  'Term2': {doc1: [position1, position2, position3], doc2: [position1, position2]}}
# term_position_dict
# {'Term1': [[doc1, pos1], [doc2, pos2], [doc3, pos3]],
#  'Term2': [[doc1, pos1], [doc2, pos2], [doc3, pos3]]}

# search one word
def word_search(w_s, search_dict):
    """Return the document numbers that contain a single query word.

    Args:
        w_s: The query word; it is lower-cased and stemmed before lookup.
        search_dict: Index of the form term -> {document: [positions]}.
            Document keys may be str (after a JSON round trip) or int.

    Returns:
        List of document numbers as ints, in the index's own order.
        An empty list if the term is not in the index.
    """
    term = stem(w_s.lower())
    # .get: a word that never occurs simply matches no documents
    return [int(doc_no) for doc_no in search_dict.get(term, {})]

# Smoke check: number of documents returned for the single-word query "condemning"
len(word_search("condemning", term_doc_pos_dict))

6

In [14]:
# input term_position_dict
# {'Term1': [[doc1, pos1], [doc2, pos2], [doc3, pos3]],
#  'Term2': [[doc1, pos1], [doc2, pos2], [doc3, pos3]]}
def link_search(s, search_dict):
    """Return the documents in which two words occur as an adjacent phrase.

    The second word must sit at exactly the position after the first one.
    Only the first two words of ``s`` are used. A one-word phrase matches
    every document containing that word.

    Args:
        s: Phrase text; double quotes are stripped, words are lower-cased
            and stemmed.
        search_dict: Index of the form term -> [[document, position], ...].

    Returns:
        Sorted list of distinct document numbers; empty if there is no match
        or if either word is not in the index.
    """
    words = s.replace('"', '').split()
    if not words:
        return []

    w1_pos_list = search_dict.get(stem(words[0].lower()), [])
    if len(words) == 1:
        return sorted({doc_no for doc_no, _ in w1_pos_list})
    w2_pos_list = search_dict.get(stem(words[1].lower()), [])

    # Every (document, position) slot directly after an occurrence of word 1.
    # Checking word 2's postings against this set examines ALL postings,
    # including the last one of each list.
    follow_slots = {(doc_no, position + 1) for doc_no, position in w1_pos_list}
    return sorted({doc_no for doc_no, position in w2_pos_list
                   if (doc_no, position) in follow_slots})

# Smoke check: number of documents matching the phrase "income taxes"
len(link_search("income taxes", term_position_dict))

12

In [15]:
# search not word or link words
def not_search(a, search_dict, term_position_dict):
    """Return every document that does NOT match the operand of ``NOT <x>``.

    The operand is whatever follows the first standalone ``NOT`` token; it may
    be a single word or a double-quoted phrase. If ``a`` has no ``NOT`` token
    the whole string is treated as the operand.

    Args:
        a: Query text such as ``NOT taxes`` or ``NOT "income taxes"``.
        search_dict: Term -> {document: [positions]} index (for word_search).
        term_position_dict: Term -> [[document, position], ...] index
            (for link_search).

    Returns:
        The entries of the module-level ``book_number_list`` that are not in
        the operand's result, in ``book_number_list`` order.
    """
    tokens = a.split()
    # Look for NOT as a whole token, so a word that merely contains the
    # letters "NOT" is not cut apart.
    if "NOT" in tokens:
        tokens = tokens[tokens.index("NOT") + 1:]
    operand = ' '.join(tokens)

    if '"' in operand:  # quoted -> phrase search
        excluded = set(link_search(operand.replace('"', ' '), term_position_dict))
    else:  # single word
        excluded = set(word_search(operand, search_dict))

    return [doc_no for doc_no in book_number_list if doc_no not in excluded]


# Smoke check: number of documents that do not contain the phrase "income taxes"
len(not_search('NOT "income taxes"', term_doc_pos_dict, term_position_dict))

988

In [16]:
# input term_position_dict
# {'Term1': [[doc1, pos1], [doc2, pos2], [doc3, pos3]],
#  'Term2': [[doc1, pos1], [doc2, pos2], [doc3, pos3]]}
def distance_search(s, search_dict):
    """Return documents where two terms occur within N positions of each other.

    The order of the two terms does not matter: a document matches when the
    absolute difference of some pair of positions is at most N.

    Args:
        s: Proximity query of the form ``#N(term1, term2)``.
        search_dict: Index of the form term -> [[document, position], ...].
            NOTE(review): the merge below assumes each posting list is ordered
            by document number, then position - confirm for the collection.

    Returns:
        Sorted list of distinct matching document numbers; empty if either
        term is not in the index.

    Raises:
        ValueError: If the query does not contain a distance and two terms,
            or the distance is not an integer.
    """
    # Raw string: the backslashes are regex escapes, not Python string escapes.
    r = r"""[!#$%&'"()*+,-./:;<=>?@[\]^_`{|}~\n]"""
    s_list = re.sub(r, ' ', s).split()  # "#10(income, taxes)" -> ['10', 'income', 'taxes']
    if len(s_list) < 3:
        raise ValueError("distance_search expects '#N(term1, term2)', got: " + repr(s))

    distance = int(s_list[0])
    w1_pos_list = search_dict.get(stem(s_list[1].lower()), [])
    w2_pos_list = search_dict.get(stem(s_list[2].lower()), [])

    distance_docs = set()
    i = 0
    j = 0
    # Two-pointer merge over both posting lists. The bounds are the full list
    # lengths so that the final posting of each list is examined too.
    while i < len(w1_pos_list) and j < len(w2_pos_list):
        doc1, pos1 = w1_pos_list[i]
        doc2, pos2 = w2_pos_list[j]
        if doc1 < doc2:
            i += 1
        elif doc1 > doc2:
            j += 1
        else:  # both postings are in the same document
            gap = pos2 - pos1
            if abs(gap) <= distance:
                distance_docs.add(doc1)
                i += 1
                j += 1
            elif gap > distance:  # term 1 is too far behind: move it forward
                i += 1
            else:                 # term 2 is too far behind: move it forward
                j += 1

    return sorted(distance_docs)

# Smoke check: number of documents with "income" and "taxes" at most 10 positions apart
len(distance_search("#10(income, taxes)", term_position_dict))

24

In [17]:
# input term_doc_pos_dict
# {'Term1': {doc1: [position1, position2], doc2: [position1, position2]},
#  'Term2': {doc1: [position1, position2, position3], doc2: [position1, position2]}}
def or_search(s, search_dict, term_position_dict):
    """Return the sorted documents matching either operand of ``<a> OR <b>``.

    Each operand may be a single word, a double-quoted phrase, or either of
    those prefixed with ``NOT``.

    Args:
        s: Query string containing the standalone token ``OR``.
        search_dict: Term -> {document: [positions]} index, passed on to
            word_search / not_search.
        term_position_dict: Term -> [[document, position], ...] index, passed
            on to link_search / not_search.

    Returns:
        Sorted list of distinct document numbers found by either operand.

    Raises:
        ValueError: If ``s`` has no standalone ``OR`` token.
    """
    tokens = s.split()
    if 'OR' not in tokens:
        raise ValueError("or_search expects '<a> OR <b>', got: " + repr(s))
    # Split at the last standalone OR token
    or_pos = len(tokens) - 1 - tokens[::-1].index('OR')

    def operand_docs(expr):
        """Evaluate one side of the OR with the matching search function."""
        if expr.split()[:1] == ['NOT']:
            return not_search(expr, search_dict, term_position_dict)
        if '"' in expr:  # quoted -> phrase search
            return link_search(expr.replace('"', ' '), term_position_dict)
        return word_search(expr, search_dict)

    a_key_list = operand_docs(' '.join(tokens[:or_pos]))
    b_key_list = operand_docs(' '.join(tokens[or_pos + 1:]))

    # Union via sets: neither operand's result list is modified
    return sorted(set(a_key_list) | set(b_key_list))

# Smoke check: number of documents containing "income" or "taxes"
len(or_search("income OR taxes", term_doc_pos_dict, term_position_dict))

235

In [18]:
# Scratch check: locate the operator as a whole token, so the letters "AND"
# inside "SCOTLAND" are not mistaken for it.
s = 'Edinburgh AND SCOTLAND'
w_list = s.split(' ')
for i, token in enumerate(w_list):
    if token == 'AND':
        and_pos = i
a = ' '.join(w_list[:and_pos])
b = ' '.join(w_list[and_pos+1:])
print(a, b, sep="\n")

Edinburgh
SCOTLAND


In [19]:
# input term_doc_pos_dict
# {'Term1': {doc1: [position1, position2], doc2: [position1, position2]},
#  'Term2': {doc1: [position1, position2, position3], doc2: [position1, position2]}}
def and_search(s, search_dict, term_position_dict):
    """Return the sorted documents matching both operands of ``<a> AND <b>``.

    Each operand may be a single word, a double-quoted phrase, or either of
    those prefixed with ``NOT``.

    Args:
        s: Query string containing the standalone token ``AND``.
        search_dict: Term -> {document: [positions]} index, passed on to
            word_search / not_search.
        term_position_dict: Term -> [[document, position], ...] index, passed
            on to link_search / not_search.

    Returns:
        Sorted list of document numbers present in both operand results.

    Raises:
        ValueError: If ``s`` has no standalone ``AND`` token.
    """
    tokens = s.split()
    if 'AND' not in tokens:
        raise ValueError("and_search expects '<a> AND <b>', got: " + repr(s))
    # Split at the last standalone AND token (a word such as "SCOTLAND" that
    # only contains the letters is left alone)
    and_pos = len(tokens) - 1 - tokens[::-1].index('AND')

    def operand_docs(expr):
        """Evaluate one side of the AND with the matching search function."""
        if expr.split()[:1] == ['NOT']:
            return not_search(expr, search_dict, term_position_dict)
        if '"' in expr:  # quoted -> phrase search
            return link_search(expr.replace('"', ' '), term_position_dict)
        return word_search(expr, search_dict)

    a_key_list = operand_docs(' '.join(tokens[:and_pos]))
    b_key_set = set(operand_docs(' '.join(tokens[and_pos + 1:])))

    # get AND list: documents of the left operand that the right one also has
    return sorted(doc_no for doc_no in a_key_list if doc_no in b_key_set)

# Smoke check: the right operand contains the letters "AND" but is a plain word
len(and_search("income AND SCOTLAND", term_doc_pos_dict, term_position_dict))

5

In [20]:
# my search module
def search(s, term_doc_pos_dict, term_position_dict):
    """Dispatch a query string to the matching search function.

    Operators are recognised as whole whitespace-separated tokens, so words
    that merely contain the letters (e.g. "SCOTLAND", "ORANGE") are treated as
    ordinary terms. Precedence: AND, OR, ``#`` proximity, NOT, phrase, word.

    Args:
        s: The raw query text.
        term_doc_pos_dict: Term -> {document: [positions]} index.
        term_position_dict: Term -> [[document, position], ...] index.

    Returns:
        The list of document numbers produced by the selected search function;
        an empty list for an empty query.
    """
    tokens = s.split()
    if not tokens:
        return []

    if "AND" in tokens:
        return and_search(s, term_doc_pos_dict, term_position_dict)
    if "OR" in tokens:
        return or_search(s, term_doc_pos_dict, term_position_dict)
    if "#" in s:
        return distance_search(s, term_position_dict)
    if "NOT" in tokens:
        return not_search(s, term_doc_pos_dict, term_position_dict)
    if len(tokens) > 1:
        return link_search(s, term_position_dict)
    # Single word: drop stray double quotes so '"word"' looks up 'word'
    return word_search(s.replace('"', '').strip(), term_doc_pos_dict)

# Smoke check: an unquoted two-word query with no operator goes to the phrase search
s = "income taxes"
len(search(s, term_doc_pos_dict, term_position_dict))

12

# 6 Read queries.lab2.txt & Write results.boolean.txt

In [22]:
# Read the query file; each non-blank line is "<query id> <query text>".
# The `with` block closes the file, so no explicit close() is needed.
with open('queries.lab2.txt', 'r') as f:
    line_list = f.readlines()

# query_dict: query id (str) -> query text, with whitespace runs collapsed
query_dict = {}
for line in line_list:
    fields = line.split()
    if not fields:  # blank line: nothing to parse
        continue
    query_dict[fields[0]] = ' '.join(fields[1:])

query_dict

{'1': 'Scotland',
 '2': 'Window',
 '3': 'replacing',
 '4': 'condemning',
 '5': 'income OR taxes',
 '6': 'income AND NOT taxes',
 '7': '"income taxes"',
 '8': '#10(income, taxes)',
 '9': '"middle east" AND peace'}

In [23]:
# get results.boolean dict
# boolean_dict = {query id (int): [matching document numbers]}
boolean_dict = {}

# Iterate over the ids actually present in query_dict rather than assuming
# they are exactly "1".."n"; a gap in the numbering no longer raises KeyError.
for query_id, query in query_dict.items():
    matches = search(query, term_doc_pos_dict, term_position_dict)
    boolean_dict[int(query_id)] = matches
    print(len(matches))

boolean_dict

25
16
68
6
235
48
12
24
8


{1: [16,
  94,
  96,
  143,
  272,
  351,
  370,
  3330,
  3334,
  3336,
  3338,
  3376,
  3485,
  3504,
  3505,
  3532,
  3533,
  3561,
  3565,
  3629,
  3654,
  3820,
  3826,
  3932,
  3938],
 2: [30,
  55,
  59,
  145,
  160,
  272,
  374,
  3463,
  3639,
  3782,
  3788,
  3906,
  3909,
  3910,
  3917,
  3930],
 3: [16,
  28,
  31,
  51,
  92,
  97,
  131,
  135,
  136,
  138,
  140,
  169,
  195,
  197,
  218,
  242,
  256,
  274,
  276,
  325,
  3325,
  3342,
  3361,
  3370,
  3387,
  3397,
  3403,
  3410,
  3411,
  3413,
  3418,
  3437,
  3438,
  3461,
  3494,
  3504,
  3521,
  3531,
  3532,
  3533,
  3561,
  3598,
  3623,
  3654,
  3660,
  3662,
  3668,
  3693,
  3700,
  3716,
  3720,
  3724,
  3734,
  3745,
  3788,
  3805,
  3828,
  3867,
  3869,
  3876,
  3886,
  3895,
  3896,
  3907,
  3918,
  3921,
  3930,
  3935],
 4: [113, 309, 354, 3374, 3674, 3678],
 5: [3,
  14,
  16,
  23,
  24,
  32,
  33,
  34,
  39,
  41,
  42,
  43,
  46,
  65,
  92,
  106,
  112,
  113,
  125,
  1

In [24]:
# Output to results.boolean.txt, one "query_id,document_number" pair per line.
# 'w' truncates the file on open; `with` closes it even if a write fails.
with open('results.boolean.txt', 'w') as doc:
    for key, value in boolean_dict.items():
        for doc_no in value:
            print(str(key) + "," + str(doc_no), file=doc)

In [26]:
# Save the query results as JSON.
# 'w' truncates on open; `with` closes the handle even if the write fails.
# Note: json.dumps converts the int query ids used as keys to strings.
js = json.dumps(boolean_dict)
with open('boolean_dict.txt', 'w') as file:
    file.write(js)