In [1]:
import math
import os
import re
import string
import wikipediaapi

from googletrans import Translator
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

In [2]:
def read_queries(filename):
    '''
    Read the query text rows of a TREC-style query file.

    Every row that does not start with '<' is treated as query text; rows
    that do start with '<' (markup such as <top> or <num>) are skipped.
    Rows are returned exactly as read, trailing newline included.

    in : 'file.trec'
    out: ['asd fgh\n', 'qwe rty\n', ...]
    '''
    # The context manager closes the handle even when reading raises.
    with open(filename, 'r') as f:
        return [row for row in f if not row.startswith('<')]

In [3]:
def translate_queries(queries):
    '''
    Translate a batch of queries from 'id' (Indonesian) to 'en' (English)
    with googletrans, using the translate.google.com endpoint.

    in : ['asd fgh', 'qwe rty', ...]
    out: list holding the .text of every result the translator returned
    '''
    google = Translator(service_urls=['translate.google.com'])
    results = google.translate(queries, src='id', dest='en')
    texts = []
    for result in results:
        texts.append(result.text)
    return texts

In [4]:
def write_query_trec_format(queries, filename):
    '''
    Write queries to a TREC topic file, reusing the query numbers found in
    the original file.

    The output path is `filename` with every '.' replaced by '-eng.'. This is
    the same rule outputfile_translate applies, so both resolve to the same
    path; note that dots in directory names are rewritten as well.

    Queries are paired, in order, with the numbers found on the rows of
    `filename` that start with '<num>'; pairing stops at the shorter of the
    two sequences. The output file is always (re)created, even when there is
    nothing to write.

    in : queries ['asd fgh', ...], filename 'file.trec'
    out: None (writes 'file-eng.trec')
    '''
    outfile = filename.replace('.', '-eng.')

    # Collect the number carried by every <num> row, in file order.
    with open(filename, 'r') as source:
        index = [re.search(r"\d+(\.\d+)?", row).group(0)
                 for row in source if row.startswith('<num>')]

    # Mode 'w' truncates a stale file from an earlier run, which replaces the
    # old remove-then-append sequence and opens the file only once.
    with open(outfile, 'w') as target:
        for num, text in zip(index, queries):
            target.write('<top>\n')
            target.write('<num>{}</num><title>\n'.format(num))
            target.write(text)
            target.write('\n</title>\n')
            target.write('</top>\n')

In [5]:
def translate_process(filename):
    '''
    Read the queries of `filename`, translate them and write the translated
    queries back out in TREC format.

    in : 'file.trec'
    out: None
    '''
    write_query_trec_format(translate_queries(read_queries(filename)), filename)

In [6]:
def outputfile_translate(filename):
    '''
    Name of the translated query file: every '.' in `filename` becomes '-eng.'.

    in : 'file.trec'
    out: 'file-eng.trec'
    '''
    pieces = filename.split('.')
    return '-eng.'.join(pieces)

In [7]:
def read_queries_in_token(filename):
    '''
    Read the query rows of a TREC-style file and tokenize each of them.

    Rows starting with '<' are skipped; every other row is passed to
    word_tokenize and contributes one token list to the result.

    in : 'file'
    out: [['asd', 'fgh'], ['qwe', 'rty']]
    '''
    # The context manager closes the handle even when tokenizing raises.
    with open(filename, 'r') as f:
        return [word_tokenize(row) for row in f if not row.startswith('<')]

In [8]:
def outputfile_filtered(filename):
    '''
    Name of the preprocessed query file: every '.' in `filename` becomes
    '-filtered.'.

    in : 'file-eng.trec'
    out: 'file-eng-filtered.trec'
    '''
    pieces = filename.split('.')
    return '-filtered.'.join(pieces)

In [9]:
def preprocess_queries(fname):
    '''
    Normalise a TREC query file and write the result to the path returned by
    outputfile_filtered(fname).

    The whole file is read, every ',', '.' and '?' is removed, the text is
    lower-cased and split on whitespace. Each token is lemmatized with
    WordNetLemmatizer and dropped when its lemma is an English stop word.
    Lemmas ending in '>' (markup tokens) end their output line; a token that
    starts with '</title' additionally gets a newline in front of it so the
    query words stay on a line of their own. All other lemmas are written
    followed by a single space.

    in : file containing e.g. 'asd sdf(stopword) dfg' inside a <top> entry
    out: None; the output file contains
    <top>
    <num>X</num><title>
    asd dfg
    </title>
    </top>
    '''
    with open(fname, 'r') as source:
        queries = source.read()
    for mark in (',', '.', '?'):
        queries = queries.replace(mark, '')
    terms = queries.lower().split()

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Open the output once in 'w' mode: this truncates a stale file from an
    # earlier run and avoids reopening the file for every single token. The
    # file is created even when every token was filtered out.
    with open(outputfile_filtered(fname), 'w') as target:
        for t in terms:
            lemma = lemmatizer.lemmatize(t)
            if lemma in stop_words:
                continue
            if not lemma.endswith('>'):
                target.write(lemma + ' ')
            elif t.startswith('</title'):
                # Close the line of query words before the closing tag.
                target.write('\n' + lemma + '\n')
            else:
                target.write(lemma + '\n')

In [10]:
'''
Deprecated
'''
def wordnet_expand(list_sentence):
    '''
    Collect up to three previously unseen WordNet lemma names per input word.

    Each word, each of its synsets and each lemma is printed while walking
    the WordNet results, followed by an empty line per word.

    in : ['asd', 'fgh']
    out: {'lemma_name': 1, ...}
    '''
    synonyms = {}
    for word in list_sentence:
        added = 0
        print(word)
        for synset in wordnet.synsets(word):
            print(synset)
            for lemma in synset.lemmas():
                print(lemma)
                if added >= 3:
                    continue
                name = lemma.name()
                if name in synonyms:
                    continue
                synonyms[name] = 1
                added += 1
        print()
    return synonyms

In [11]:
def generate_term(list_sentence):
    '''
    Build weighted candidate expansion terms for the words of one query.

    For every word, up to three new terms are taken from the WordNet lemma
    names of its synsets (underscores replaced by spaces). A term that is
    already present, whether from this word or an earlier one, is skipped
    and keeps its first weight. The weight is the number of times the term
    occurs as a substring of the cleaned text of the word's Wikipedia page,
    or 1 when the page does not exist or the term does not occur in it.

    in : ['asd', 'fgh', 'kjl']
    out: {'term': weight, 'other term': weight, ...}
    '''
    wiki = wikipediaapi.Wikipedia('en')
    synonyms = {}
    for x in list_sentence:
        page_wiki = wiki.page(x)
        # Ask once per word instead of once per lemma.
        has_page = page_wiki.exists()
        page_summary = clean_text(page_wiki.text) if has_page else ''
        count = 0
        for syn in wordnet.synsets(x):
            for l in syn.lemmas():
                if count >= 3:
                    break
                term = l.name().replace('_', ' ')
                # Compare the stored form (spaces, not underscores); checking
                # the raw lemma name never matched multi-word terms, so they
                # were re-added and used up this word's three slots.
                if term in synonyms:
                    continue
                weight = page_summary.count(term) if has_page else 0
                synonyms[term] = weight if weight > 0 else 1
                count += 1
            if count >= 3:
                break
    return synonyms

In [12]:
def count_weight_tfidf(candidate_terms, page_collection):
    """
    Scale each candidate weight by an inverse-document-frequency factor.

    For a term found as a whole word in `occurrence` of the N pages, the
    result is weight * (log(N / (1 + occurrence)) + 1). With an empty page
    collection there is nothing to compute a frequency from (and log(0)
    would raise), so the factor is 1 and the weight is passed through.

    Param: candidate_terms {term: weight, ...}
           page_collection list of page texts
    Return: [(term, scaled_weight), ...] in the iteration order of
            candidate_terms
    """
    term_tfidf = []
    N = len(page_collection)
    for term, weight in candidate_terms.items():
        # Escape the term: it is literal text, and characters such as '+'
        # or '(' would otherwise be read as regex syntax.
        pattern = re.compile(r'\b' + re.escape(term) + r'\b')
        occurrence = sum(1 for page in page_collection if pattern.search(page))
        idf = math.log(N / (1 + occurrence)) + 1 if N > 0 else 1.0
        term_tfidf.append((term, weight * idf))
    return term_tfidf

In [13]:
def wiki_expand(query):
    '''
    Collect, for every token of `query` that has a Wikipedia page, the titles
    that appear both in that page's links and in its backlinks.

    The per-token intersection is printed as it is found and 'DONE' is
    printed at the end.

    in : 'asd fgh'
    out: set of page titles
    '''
    wiki = wikipediaapi.Wikipedia('en')
    tokens = word_tokenize(query)
    # Must be a set: the old dict accumulator was fed a set of titles through
    # dict.update(), which raises ValueError for any title that is not
    # exactly two characters long.
    page_intersect = set()
    for token in tokens:
        page_wiki = wiki.page(token)
        if page_wiki.exists():
            temp = page_wiki.links.keys() & page_wiki.backlinks.keys()
            print(temp)
            page_intersect.update(temp)
    print('DONE')
    return page_intersect

In [14]:
def clean_text(text):
    '''
    Strip punctuation, lower-case, tokenize and drop English stop words.

    in : 'Some, raw Text!'
    out: the remaining tokens joined by single spaces
    '''
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized = text.translate(punctuation_remover).lower()
    tokens = word_tokenize(normalized)
    english_stop_words = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [15]:
def get_page_collection(query):
    '''
    Fetch the Wikipedia page of every entry in `query` and return the cleaned
    text of the pages that exist, in input order.

    in : [['asd', 'fgh'], ['qwe', 'rty']]
    out: ['das asd', 'sda sad']

    NOTE(review): each entry is handed to wiki.page() as-is. The callers in
    this notebook pass token lists rather than single titles; what the
    Wikipedia client does with a list title is not visible here - confirm.
    '''
    wiki = wikipediaapi.Wikipedia('en')
    # Lazy generator: pages are still requested one at a time, in order.
    candidate_pages = (wiki.page(term) for term in query)
    return [clean_text(page.text) for page in candidate_pages if page.exists()]

In [16]:
def select_expand_term(candidate_term, original_term, n_terms):
    '''
    Append the `n_terms` highest-weighted candidate terms to the query.

    Candidates are ranked by weight, highest first (ties keep their input
    order). Terms already present in the query are skipped and do not count
    towards `n_terms`. `original_term` itself is not modified.

    in : candidate_term [('asd', 1), ('fgh', 2)], original_term ['term1', 'term2']
    out: ['term1', 'term2', 'fgh', ...]
    '''
    # Walk the *ranked* list; the ranking used to be computed and then
    # ignored, so terms were picked in arbitrary input order.
    ranked = sorted(candidate_term, key=lambda tup: tup[1], reverse=True)
    new_term = original_term[:]
    added = 0
    for entry in ranked:
        if added >= n_terms:
            break
        term = entry[0]
        if term in new_term:
            continue
        new_term.append(term)
        added += 1
    return new_term

In [17]:
def write_expanded_query_trec_format(queries, filename):
    '''
    Write expanded queries to `filename` as TREC topics.

    Queries are numbered 1, 2, 3, ... in list order and the terms of each
    query are joined with single spaces. Any existing file is replaced; the
    file is created even when `queries` is empty.

    in : queries [['asd', 'fgh'], ['qwe']], filename 'file.trec'
    out: None
    '''
    # Mode 'w' truncates a previous result and lets the file be opened once
    # instead of once per query.
    with open(filename, 'w') as f:
        for num, query in enumerate(queries, start=1):
            f.write('<top>\n')
            f.write('<num>{}</num><title>\n'.format(num))
            f.write(' '.join(query))
            f.write('\n</title>\n')
            f.write('</top>\n')

In [18]:
def outputfile_expand(filename, n_terms):
    '''
    Name of the expanded query file: every '.' in `filename` becomes
    '-expand-<n_terms>.'.

    in : 'file-eng-filtered.trec', 5
    out: 'file-eng-filtered-expand-5.trec'
    '''
    marker = '-expand-' + str(n_terms) + '.'
    return marker.join(filename.split('.'))

In [41]:
def run_expand(fname, n_terms=15):
    '''
    Run the whole pipeline on one query file: translate, preprocess, expand
    every query and write the expanded queries in TREC format.

    `n_terms` is both the number of expansion terms requested per query and
    the number embedded in the output file name. These used to disagree (the
    file was labelled 15 while only 5 terms were requested); they now share
    one value.

    Progress is printed after each stage and after every tenth query.

    in : fname 'file.trec', n_terms number of terms to add per query
    out: None
    '''
    translate_process(fname)
    print('--- finish translate')
    translated_file = outputfile_translate(fname)
    preprocess_queries(translated_file)
    filtered_file = outputfile_filtered(translated_file)
    print('--- finish preprocess')
    queries = read_queries_in_token(filtered_file)
    page_collection = get_page_collection(queries)
    expanded_query = []
    for query_no, query in enumerate(queries, start=1):
        candidate_term = generate_term(query)
        term_weight = count_weight_tfidf(candidate_term, page_collection)
        expanded_query.append(select_expand_term(term_weight, query, n_terms))
        if query_no % 10 == 0:
            print('--- finish {}'.format(query_no))
    expanded_file = outputfile_expand(filtered_file, n_terms)
    write_expanded_query_trec_format(expanded_query, expanded_file)

In [42]:
# Full pipeline (translate -> preprocess -> expand) on the cran-ta query file.
# NOTE(review): the recorded output below stops after query 70 with a
# ReadTimeout from en.wikipedia.org, so this run did not finish.
run_expand('collection/cran-ta/cran-qry-indo.trec')

--- finish translate
--- finish preprocess
--- finish 10
--- finish 20
--- finish 30
--- finish 40
--- finish 50
--- finish 60
--- finish 70


ReadTimeout: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)

In [36]:
def run_expand_without_translate(fname):
    '''
    Preprocess and expand the queries of an already translated file.

    The translation step is skipped: the translated file derived from
    `fname` is expected to exist. Five expansion terms are requested per
    query and progress is printed after every query.

    in : 'file.trec'
    out: None
    '''
    N_TERMS = 5
    translated_file = outputfile_translate(fname)
    preprocess_queries(translated_file)
    filtered_file = outputfile_filtered(translated_file)
    print('--- finish preprocess')
    queries = read_queries_in_token(filtered_file)
    pages = get_page_collection(queries)
    results = []
    for position, query in enumerate(queries, start=1):
        weighted = count_weight_tfidf(generate_term(query), pages)
        results.append(select_expand_term(weighted, query, N_TERMS))
        print('--- finish {}'.format(position))
    write_expanded_query_trec_format(results, outputfile_expand(filtered_file, N_TERMS))

In [38]:
# Preprocess + expand (no translation) on the partial cisi-ta query file;
# the recorded output below shows 51 queries processed.
run_expand_without_translate('collection/cisi-ta/partial/cisi-qry-indo-partial.trec')

--- finish preprocess
--- finish 1
--- finish 2
--- finish 3
--- finish 4
--- finish 5
--- finish 6
--- finish 7
--- finish 8
--- finish 9
--- finish 10
--- finish 11
--- finish 12
--- finish 13
--- finish 14
--- finish 15
--- finish 16
--- finish 17
--- finish 18
--- finish 19
--- finish 20
--- finish 21
--- finish 22
--- finish 23
--- finish 24
--- finish 25
--- finish 26
--- finish 27
--- finish 28
--- finish 29
--- finish 30
--- finish 31
--- finish 32
--- finish 33
--- finish 34
--- finish 35
--- finish 36
--- finish 37
--- finish 38
--- finish 39
--- finish 40
--- finish 41
--- finish 42
--- finish 43
--- finish 44
--- finish 45
--- finish 46
--- finish 47
--- finish 48
--- finish 49
--- finish 50
--- finish 51


In [47]:
def run_expand_only(fname):
    '''
    Expand the queries of an already translated and preprocessed file.

    Neither translation nor preprocessing is run here; the filtered file
    derived from `fname` is expected to exist. Seven expansion terms are
    requested per query and progress is printed after every tenth query.

    in : 'file.trec'
    out: None
    '''
    N_TERMS = 7
    filtered_file = outputfile_filtered(outputfile_translate(fname))
    print('--- finish preprocess')
    queries = read_queries_in_token(filtered_file)
    pages = get_page_collection(queries)
    results = []
    for position, query in enumerate(queries, start=1):
        weighted = count_weight_tfidf(generate_term(query), pages)
        results.append(select_expand_term(weighted, query, N_TERMS))
        if position % 10 == 0:
            print('--- finish {}'.format(position))
    write_expanded_query_trec_format(results, outputfile_expand(filtered_file, N_TERMS))

In [48]:
# Expansion only on the cran-ta query file, reusing the translated and
# filtered files already on disk; the recorded output below reaches query 220.
run_expand_only('collection/cran-ta/cran-qry-indo.trec')

--- finish preprocess
--- finish 10
--- finish 20
--- finish 30
--- finish 40
--- finish 50
--- finish 60
--- finish 70
--- finish 80
--- finish 90
--- finish 100
--- finish 110
--- finish 120
--- finish 130
--- finish 140
--- finish 150
--- finish 160
--- finish 170
--- finish 180
--- finish 190
--- finish 200
--- finish 210
--- finish 220


In [25]:
# Scratch check: str.replace is case-sensitive - only the lowercase 'a' is
# replaced, the two 'A's are left alone (see the output below).
'AaA'.replace('a', 'b')

'AbA'