### Homework 1

In [2]:
# Import Elasticsearch
from elasticsearch import Elasticsearch

Open a connection to Elasticsearch.

In [1270]:
index_name = 'ap_dataset2'

In [833]:
es = Elasticsearch([{'host':'localhost','port':9200}])

In [834]:
es

<Elasticsearch([{'host': 'localhost', 'port': 9200}])>

Define index

In [1262]:
# Index definition: a single-shard index whose "text" field is analyzed with a
# custom analyzer (standard tokenizer -> lowercase -> stop words -> stemmer).
request_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {
            "filter": {
                # Stop words are read from a file resolved by Elasticsearch
                # relative to its config directory.
                "english_stop": {
                    "type": "stop",
                    "stopwords_path": "my_stoplist.txt"
                },
                # The stemmer parameter is the lowercase key "language"; the
                # previous capitalised "Language" was not the documented key.
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "english"
                }
            },
            "analyzer": {
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop",
                        "english_stemmer"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text",
                # fielddata is required by the cardinality aggregation on
                # "text" used later for the vocabulary size.
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions"
            }
        }
    }
}

Create index.

In [1263]:
response = es.indices.create(index = index_name, body = request_body)

In [1264]:
response

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ap_dataset2'}

Open the collection files, parse the documents, and add them to the index.

In [1265]:
def addDocuments(file):
    """Parse a TREC-style file and index the text of every document in it.

    The file is read line by line. The first whitespace-separated token of a
    line decides how it is handled:

    * ``<DOCNO>``  - the second token becomes the document id;
    * ``<TEXT>``   - subsequent lines are accumulated as document text;
    * ``</TEXT>``  - accumulation stops (a document may have several
      ``<TEXT>`` sections; they are concatenated);
    * ``</DOC>``   - the accumulated text is indexed under the document id.

    Args:
        file: an iterable of text lines (e.g. an open file object).

    Side effects:
        Calls ``es.index`` on the module-level client for each document,
        writing to the module-level ``index_name``.
    """
    in_text = False
    doc_text = ''
    doc_id = ''
    for line in file:
        str_arr = line.split()
        if not str_arr:
            continue
        tag = str_arr[0]
        # Guard against a bare "<DOCNO>" line with no id on it.
        if tag == '<DOCNO>' and len(str_arr) > 1:
            doc_id = str_arr[1]
        # The closing tag is checked before appending and the opening tag
        # after, so neither tag line ends up in the document text.
        if tag == '</TEXT>':
            in_text = False
        if in_text:
            doc_text = doc_text + line
        if tag == '<TEXT>':
            in_text = True
        if tag == '</DOC>':
            if doc_id != '':
                es.index(index=index_name, id=doc_id, body={"text": doc_text})
            # Always reset the per-document state so text from a document
            # without a DOCNO cannot leak into the next one.
            doc_id = ''
            doc_text = ''
            in_text = False

In [1266]:
import os
# Location of the AP89 collection, relative to the notebook's working directory.
path_prefix = 'AP_DATA/ap89_collection/'
# List the same directory that parseFolders() later opens files from (instead
# of a machine-specific absolute path) and skip the macOS metadata file
# without failing when it is not present.
list_folder_names = sorted(
    name for name in os.listdir(path_prefix) if name != '.DS_Store'
)

Total number of files

In [1267]:
len(list_folder_names)

365

In [1268]:
def parseFolders(list_folder_names):
    """Index every file of the collection.

    Args:
        list_folder_names: names of the files inside ``path_prefix`` to parse.

    Each file is opened as ISO-8859-1 text and handed to ``addDocuments``.
    The ``with`` block guarantees the file is closed even if indexing raises.
    """
    for folder in list_folder_names:
        with open(path_prefix + folder, 'r', encoding="ISO-8859-1") as file:
            addDocuments(file)

In [1269]:
parseFolders(list_folder_names)

Check number of documents in index

In [1272]:
es.indices.refresh(index=index_name)
es.cat.count(index=index_name)

'1579742202 01:16:42 84678\n'

##### Tokenize Query

In [1271]:
# Run a sample query through the analyzer configured for the "text" field to
# see the tokens that are produced for it.
query = "Dog brown cows movies"
request = {
    'field':'text',
    'text': query
}
es.indices.analyze(index=index_name, body=request)

{'tokens': [{'token': 'dog',
   'start_offset': 0,
   'end_offset': 3,
   'type': '<ALPHANUM>',
   'position': 0},
  {'token': 'brown',
   'start_offset': 4,
   'end_offset': 9,
   'type': '<ALPHANUM>',
   'position': 1},
  {'token': 'cow',
   'start_offset': 10,
   'end_offset': 14,
   'type': '<ALPHANUM>',
   'position': 2},
  {'token': 'movi',
   'start_offset': 15,
   'end_offset': 21,
   'type': '<ALPHANUM>',
   'position': 3}]}

##### Term frequency in document

In [1302]:
# Sample term and document; `term` and `doc` are reused by the
# document-frequency cell below.
term = 'rainier'
doc = 'AP890513-0001'
# Variant that looks up only the entry for `term` instead of the whole vector:
#es.termvectors(index=index_name, id=doc, fields='text')['term_vectors']['text']['terms'].get(term,{})

# Display the full term vector of the "text" field: one entry per term with
# its term_freq and token positions/offsets.
es.termvectors(index=index_name,id=doc,fields='text')['term_vectors']['text']['terms']

{'14,112': {'term_freq': 1,
  'tokens': [{'position': 195, 'start_offset': 1129, 'end_offset': 1135}]},
 '14,158': {'term_freq': 1,
  'tokens': [{'position': 204, 'start_offset': 1181, 'end_offset': 1187}]},
 '14,410': {'term_freq': 2,
  'tokens': [{'position': 61, 'start_offset': 377, 'end_offset': 383},
   {'position': 179, 'start_offset': 1035, 'end_offset': 1041}]},
 '21': {'term_freq': 1,
  'tokens': [{'position': 141, 'start_offset': 808, 'end_offset': 810}]},
 '26': {'term_freq': 1,
  'tokens': [{'position': 81, 'start_offset': 473, 'end_offset': 475}]},
 '33': {'term_freq': 1,
  'tokens': [{'position': 83, 'start_offset': 480, 'end_offset': 482}]},
 '400': {'term_freq': 1,
  'tokens': [{'position': 168, 'start_offset': 965, 'end_offset': 968}]},
 'afternoon': {'term_freq': 1,
  'tokens': [{'position': 137, 'start_offset': 780, 'end_offset': 789}]},
 'ag': {'term_freq': 1,
  'tokens': [{'position': 79, 'start_offset': 460, 'end_offset': 464}]},
 'area': {'term_freq': 2,
  'token

##### Document Frequency

In [1275]:
# With term_statistics=True each term entry also carries doc_freq and ttf
# (see the output below); .get(term, {}) yields {} when `term` is not in `doc`.
es.termvectors(index=index_name, id=doc, fields='text',term_statistics=True)['term_vectors']['text']['terms'] \
.get(term,{})

{'doc_freq': 31,
 'ttf': 48,
 'term_freq': 2,
 'tokens': [{'position': 9, 'start_offset': 60, 'end_offset': 67},
  {'position': 26, 'start_offset': 158, 'end_offset': 165}]}

##### Vocab Size

In [1276]:
# Count the distinct terms of the "text" field with a cardinality aggregation;
# "size": 0 suppresses the hits so only the aggregation is returned.
# NOTE(review): Elasticsearch documents cardinality as an approximate count -
# confirm that is acceptable for the vocabulary size.
request = {
    "aggs":{
        "vocabSize": {
            "cardinality": {
                "field": "text"},
        }
    }
    , "size":0}
es.search(index=index_name, body=request)['aggregations']['vocabSize']['value']

192916

##### Document Length

In [1277]:
# Document length = sum of term_freq over every term in the document's
# term vector for the "text" field.
doc_id = 'AP890513-0003'
sum(map(lambda t: t['term_freq'], 
        es.termvectors(index=index_name,id=doc_id, fields='text',term_statistics=True)['term_vectors']['text']['terms'].values()))

338

##### Average Document Length

In [1278]:
# Field statistics describe the whole "text" field, so one request against a
# known document is enough; the previous version issued the same request
# twice. Average length = total tokens (sum_ttf) / documents (doc_count).
field_stats = es.termvectors(index=index_name, id='AP891220-0113', fields='text',
                             field_statistics=True)['term_vectors']['text']['field_statistics']
field_stats['sum_ttf'] / field_stats['doc_count']

248.65458026544385

### ES built in

Create file to write results

In [1279]:
output_file = "ES_builtin_output_file.txt"
# Create the results file (or truncate an existing one) and close it at once.
with open(output_file, "w+") as f:
    pass

Parse results and write results to file

In [1280]:
def write_to_file(query_num, res, file_output):
    """Write at most the first 1000 hits of a query in TREC run format.

    Args:
        query_num: query identifier written in the first column.
        res: iterable of hits, each a mapping with '_id' and '_score'.
        file_output: writable text stream.

    Each line reads ``<query> Q0 <doc id> <rank> <score> Exp``; ranks start at
    1 and follow the order of ``res``.
    """
    for rank, hit in enumerate(res, start=1):
        if rank > 1000:
            break
        columns = [str(query_num), "Q0", str(hit['_id']), str(rank), str(hit['_score']), "Exp"]
        file_output.write(" ".join(columns) + "\n")

Open query file and run search on each query.

In [1462]:
# Run every query of the query file through Elasticsearch's built-in scoring
# and write the ranked results. The context managers close both files even
# when a search raises.
with open("AP_DATA/query_desc.51-100.short.txt", 'r') as file, \
        open(output_file, "w") as f_output:
    for line in file:
        # Blank lines carry no query; skip them instead of searching for "".
        if not line.strip():
            continue
        # The query number is everything before the first '.'; the query text
        # starts at column 6 of the line.
        query_number = line.split('.')[0]
        query_text = line[6:]
        request = {
            "sort": [{
                "_score": {
                    "order": "desc"
                }}
            ],
            "query": {
                "match": {
                    "text": {
                        "query": query_text,
                    }
                }
            },
            # write_to_file keeps at most 1000 hits, so there is no point in
            # fetching more than that.
            'size': 1000
        }
        res = es.search(index=index_name, body=request)['hits']['hits']
        write_to_file(query_number, res, f_output)

### Okapi TF

Define method to get average document length of index.

In [1282]:
def get_avg_doc_length():
    """Return the average number of tokens per document in the 'text' field.

    The value is ``sum_ttf / doc_count`` taken from the field statistics that
    Elasticsearch returns with a term-vector request. The statistics describe
    the field as a whole, so a single request against one known document id
    is sufficient (the request was previously issued twice).
    """
    response = es.termvectors(index=index_name, id='AP891220-0113', fields='text',
                              field_statistics=True)
    stats = response['term_vectors']['text']['field_statistics']
    return stats['sum_ttf'] / stats['doc_count']

Define method to get document length. 

In [1283]:
def get_doc_length(doc_id):
    """Return the length of a document: the sum of the term frequencies in
    the term vector of its 'text' field.

    Args:
        doc_id: id of the document in ``index_name``.
    """
    response = es.termvectors(index=index_name, id=doc_id, fields='text', term_statistics=True)
    term_entries = response['term_vectors']['text']['terms']
    return sum(entry['term_freq'] for entry in term_entries.values())

Build a nested dictionary mapping each document id to its terms and their term frequencies, plus a dictionary of document lengths.

In [1284]:
def get_term_vectors(batch):
    """Fetch the term vectors of a batch of documents in one request.

    Args:
        batch: iterable of document ids.

    Returns:
        A ``(doc_dict, doc_length_dict)`` tuple where

        * ``doc_dict[doc_id][term]`` is the frequency of ``term`` in the
          document (documents without a term vector are omitted);
        * ``doc_length_dict[doc_id]`` is the document length, i.e. the sum of
          its term frequencies (0 for documents without a term vector).
    """
    doc_dict = {}
    doc_length_dict = {}
    request = {
        "ids": list(batch),
        "parameters": {
            "fields": [
                "text"
            ]
        }
    }
    res = es.mtermvectors(index=index_name, body=request)['docs']
    for doc in res:
        doc_id = doc['_id']
        tv = doc.get('term_vectors', {})
        terms = tv.get('text', {}).get('terms', {})
        term_freqs = {term: info['term_freq'] for term, info in terms.items()}
        # The length is derived from the vector already in hand. Previously a
        # separate termvectors request was issued per document, which was slow
        # and raised KeyError for documents with an empty term vector.
        doc_length_dict[doc_id] = sum(term_freqs.values())
        if len(tv) != 0:
            doc_dict[doc_id] = term_freqs
    return doc_dict, doc_length_dict

Build the nested dictionary for a whole list of documents, fetching the term vectors in batches.

In [1285]:
from toolz import partition_all

def partition_docs(docs):
    """Collect term frequencies and lengths for ``docs`` in batches of 1000.

    Args:
        docs: iterable of document ids.

    Returns:
        ``(doc_dict, doc_length)`` - the per-batch results of
        ``get_term_vectors`` merged into two dictionaries.
    """
    batch_size = 1000
    merged_term_freqs = {}
    merged_lengths = {}
    for chunk in partition_all(batch_size, docs):
        chunk_term_freqs, chunk_lengths = get_term_vectors(chunk)
        merged_term_freqs.update(chunk_term_freqs)
        merged_lengths.update(chunk_lengths)
    return merged_term_freqs, merged_lengths

Use elasticsearch to query for documents that contain at least one query word.

In [1360]:
def get_doc_with_query_words(query):
    """Return the ids of all documents matching at least one word of ``query``.

    Args:
        query: query text; it is passed to a ``match`` query with the ``or``
            operator on the 'text' field.

    Returns:
        List of document ids, gathered page by page with the scroll API
        (1000 hits per page).
    """
    page = es.search(
        index=index_name,
        scroll='1m',
        size=1000,
        body={
            "_source": "_id",
            "query": {
                "match": {
                    "text": {
                        "query": query,
                        "operator": "or"
                    }
                }
            }
        })
    sid = page['_scroll_id']
    doc_ids = []
    try:
        hits = page['hits']['hits']
        # An empty page marks the end of the result set.
        while hits:
            doc_ids.extend(hit['_id'] for hit in hits)
            page = es.scroll(scroll_id=sid, scroll='1m')
            # The scroll id may change between pages; always use the latest.
            sid = page['_scroll_id']
            hits = page['hits']['hits']
    finally:
        # Release the server-side scroll context instead of waiting for the
        # one-minute timeout.
        es.clear_scroll(scroll_id=sid)
    return doc_ids

Get tokens from query

In [1289]:
def get_tokens(query):
    """Return the tokens the 'text' field analyzer produces for ``query``.

    Args:
        query: raw query text.

    Returns:
        List of token strings in the order reported by the analyze API
        (repeated tokens are kept).
    """
    analyze_request = {
        'field': 'text',
        'text': query
    }
    analyzed = es.indices.analyze(index=index_name, body=analyze_request)
    return [entry['token'] for entry in analyzed['tokens']]

Sort dictionary and write to file

In [1292]:
import collections
def write_to_file_okapi(query_num, score_dict, file_output):
    """Write the 1000 best-scoring documents of a query in TREC run format.

    Args:
        query_num: query identifier written in the first column.
        score_dict: mapping of document id to score.
        file_output: writable text stream.

    Documents are ranked by descending score (ties keep the dictionary's
    order) and written as ``<query> Q0 <doc id> <rank> <score> Exp``.
    """
    # A plain lambda is used as the sort key: the `operator` module that the
    # previous itemgetter call relied on is never imported in this notebook.
    ranked = sorted(score_dict.items(), key=lambda item: item[1], reverse=True)
    for rank, (_id, _score) in enumerate(ranked[:1000], start=1):
        file_output.write(str(query_num) + " Q0 " + str(_id) + " " + str(rank) + " " + str(_score) + " Exp\n")

Go through queries and write output to file

In [1471]:
def get_okapi_tf():
    """Score every query with Okapi TF and write the rankings to a file.

    For each query line (number before the first '.', text from column 6) the
    score of a document is the sum over the query tokens of

        tf / (tf + 0.5 + 1.5 * (doc_length / avg_doc_length))

    where ``tf`` is the token's frequency in the document. Results go to
    ``Okapi_TF_output_file.txt`` via ``write_to_file_okapi``.
    """
    okapi_output_file = "Okapi_TF_output_file.txt"
    # The collection-wide average does not depend on the query.
    avg_doc_length = get_avg_doc_length()

    with open("AP_DATA/EC2_query_desc.51-100.short.txt", 'r') as file, \
            open(okapi_output_file, "w+") as f_output:
        for line in file:
            query_number = line.split('.')[0]
            query_text = line[6:]

            tokens = get_tokens(query_text)
            score_dict = {}

            for term in tokens:
                doc_ids = get_doc_with_query_words(term)
                master_doc_dict, doc_length_dict = partition_docs(doc_ids)

                for doc in doc_ids:
                    # Documents without a term vector, or whose vector lacks
                    # this exact token, contribute nothing.
                    tf = master_doc_dict.get(doc, {}).get(term)
                    if tf is None:
                        continue
                    okapi = tf/(((doc_length_dict[doc]/avg_doc_length)*1.5)+.5+tf)
                    score_dict[doc] = score_dict.get(doc, 0) + okapi

            write_to_file_okapi(query_number, score_dict, f_output)

Run Okapi TF 

In [1373]:
get_okapi_tf()

allegations corrupt government   

['alleg', 'corrupt', 'govern']
weather caused death

['weather', 'caus', 'death']
prediction prime lending rate move 

['predict', 'prime', 'lend', 'rate', 'move']
attack guerrilla border military

['attack', 'guerrilla', 'border', 'militari']
politically motivated hostage-taking hostage

['polit', 'motiv', 'hostag', 'take', 'hostag']
military coup d'etat  

['militari', 'coup', "d'etat"]
supporters National Rifle Association NRA

['support', 'nation', 'rifl', 'associ', 'nra']
Iran-Contra Affair  

['iran', 'contra', 'affair']
rail strike 

['rail', 'strike']
poaching wildlife poach

['poach', 'wildlif', 'poach']
signing contract preliminary agreement launch commercial satellite  

['sign', 'contract', 'preliminari', 'agreement', 'launch', 'commerci', 'satellit']
current criminal actions against officers failed U.S. financial institution

['current', 'crimin', 'action', 'offic', 'fail', 'u.', 'financi', 'institut']
crime computer

['crime', 'comput']


### TF-IDF

Get total number of documents

In [1374]:
def get_total_num_documents():
    """Return the number of documents in ``index_name`` as reported by the
    count API (``None`` if the response has no 'count' key)."""
    count_response = es.count(index=index_name)
    return count_response.get('count')

Get the number of documents that contain each query term and turn it into the weight log10(N / df), where N is the total number of documents.

In [911]:
import math

In [1375]:
def get_df_w(term_list):
    """Return ``log10(N / df)`` for every term that occurs in the index.

    Args:
        term_list: iterable of (already analyzed) terms.

    Returns:
        Dict mapping each term with a non-zero document frequency to
        ``log10(total documents / document frequency)``; terms matching no
        document are left out.
    """
    total_docs = get_total_num_documents()
    weight_by_term = {}
    for word in term_list:
        count_request = {"query": {"term": {"text": word}}}
        matching_docs = es.count(index=index_name, body=count_request)['count']
        if matching_docs == 0:
            continue
        weight_by_term[word] = math.log10(total_docs / matching_docs)
    return weight_by_term

Get TF-IDF for document

In [1377]:
def get_tf_idf_for_doc(keyword_list, term_dict, avg_doc_length, token_dict, doc_length):
    """Return the TF-IDF score of a single document for a list of keywords.

    Args:
        keyword_list: query tokens (repeats are scored repeatedly).
        term_dict: term -> frequency of the term in the document.
        avg_doc_length: average document length used for normalisation.
        token_dict: term -> weight multiplied into the term's Okapi TF.
        doc_length: length of the document.

    Returns:
        Sum over the keywords present in ``term_dict`` of
        ``tf / (tf + 0.5 + 1.5 * doc_length / avg_doc_length) * token_dict[term]``;
        0 when no keyword occurs in the document.
    """
    length_norm = (1.5 * (doc_length/avg_doc_length)) + .5
    total = 0
    for keyword in keyword_list:
        if keyword not in term_dict:
            continue
        freq = term_dict[keyword]
        total += (freq / (freq + length_norm)) * token_dict[keyword]
    return total

Go through queries and write to output file

In [1466]:
def get_tf_idf():
    """Score every query with TF-IDF and write the rankings to a file.

    For each query line (number before the first '.', text from column 6) the
    score of a document is the sum over the query tokens of

        okapi_tf * log10(N / df)

    with ``okapi_tf = tf / (tf + 0.5 + 1.5 * (doc_length / avg_doc_length))``.
    Results go to ``TF_IDF_output_file.txt`` via ``write_to_file_okapi``.
    """
    tf_output_file = "TF_IDF_output_file.txt"
    # Use the collection-wide average length, as the other models do. The
    # previous version averaged over the documents matching each single term,
    # which skewed the normalisation and divided by zero whenever a term
    # matched no document.
    avg_doc_length = get_avg_doc_length()

    with open("AP_DATA/EC2_query_desc.51-100.short.txt", 'r') as file, \
            open(tf_output_file, "w+") as f_output:
        for line in file:
            query_number = line.split('.')[0]
            query_text = line[6:]

            tokens = get_tokens(query_text)
            df_multiplier_dict = get_df_w(tokens)

            score_dict = {}

            for term in tokens:
                idf = df_multiplier_dict.get(term)
                # get_df_w omits terms that occur in no document; such a term
                # cannot contribute to any score.
                if idf is None:
                    continue

                doc_ids = get_doc_with_query_words(term)
                master_doc_dict, doc_length_dict = partition_docs(doc_ids)

                for doc in doc_ids:
                    tf = master_doc_dict.get(doc, {}).get(term)
                    if tf is None:
                        continue
                    okapi = tf/(((doc_length_dict[doc]/avg_doc_length)*1.5)+.5+tf)
                    score_dict[doc] = score_dict.get(doc, 0) + okapi * idf

            write_to_file_okapi(query_number, score_dict, f_output)

In [1385]:
get_tf_idf()

acquisit
u.
armi
specifi
advanc
weapon
system


## Okapi BM25

Compute the first factor of the Okapi BM25 formula for each query term: log10((N + 0.5) / (df + 0.5)).

In [1387]:
def get_t1_bm25(term_list):
    """Return the first BM25 factor for every term that occurs in the index.

    Args:
        term_list: iterable of (already analyzed) terms.

    Returns:
        Dict mapping each term with a non-zero document frequency ``df`` to
        ``log10((N + 0.5) / (df + 0.5))``, ``N`` being the total number of
        documents; terms matching no document are left out.
    """
    total_docs = get_total_num_documents()
    factor_by_term = {}
    for word in term_list:
        count_request = {"query": {"term": {"text": word}}}
        matching_docs = es.count(index=index_name, body=count_request)['count']
        if matching_docs == 0:
            continue
        factor_by_term[word] = math.log10((total_docs+.5)/(matching_docs+.5))
    return factor_by_term

Go through queries and write to output file.

In [1467]:
def get_bm25():
    """Score every query with Okapi BM25 and write the rankings to a file.

    For each query line (number before the first '.', text from column 6) the
    score of a document is the sum over the distinct query terms of

        log10((N + 0.5) / (df + 0.5))
        * (tf + k1*tf) / (tf + k1*((1 - b) + b * doc_length / avg_doc_length))
        * (qtf + k2*qtf) / (qtf + k2)

    where ``tf`` is the term frequency in the document and ``qtf`` the term
    frequency in the query. Results go to ``BM25_output_file.txt``.
    """
    bm25_output_file = "BM25_output_file.txt"

    k1 = 1.2
    k2 = 1.2
    b = .75

    # The collection-wide average does not depend on the query.
    avg_doc_length = get_avg_doc_length()

    with open("AP_DATA/EC2_query_desc.51-100.short.txt", 'r') as file, \
            open(bm25_output_file, "w+") as f_output:
        for line in file:
            query_number = line.split('.')[0]
            query_text = line[6:]

            tokens = get_tokens(query_text)
            df_multiplier_dict = get_t1_bm25(tokens)

            score_dict = {}

            # Visit each distinct term once (order preserved); repetition in
            # the query is accounted for by the qtf factor, not by summing the
            # same term again.
            for term in dict.fromkeys(tokens):
                first_term = df_multiplier_dict.get(term)
                if first_term is None:
                    continue

                # Third factor: query term frequency with k2. The previous
                # code used the document tf (and k1) here.
                qtf = tokens.count(term)
                third_term = (qtf + k2*qtf) / (qtf + k2)

                doc_ids = get_doc_with_query_words(term)
                master_doc_dict, doc_length_dict = partition_docs(doc_ids)

                for doc in doc_ids:
                    tf = master_doc_dict.get(doc, {}).get(term)
                    if tf is None:
                        continue
                    second_term = (tf+k1*tf)/(tf+k1*((1-b)+b*doc_length_dict[doc]/avg_doc_length))
                    score_dict[doc] = score_dict.get(doc, 0) + first_term*second_term*third_term

            write_to_file_okapi(query_number, score_dict, f_output)

In [1392]:
get_bm25()

## Unigram LM with Laplace Smoothing

In [1393]:
def get_vocab_size():
    """Return the number of distinct terms in the 'text' field, obtained from
    a cardinality aggregation (no hits are requested)."""
    cardinality_agg = {"cardinality": {"field": "text"}}
    request = {"aggs": {"vocabSize": cardinality_agg}, "size": 0}
    response = es.search(index=index_name, body=request)
    return response['aggregations']['vocabSize']['value']

In [1468]:
def get_unigram_laplace():
    """Score every query with a Laplace-smoothed unigram language model.

    For each query line (number before the first '.', text from column 6)
    the documents matching at least one query word are scored with

        sum over query tokens of log10((tf + 1) / (doc_length + V))

    where ``tf`` is the token's frequency in the document (0 if absent) and
    ``V`` the vocabulary size. Results go to ``laplace_output_file.txt``.
    """
    laplace_output_file = "laplace_output_file.txt"
    v = get_vocab_size()

    with open("AP_DATA/EC2_query_desc.51-100.short.txt", 'r') as file, \
            open(laplace_output_file, "w+") as f_output:
        for line in file:
            query_number = line.split('.')[0]
            query_text = line[6:]

            tokens = get_tokens(query_text)

            doc_ids = get_doc_with_query_words(query_text)
            master_doc_dict, doc_length_dict = partition_docs(doc_ids)

            score_dict = {}

            for term in tokens:
                for doc in doc_ids:
                    # A missing term simply has tf = 0, so one formula covers
                    # both the "present" and the "absent" case.
                    tf = master_doc_dict.get(doc, {}).get(term, 0)
                    frac = math.log10(float(tf+1)/float(doc_length_dict[doc] + v))
                    score_dict[doc] = score_dict.get(doc, 0) + frac

            write_to_file_okapi(query_number, score_dict, f_output)

In [1395]:
get_unigram_laplace()

## Unigram LM with Jelinek-Mercer Smoothing

In [1469]:
def get_unigram_jelinek():
    """Score every query with a Jelinek-Mercer-smoothed unigram language model.

    For each query line (number before the first '.', text from column 6)
    the documents matching at least one query word are scored with

        sum over query tokens of
            log10(l * tf / doc_length + (1 - l) * cf / collection_length)

    with ``l = 0.8``. ``cf`` and ``collection_length`` are computed over the
    retrieved documents only (the background model is estimated from that
    set, not from the whole index). Results go to ``jelinek_output_file.txt``.
    """
    jelinek_output_file = "jelinek_output_file.txt"

    l = 0.8

    with open("AP_DATA/EC2_query_desc.51-100.short.txt", 'r') as file, \
            open(jelinek_output_file, "w+") as f_output:
        for line in file:
            query_number = line.split('.')[0]
            query_text = line[6:]

            tokens = get_tokens(query_text)

            doc_ids = get_doc_with_query_words(query_text)
            master_doc_dict, doc_length_dict = partition_docs(doc_ids)

            # Background statistics over the retrieved documents.
            total_tf = {}
            for doc_terms in master_doc_dict.values():
                for t, freq in doc_terms.items():
                    total_tf[t] = total_tf.get(t, 0) + freq
            collection_length = sum(doc_length_dict.values())

            score_dict = {}

            for term in tokens:
                cf = total_tf.get(term, 0)
                # A token absent from every retrieved document has zero
                # probability under both models; it would yield log10(0) for
                # all documents alike, so it is skipped.
                if cf == 0:
                    continue
                background = (1-l)*(cf/collection_length)

                for doc in doc_ids:
                    tf = master_doc_dict.get(doc, {}).get(term, 0)
                    doc_length = doc_length_dict[doc]
                    foreground = l*(tf/doc_length) if doc_length else 0.0
                    # The two models are interpolated (added). The previous
                    # code divided the foreground by the background.
                    frac = math.log10(foreground + background)
                    score_dict[doc] = score_dict.get(doc, 0) + frac

            write_to_file_okapi(query_number, score_dict, f_output)

In [1397]:
get_unigram_jelinek()

## EC2: Pseudo-relevance Feedback

In [1485]:
def get_relevant_docs(query):
    """Return the ids of all documents containing at least one of the terms.

    Args:
        query: list of (already analyzed) terms; it is passed to a ``terms``
            query on the 'text' field.

    Returns:
        List of document ids, gathered page by page with the scroll API
        (1000 hits per page).
    """
    page = es.search(
        index=index_name,
        scroll='1m',
        size=1000,
        body={
            "_source": "_id",
            "query": {
                "terms": {
                    "text": query
                }
            }
        })
    sid = page['_scroll_id']
    doc_ids = []
    try:
        hits = page['hits']['hits']
        # An empty page marks the end of the result set.
        while hits:
            doc_ids.extend(hit['_id'] for hit in hits)
            page = es.scroll(scroll_id=sid, scroll='1m')
            # The scroll id may change between pages; always use the latest.
            sid = page['_scroll_id']
            hits = page['hits']['hits']
    finally:
        # Release the server-side scroll context instead of waiting for the
        # one-minute timeout.
        es.clear_scroll(scroll_id=sid)
    return doc_ids

In [1490]:
def get_ok_tf(tokens, master_doc_dict, doc_ids, doc_length_dict, avg_doc_length):
    """Compute Okapi TF scores for a set of documents.

    Args:
        tokens: query tokens (repeats are scored repeatedly).
        master_doc_dict: doc id -> {term: term frequency}; must contain every
            id in ``doc_ids``.
        doc_ids: ids of the documents to score.
        doc_length_dict: doc id -> document length.
        avg_doc_length: average document length used for normalisation.

    Returns:
        Dict mapping each document that contains at least one token to the
        sum of ``tf / (tf + 0.5 + 1.5 * doc_length / avg_doc_length)`` over
        the tokens it contains.
    """
    scores = {}
    for query_term in tokens:
        for doc_id in doc_ids:
            term_freqs = master_doc_dict[doc_id]
            if query_term not in term_freqs:
                continue
            tf = term_freqs[query_term]
            contribution = tf/(((doc_length_dict[doc_id]/avg_doc_length)*1.5)+.5+tf)
            if doc_id in scores:
                scores[doc_id] = scores[doc_id] + contribution
            else:
                scores[doc_id] = contribution
    return scores

In [1588]:
def get_common_terms(list_of_doc_ids):
    """Return the terms shared by (almost) all of the given documents.

    Args:
        list_of_doc_ids: ids of the documents to compare.

    Returns:
        List of terms that occur in more than ``len(list_of_doc_ids) - 2``
        of the documents, i.e. in all of them or all but one.
    """
    term_vectors, _lengths = get_term_vectors(list_of_doc_ids)
    docs_containing = {}
    for doc_terms in term_vectors.values():
        for t in doc_terms:
            docs_containing[t] = docs_containing.get(t, 0) + 1
    threshold = len(list_of_doc_ids) - 2
    return [t for t, doc_count in docs_containing.items() if doc_count > threshold]
        

In [None]:
# Retrieve the candidate documents for a sample query and load their term
# vectors and lengths for the pseudo-relevance feedback run below.
query_text = 'fiber optics technology in use'
tokens = get_tokens(query_text)
doc_ids = get_relevant_docs(tokens)
# Use the ids just retrieved. `ids` (every document in the index) is only
# defined in a cell near the end of the notebook, so referring to it here
# fails on a fresh kernel.
master_doc_dict, doc_length_dict = partition_docs(doc_ids)

In [1627]:
doc_ids = get_relevant_docs(tokens)

In [1632]:
# Sample concatenation of several query texts. The stray reference to the
# undefined name `st` was removed; it raised NameError on a fresh kernel.
st2 = 'allegations corrupt government    weather caused death prediction prime lending rate move'

In [1595]:
def get_okapi_tf_pseudo_feedback(n, num_terms, master_doc_dict, doc_ids, doc_length_dict):
    """Run Okapi TF with pseudo-relevance feedback and write the rankings.

    For every query line (number before the first '.', text from column 6):

    1. score ``doc_ids`` with Okapi TF;
    2. take the ``n`` best documents and collect the terms common to them;
    3. append up to ``num_terms`` of those terms (skipping terms already in
       the query) to the query text;
    4. score ``doc_ids`` again with the expanded query and write the result
       to ``pseudo_feedback_output_file.txt``.

    Args:
        n: number of top documents used as feedback.
        num_terms: maximum number of terms added to the query.
        master_doc_dict: doc id -> {term: term frequency} for ``doc_ids``.
        doc_ids: candidate documents; the same set is used for every query
            and for both passes.
        doc_length_dict: doc id -> document length.

    The candidate terms and the expanded query are printed for inspection.
    """
    feedback_file = "pseudo_feedback_output_file.txt"
    # The collection-wide average does not depend on the query.
    avg_doc_length = get_avg_doc_length()

    with open("AP_DATA/query_desc.51-100.short2.txt", 'r') as file, \
            open(feedback_file, "w+") as f_output:
        for line in file:
            query_number = line.split('.')[0]
            query_text = line[6:]

            tokens = get_tokens(query_text)

            # First pass: rank the candidates with the original query. A
            # lambda is the sort key because `operator` is never imported in
            # this notebook.
            first_pass = get_ok_tf(tokens, master_doc_dict, doc_ids, doc_length_dict, avg_doc_length)
            sorted_dict = sorted(first_pass.items(), key=lambda item: item[1], reverse=True)

            # Only keep the n best documents and find their common terms.
            docs = [doc_id for doc_id, _score in sorted_dict[:n]]
            terms = get_common_terms(docs)
            res = [i for i in terms if i not in tokens]
            print(res)

            # Expand the query with at most num_terms new terms.
            for extra_term in res[:max(num_terms, 0)]:
                query_text = query_text + ' ' + extra_term

            print(query_text)
            new_tokens = get_tokens(query_text)
            second_pass = get_ok_tf(new_tokens, master_doc_dict, doc_ids, doc_length_dict, avg_doc_length)

            write_to_file_okapi(query_number, second_pass, f_output)

In [1574]:
test = 'allegations corrupt government    alleg corrupt'
tok = get_tokens(test)
print(set(tok))
print(tok)

{'govern', 'alleg', 'corrupt'}
['alleg', 'corrupt', 'govern', 'alleg', 'corrupt']


In [1613]:
get_okapi_tf_pseudo_feedback(9,2, master_doc_dict, doc_ids, doc_length_dict)

['compani']
fiber optics technology in use compani


## EC2: Pseudo-relevance Feedback Using ElasticSearch aggs "significant terms"

In [1414]:
def get_aggregations(term):
    """Return the significant terms of the documents containing ``term``.

    Args:
        term: an (already analyzed) term of the 'text' field.

    Returns:
        The keys of the ``significant_terms`` buckets in the order returned
        by Elasticsearch, without the first bucket; an empty list when there
        are no buckets.
    """
    request = {
        "query": {
            "terms": {
                "text": [term]
            }
        },
        "aggregations": {
            "significantTerms": {
                "significant_terms": {
                    "field": "text"
                }
            }
        },
        "size": 0
    }
    response = es.search(index=index_name, body=request)
    buckets = response['aggregations']['significantTerms']['buckets']
    bucket_keys = [bucket['key'] for bucket in buckets]
    # Slicing an empty list is safe, so no separate empty-case branch is needed.
    return bucket_keys[1:]

In [1420]:
def get_words_to_add_to_query(query):
    """Rank candidate expansion words for a query.

    Args:
        query: raw query text.

    Returns:
        List of ``(word, count)`` tuples sorted by descending count, where
        ``count`` is the number of query tokens whose significant terms
        (see ``get_aggregations``) include ``word``.
    """
    tokens = get_tokens(query)
    term_map = {}
    for token in tokens:
        for word in get_aggregations(token):
            term_map[word] = term_map.get(word, 0) + 1
    # A lambda is the sort key because `operator` is never imported in this
    # notebook, so operator.itemgetter raised NameError.
    return sorted(term_map.items(), key=lambda item: item[1], reverse=True)

In [1430]:
def add_words_to_query(query):
    """Return the expansion words suggested for more than one query token.

    Args:
        query: raw query text.

    Returns:
        Words from ``get_words_to_add_to_query`` whose count exceeds 1, in
        the ranked order.
    """
    ranked_words = get_words_to_add_to_query(query)
    return [entry[0] for entry in ranked_words if entry[1] > 1]

In [1461]:
add_words_to_query("acquisition U.S. Army specified advanced weapons systems")

['militari']

Rerun models with modified queries

In [1473]:
#get_okapi_tf()
#get_tf_idf()
get_bm25()
#get_unigram_laplace()
#get_unigram_jelinek()

## Get list of all document ids, clear cache

Get list of all document ids

In [475]:
import elasticsearch.helpers
# Total number of documents in the index (kept for reference; not used below).
number_of_docs = es.count(index=index_name).get('count')
# match_all selects every document; 'size' is included in the scan query body.
request = {
    "query":{
        "match_all" : {}
    }
    ,'size': 10000
}

# helpers.scan iterates over every hit of the query.
# NOTE(review): preserve_order=True is documented as expensive for scans -
# confirm the ordering is actually needed.
results = elasticsearch.helpers.scan(es,
                                    index=index_name,preserve_order=True,query=request)
# Collect the id of every document in the index.
ids = []
for item in results:
    ids.append(item['_id'])
    

In [1614]:
print(len(ids))
ids[0:10]

84678


['AP891220-0113',
 'AP891220-0114',
 'AP891220-0115',
 'AP891220-0116',
 'AP891220-0117',
 'AP891220-0118',
 'AP891220-0119',
 'AP891220-0120',
 'AP891220-0121',
 'AP891220-0122']

In [835]:
es.indices.clear_cache()

{'_shards': {'total': 5, 'successful': 4, 'failed': 0}}

In [987]:
query = 'Document will discuss allegations, or measures being taken against, corrupt public officials of any governmental jurisdiction worldwide. '

In [990]:
t = get_tokens(query)
a = get_t1_bm25(t)

In [894]:
l = get_doc_with_query_words(query)

In [895]:
len(l)

42483