## CSV to JSON

In [62]:
import csv
import json

In [63]:
# Collection records: one {"doc_id", "text"} dict per CSV row.
all_json_data = []

with open("documents.csv", newline="", encoding="utf-8") as csvfile:
    # DictReader keys each row by the CSV header; keep only the ID and Text columns.
    all_json_data.extend(
        {"doc_id": record["ID"], "text": record["Text"]}
        for record in csv.DictReader(csvfile)
    )

In [64]:
# Sanity check: number of records loaded from the CSV.
print(len(all_json_data))

18316


## Start Elasticsearch

In [65]:
from elasticsearch import Elasticsearch

In [66]:
# Client for the Elasticsearch node at localhost:9200 (plain HTTP, no credentials passed).
client = Elasticsearch("http://localhost:9200")

In [67]:
# We will use the Vector Space Model for our IR-method.
# Index definition: settings (custom TF-IDF similarity + English analyzers)
# and the field mappings for the two fields we index (doc_id, text).
vsm_mapping = {
    "settings": {
        "number_of_shards": 1,
        "similarity": {
            # Custom scripted similarity. Per term the script computes:
            #   tf   = sqrt(doc.freq)
            #   idf  = ln((field.docCount + 1) / (term.docFreq + 1)) + 1
            #   norm = 1 / sqrt(doc.length)
            #   score = query.boost * tf * idf * norm
            "scripted_tfidf": {
                "type": "scripted",
                "script": {
                    "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;"
                }
            }
        },        
        "analysis": {
            # The built-in "english" analyzer is registered under both names below.
            # NOTE(review): "default" / "default_search" are presumably picked up as the
            # index-time and query-time analyzers respectively; confirm in the ES docs.
            "analyzer": {
                "default": {
                    "type": "english"
                },
                "default_search": {
                    "type": "english"
                }
            }
        }
    },
    "mappings": {
        "properties": {
            # Collection ID taken from the CSV; mapped as an integer field.
            "doc_id":{
                "type": "integer"
            },
            # Full-text field, scored with the scripted_tfidf similarity defined above.
            "text": {
                "type": "text",
                "similarity": "scripted_tfidf"
            }
        }
    }
}

In [68]:
# Delete any existing index so the notebook can be re-run from scratch.
# ignore_unavailable=True keeps this cell from failing when `my_docs` does not
# exist yet (e.g. the very first run against a fresh cluster).
client.indices.delete(index="my_docs", ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

In [69]:
# Create the new `my_docs` index using the settings and mappings held in vsm_mapping.
client.indices.create(index='my_docs', body=vsm_mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_docs'})

In [70]:
def insert_document(document):
    """Index one document into the `my_docs` index.

    The Elasticsearch _id is forced to the string form of the collection's
    doc_id (from the CSV), so hit["_id"] and hit["_source"]["doc_id"] always
    refer to the same document and cannot drift apart.

    Args:
        document: dict with at least a "doc_id" key; the whole dict is stored
            as the document body.

    Returns:
        The response returned by client.index for this document.
    """
    docid = str(document["doc_id"])  # use the collection doc id as the Elasticsearch _id
    return client.index(index="my_docs", id=docid, body=document)

# populate index one document at a time
for document in all_json_data:
    insert_document(document)

# Force a refresh so every document indexed above is visible to the count and
# search requests that follow, instead of depending on the periodic refresh.
client.indices.refresh(index="my_docs")

print("Done indexing documents into `my_docs` index!")

Done indexing documents into `my_docs` index!


In [71]:
# Ask Elasticsearch how many documents `my_docs` holds.
# The count must be 18,316 (the number of records loaded from the CSV).
# NOTE(review): documents indexed just before this call may not be counted
# until the index has been refreshed; confirm a refresh happens first.
count = client.count(index="my_docs")
print(count)

{'count': 18316, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}


In [72]:
def print_search_response(response, max_hits=10):
    """Debug helper: print a one-line summary of the top hits of a search response.

    Lets us check that the response has the format we expect.

    Args:
        response: search response; any mapping-like object exposing ``.get``
            with the nested ``hits -> hits`` list.
        max_hits: maximum number of hits to print (default 10).

    Each printed line shows the rank, the collection ``doc_id`` (read from
    ``_source``, not the Elasticsearch ``_id``), the ``_id``, the score and a
    preview of at most 120 characters of the text.
    """
    hits = response.get("hits", {}).get("hits", [])  # the results of the search
    if not hits:
        print("Your search returned no results.")
        return

    for i, hit in enumerate(hits[:max_hits], start=1):
        es_id = hit.get("_id")  # Elasticsearch id (index id)
        score = hit.get("_score")  # similarity score; None when absent
        src = hit.get("_source", {})
        doc_id = src.get("doc_id")  # collection doc id

        text = src.get("text") or ""
        # Append the ellipsis only when the text was actually cut off.
        text_preview = text[:120] + "…" if len(text) > 120 else text

        # A missing/None score cannot be formatted with ':.4f'; show a placeholder instead.
        score_str = f"{score:.4f}" if score is not None else "n/a"
        print(f"{i:>2}. doc_id={doc_id} | _id={es_id} | score={score_str} | {text_preview}")

## Querying + Run file (TREC)

In [73]:
import pandas as pd
from pathlib import Path

# Location of the evaluation queries (relative to the notebook's working directory).
QUERIES_PATH = "queries.csv" 


# One row per query; downstream cells read the ID and Text columns.
queries_df = pd.read_csv(QUERIES_PATH)  # columns: ID, Text


# Sanity check: (number of queries, number of columns).
print("queries:", queries_df.shape)

queries: (10, 2)


In [89]:
def es_search(query_text: str, k: int = 50):
    """Retrieve the top-k documents of `my_docs` for a free-text query.

    Issues a `match` query on the 'text' field, so hits are scored by the
    similarity configured for that field in the index mapping (our
    scripted_tfidf, i.e. the classic Vector Space Model).

    Args:
        query_text: the query string.
        k: maximum number of hits to return (sent as the request "size").

    Returns:
        The raw search response from the Elasticsearch client.
    """
    match_clause = {"text": {"query": query_text}}
    request = {"size": k, "query": {"match": match_clause}}
    return client.search(index="my_docs", body=request)

def make_run_rows(qid: str, query_text: str, k: int, run_name: str = "VSM"):
    """Search for one query and return its hits as TREC run tuples.

    Each tuple is (qid, "Q0", doc_id, rank, score, run_name), with ranks
    starting at 1 in the order the hits were returned. The top hit is also
    printed as a quick visual check.
    """
    response = es_search(query_text, k=k)
    print_search_response(response, 1)

    def to_row(position, hit):
        # Use the collection doc_id stored in _source, not the ES internal id.
        collection_id = str(hit.get("_source", {}).get("doc_id"))  # TREC wants a string id
        relevance = float(hit.get("_score", 0.0))  # TREC wants a float score
        return (qid, "Q0", collection_id, position, relevance, run_name)

    return [
        to_row(position, hit)
        for position, hit in enumerate(response["hits"]["hits"], start=1)
    ]

In [90]:
def write_trec(run_rows, out_path: str):
    """Write run rows to `out_path` in TREC run format and return the path as a string.

    Every row is a (qid, q0, doc_id, rank, score, run_name) tuple and becomes
    one space-separated line, with the score printed to six decimal places.
    Missing parent directories of `out_path` are created first.
    """
    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as handle:
        handle.writelines(
            f"{qid} {q0} {doc_id} {rank} {score:.6f} {run_name}\n"  # TREC format
            for qid, q0, doc_id, rank, score, run_name in run_rows
        )
    return str(target)

# Build one TREC run file per retrieval depth: k = 20, 30, 50.
# run_files maps each k to the path of the file written for it.
run_files = {}
for k in (20, 30, 50):
    run_label = f"VSM_k{k}"
    rows = []
    for _, query_row in queries_df.iterrows():
        rows += make_run_rows(
            str(query_row["ID"]),
            str(query_row["Text"]),
            k=k,
            run_name=run_label,
        )
    run_files[k] = write_trec(rows, f"trec_eval/vsm_k{k}.txt")

run_files

 1. doc_id=193378 | _id=193378 | score=88.7109 | Optimodal European Travel Ecosystem: EuTravel aims to: 1. Support the EU agenda towards an open and single market for mo…
 1. doc_id=213164 | _id=213164 | score=61.5033 | Big Data for Mobility Tracking Knowledge Extraction in Urban Areas: Track&amp;Know will research, develop and exploit a …
 1. doc_id=204146 | _id=204146 | score=95.3150 | Towards a Shared European Logistics Intelligent Information Space: SELIS is aimed at delivering a ‘platform for pan-Euro…
 1. doc_id=214253 | _id=214253 | score=77.8100 | Polyglot and Hybrid Persistence Architectures for Big Data Analytics: The need for levels of availability and scalabilit…
 1. doc_id=212490 | _id=212490 | score=89.1696 | Cognitive Heterogeneous Architecture for Industrial IoT: CHARIOT will provide a design method and cognitive computing pl…
 1. doc_id=210133 | _id=210133 | score=78.1138 | End-to-End Approach for Mobility-as-a-Service tools, business models, enabling framework and evi

{20: 'trec_eval\\vsm_k20.txt',
 30: 'trec_eval\\vsm_k30.txt',
 50: 'trec_eval\\vsm_k50.txt'}