## **DSAI 201 Final Project**

In [None]:
#install the Pyterrier framework
!pip install python-terrier
# install the nltk modules
!pip install nltk

Collecting python-terrier
  Downloading python-terrier-0.10.1.tar.gz (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m571.6 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matchpy (from python-terrier)
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.6/69.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecated (from python-terrier)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting chest (

In [None]:
import pyterrier as pt

# Boot the Terrier JVM only once per kernel. The terrier-prf package is loaded at
# start-up because the RM3 query-expansion cells further down depend on it.
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])



terrier-assemblies 5.9 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done
terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



## **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re # used to clean the data
import os

In [None]:
# NOTE(review): these wildcard imports place every public NLTK stemmer name in the
# notebook namespace. Only PorterStemmer is used below, and it is already imported
# explicitly in the previous cell.
from nltk.stem import *
from nltk.stem.porter import *
# Shared Porter stemmer instance; Steem_text() in the preprocessing section uses it.
stemmer = PorterStemmer()

## **Unzipping the dataset and loading every file into a DataFrame**

In [None]:
import zipfile
# The archive is expected in the notebook's working directory.
zip_file_name = 'cisi.zip'
# Extract everything into ./cisi_dataset; the context manager closes the archive.
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall('cisi_dataset')
# Sanity check: list the extracted files (CISI.ALL, CISI.QRY, CISI.REL are read below).
!ls cisi_dataset

CISI.ALL  CISI.QRY  CISI.REL


In [None]:
def load_cisi_dataset(data_dir):
    """Load the three CISI files stored in ``data_dir``.

    Args:
        data_dir: Directory containing ``CISI.ALL``, ``CISI.QRY`` and ``CISI.REL``.

    Returns:
        A ``(documents_df, queries_df, qrels_df)`` tuple, produced by
        ``read_documents``, ``read_queries`` and ``read_qrels`` respectively.
    """
    def path_of(file_name):
        # Every CISI file lives directly inside data_dir.
        return os.path.join(data_dir, file_name)

    return (
        read_documents(path_of('CISI.ALL')),
        read_queries(path_of('CISI.QRY')),
        read_qrels(path_of('CISI.REL')),
    )

# Read documents from CISI.ALL file
def read_documents(documents_path):
    """Parse the CISI document collection into a DataFrame.

    Each record starts with a ``.I <id>`` line followed by sections introduced by
    marker lines (``.T``, ``.A``, ``.B``, ``.W``, ``.X``). The content of the
    ``.T``/``.A``/``.B``/``.W`` sections is concatenated (space separated) into the
    document text. The ``.X`` section holds tab-separated cross-reference rows and
    is skipped entirely, so no stray reference number ends up in the text.

    Args:
        documents_path: Path to the ``CISI.ALL`` file.

    Returns:
        DataFrame with string columns ``ID`` and ``Text``, one row per document,
        in file order. The frame is empty (but has both columns) for an empty file.
    """
    documents = []
    current_id = None
    current_parts = []
    in_cross_references = False

    def flush():
        # Store the record collected so far, if any.
        if current_id is not None:
            documents.append({'ID': current_id, 'Text': ' '.join(current_parts).strip()})

    with open(documents_path, 'r') as file:
        for line in file:
            if line.startswith('.I'):
                flush()
                current_id = line.strip().split()[1]
                current_parts = []
                in_cross_references = False
            elif line.startswith('.X'):
                # Everything until the next record is cross-reference data.
                in_cross_references = True
            elif line.startswith(('.T', '.A', '.B', '.W')):
                # Section markers carry no content themselves.
                in_cross_references = False
            elif current_id is not None and not in_cross_references:
                # Lines before the first ".I" are ignored instead of crashing.
                current_parts.append(line.strip())

    # Append the last document
    flush()
    return pd.DataFrame(documents, columns=['ID', 'Text'])

# Read queries from CISI.QRY file
def read_queries(queries_path):
    """Parse the CISI query file into a DataFrame.

    A query starts with ``.I <id>``. Marker lines (``.T``, ``.A``, ``.B``, ``.W``)
    are dropped and the content lines of those sections are joined with spaces.
    A ``.X`` section is skipped up to the next ``.I`` line; it no longer aborts
    parsing, so queries that follow it are kept.

    Args:
        queries_path: Path to the ``CISI.QRY`` file.

    Returns:
        DataFrame with string columns ``qid`` and ``raw_query``, one row per
        query in file order (empty for an empty file).
    """
    query_ids = []
    query_texts = []
    current_parts = None  # None until the first ".I" line has been seen
    in_cross_references = False

    with open(queries_path, 'r') as file:
        for line in file:
            if line.startswith('.I'):
                if current_parts is not None:
                    query_texts.append(' '.join(current_parts))
                query_ids.append(line.strip().split()[1])
                current_parts = []
                in_cross_references = False
            elif line.startswith('.X'):
                in_cross_references = True
            elif line.startswith(('.T', '.A', '.B', '.W')):
                # Section markers are structure, not query text.
                in_cross_references = False
            elif current_parts is not None and not in_cross_references:
                current_parts.append(line.strip())

    # Append the last query (only if at least one query was found, so both
    # columns always have the same length).
    if current_parts is not None:
        query_texts.append(' '.join(current_parts))

    return pd.DataFrame({
        'qid': query_ids,
        'raw_query': query_texts})

# Read qrels from CISI.REL file
def read_qrels(qrels_path):
    """Load the CISI relevance judgements.

    CISI.REL is not in TREC qrels order: each whitespace-separated row is
    ``<query id> <document id> 0 0.000000`` and every listed pair is a relevant
    one. Reading it with TREC column order would put the constant ``0`` into
    ``docno`` and ``0.0`` into ``label``.

    Args:
        qrels_path: Path to the ``CISI.REL`` file.

    Returns:
        DataFrame with columns ``qid``, ``Q0``, ``docno``, ``label`` (same names
        and order as before). ``qid``/``docno`` are strings, matching the ids in
        the query and document frames; ``Q0`` is the unused third file column;
        ``label`` is 1 for every listed (query, document) pair.
    """
    raw = pd.read_csv(
        qrels_path,
        sep=r'\s+',  # raw string: '\s' is not a valid escape in a normal literal
        header=None,
        names=['qid', 'docno', 'Q0', 'score'],
        dtype={'qid': str, 'docno': str},
    )
    qrels_df = pd.DataFrame({
        'qid': raw['qid'],
        'Q0': raw['Q0'],
        'docno': raw['docno'],
        'label': 1,
    })
    return qrels_df

In [None]:
# Parse the extracted CISI files into three DataFrames.
documents_df, queries_df, qrels_df = load_cisi_dataset("cisi_dataset")
# NOTE(review): the rename below is intentionally left disabled. Assigning the result
# of rename(..., inplace=True) would set documents_df to None. The indexing cell
# later adds a separate 'docno' column instead.
#documents_df = documents_df.rename(columns={"ID":"docno","Text":"raw_text"},inplace=True)

In [None]:
# Preview the first parsed documents (columns: ID, Text).
documents_df.head()

Unnamed: 0,ID,Text
0,1,18 Editions of the Dewey Decimal Classificatio...
1,2,"Use Made of Technical Libraries Slater, M. Thi..."
2,3,Two Kinds of Power An Essay on Bibliographic C...
3,4,Systems Analysis of a University Library; fina...
4,5,A Library Management Game: a report on a resea...


## **Preprocessing**

In [None]:
# Fetch the NLTK resources used below: the stop-word lists and the 'punkt'
# tokenizer models that word_tokenize() relies on.
nltk.download('stopwords')
nltk.download('punkt')
# A set gives O(1) membership tests in remove_Stopwords().
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def Steem_text(text):
    """Stem every token of ``text``.

    The text is split with ``word_tokenize`` and each token is passed through the
    notebook-level ``stemmer``; the stems are re-joined with single spaces.

    Args:
        text: Input string.

    Returns:
        Space-separated string of stemmed tokens.
    """
    return ' '.join(map(stemmer.stem, word_tokenize(text)))


def clean(text):
    """Normalise raw text before tokenisation.

    Removes URLs, the retweet marker ``RT``, ``@handles`` and a fixed set of
    punctuation characters, then collapses every run of whitespace (including
    tabs and newlines) into one space.

    The retweet marker is only removed when ``RT`` starts a word. Previously the
    pattern ``"RT "`` also matched inside words, turning e.g. "SMART system"
    into "SMA system".

    Args:
        text: Input string.

    Returns:
        The cleaned string without leading/trailing whitespace.
    """
    text = re.sub(r"http\S+", " ", text)       # remove urls
    text = re.sub(r"\bRT ", " ", text)         # remove rt (whole-word prefix only)
    text = re.sub(r"@\w*", " ", text)          # remove handles
    text = re.sub(r"[.,#_|:?/=]", " ", text)   # remove special characters
    text = re.sub(r"\s+", " ", text)           # tabs, newlines and repeated spaces
    return text.strip()

def remove_Stopwords(text):
    """Lower-case ``text`` and drop its stop words.

    Args:
        text: Input string; it is tokenised with ``word_tokenize``.

    Returns:
        Space-separated string of the lower-cased tokens that are not in the
        notebook-level ``stop_words`` set.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return ' '.join(kept)

def process(sentence):
    """Run the full preprocessing pipeline on one string.

    Steps: ``clean`` -> ``remove_Stopwords`` -> ``Steem_text``.

    Stop words are removed *before* stemming. Stemming first rewrites stop words
    into forms that are not in the stop list (e.g. "this" -> "thi"), so they
    slipped through the filter and ended up in the processed text.

    Args:
        sentence: Raw document or query text.

    Returns:
        Cleaned, lower-cased, stop-word-free, stemmed text.
    """
    cleaned = clean(sentence)
    without_stopwords = remove_Stopwords(cleaned)
    return Steem_text(without_stopwords)




In [None]:
# Show the raw documents for comparison with the processed version in the next cell.
print("Data before processing: ")
documents_df.head()

Data before processing: 


Unnamed: 0,ID,Text
0,1,18 Editions of the Dewey Decimal Classificatio...
1,2,"Use Made of Technical Libraries Slater, M. Thi..."
2,3,Two Kinds of Power An Essay on Bibliographic C...
3,4,Systems Analysis of a University Library; fina...
4,5,A Library Management Game: a report on a resea...


In [None]:
print("Data after processing: ")
# Adds a 'processed_text' column in place; 'Text' keeps the original wording, which
# the UI later shows to the user.
documents_df["processed_text"] = documents_df["Text"].apply(process)
documents_df.head()

Data after processing: 


Unnamed: 0,ID,Text,processed_text
0,1,18 Editions of the Dewey Decimal Classificatio...,18 edit dewey decim classif comaromi j p prese...
1,2,"Use Made of Technical Libraries Slater, M. Thi...",use made technic librari slater thi report ana...
2,3,Two Kinds of Power An Essay on Bibliographic C...,two kind power essay bibliograph control wilso...
3,4,Systems Analysis of a University Library; fina...,system analysi univers librari ; final report ...
4,5,A Library Management Game: a report on a resea...,librari manag game report research project bro...


## **Indexing**

In [None]:
# Build (or rebuild, because overwrite=True) an on-disk Terrier index in this folder.
indexer = pt.DFIndexer("./socuments_dfsetindex", overwrite=True)
# Index the processed text and store each document's docno as index metadata.
# PyTerrier identifies documents by a string 'docno', hence the cast.
documents_df['docno'] = documents_df['ID'].astype(str)
index_ref = indexer.index(documents_df["processed_text"], documents_df["docno"])
# NOTE(review): the path is shown twice, once by print() and once as the cell's value.
print(index_ref.toString())
index_ref.toString()

./socuments_dfsetindex/data.properties


'./socuments_dfsetindex/data.properties'

In [None]:
# Open the index from its reference; the retrieval and RM3 cells below all use `index`.
index = pt.IndexFactory.of(index_ref)

In [None]:
def inverted_Index(coll):
    """Build a simple inverted index over a collection of texts.

    Documents are numbered from 1 in iteration order. Each whitespace-separated
    word is stripped of the characters ``: , . ; "`` and lower-cased; words that
    become empty are ignored.

    Args:
        coll: Iterable of strings (e.g. a pandas Series of processed texts).

    Returns:
        DataFrame with one row per distinct term, sorted by term, with columns:
        ``index`` (position of the term's first occurrence in the token stream),
        ``Term``, ``docID`` (first document containing the term),
        ``Posting_list`` (ascending list of documents containing the term) and
        ``doc_Freq`` (number of documents containing the term, i.e. the length
        of the posting list).
    """
    punctuation = {":", ",", ".", ";", '"'}
    terms = []
    doc_ids = []
    posting_list = {}

    for doc_index, doc in enumerate(coll, start=1):
        for word in doc.split():
            term = ''.join(char for char in word if char not in punctuation).lower()
            if not term:
                continue
            terms.append(term)
            doc_ids.append(doc_index)
            postings = posting_list.setdefault(term, [])
            # Documents are visited in increasing order, so checking the last
            # entry is enough to avoid duplicates (no linear scan needed).
            if not postings or postings[-1] != doc_index:
                postings.append(doc_index)

    # Document frequency = number of distinct documents, not number of occurrences.
    doc_freq = {term: len(postings) for term, postings in posting_list.items()}

    df = pd.DataFrame({'Term': terms, 'docID': doc_ids})
    df['Posting_list'] = df['Term'].map(posting_list)
    df['doc_Freq'] = df['Term'].map(doc_freq)
    # A stable sort keeps occurrences of a term in document order, so that
    # drop_duplicates() deterministically keeps the first occurrence.
    sorted_df = df.sort_values(by='Term', kind='mergesort').reset_index()
    sorted_df = sorted_df.drop_duplicates(subset=['Term'])
    return sorted_df
#ngrock

In [None]:
# Hand-rolled inverted index over the processed texts, for inspection only: retrieval
# below goes through the Terrier index, not through this table.
indexed = inverted_Index(documents_df["processed_text"])
indexed

Unnamed: 0,index,Term,docID,Posting_list,doc_Freq
0,77173,!,918,"[92, 349, 918, 952, 1050]",5
5,20000,$,220,"[188, 189, 220, 391, 551, 921, 1374]",14
19,53181,%,614,"[40, 52, 62, 76, 145, 190, 225, 253, 503, 576,...",113
132,36939,&,426,"[91, 121, 127, 153, 424, 425, 426, 427, 428, 4...",32
164,9301,',109,"[8, 12, 42, 88, 104, 109, 110, 113, 137, 147, ...",188
...,...,...,...,...,...
117149,4718,zund,57,"[57, 81]",2
117151,90951,zvezhinskii,1096,[1096],1
117152,91489,zyabrev,1106,[1106],1
117153,38181,{,443,[443],2


In [None]:
# Notebook-wide TF-IDF retriever returning the top 10 documents; elmo() uses it for
# first-stage retrieval.
tfidf_retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"},num_results=10)


## **Query Expansion**

In [None]:
# Preview the parsed queries (columns: qid, raw_query).
queries_df.head()

Unnamed: 0,qid,raw_query
0,1,What problems and concerns are there in making...
1,2,"How can actually pertinent data, as opposed to..."
2,3,What is information science? Give definitions...
3,4,Image recognition and any other methods of aut...
4,5,What special training will ordinary researcher...


In [None]:
# BM25 baseline limited to the top 10 documents.
bm25 = pt.BatchRetrieve(index, wmodel="BM25",num_results=10)

# Use the first CISI query as the running example and preprocess it the same way as
# the documents. `query` and `answer` are reused by the expansion cells below.
query = queries_df["raw_query"][0]
query = process(query)
answer = bm25.search(query)
answer

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,428,429,0,11.84874,problem concern make descript titl difficulti ...
1,1,721,722,1,10.638468,problem concern make descript titl difficulti ...
2,1,1298,1299,2,10.292926,problem concern make descript titl difficulti ...
3,1,64,65,3,10.133325,problem concern make descript titl difficulti ...
4,1,758,759,4,9.951263,problem concern make descript titl difficulti ...
5,1,75,76,5,9.60229,problem concern make descript titl difficulti ...
6,1,927,928,6,8.948765,problem concern make descript titl difficulti ...
7,1,1420,1421,7,8.787256,problem concern make descript titl difficulti ...
8,1,665,666,8,8.569836,problem concern make descript titl difficulti ...
9,1,1089,1090,9,8.5246,problem concern make descript titl difficulti ...


In [None]:

# NOTE(review): bm25 is re-created with the same settings as in the previous cell.
bm25 = pt.BatchRetrieve(index, wmodel="BM25",num_results=10)
# RM3 adds up to fb_terms expansion terms taken from the feedback documents.
# NOTE(review): fb_docs=100 exceeds num_results=10 of the retriever feeding it, so at
# most 10 feedback documents are presumably available. TODO confirm intent.
rm3_expander = pt.rewrite.RM3(index,fb_terms=10, fb_docs=100)

# Retrieve with BM25, then rewrite the query from the retrieved documents.
rm3_qe = bm25 >> rm3_expander
expanded_query = rm3_qe.search(query).iloc[0]["query"]

# NOTE(review): a bare expression in the middle of a cell displays nothing.
expanded_query

# The first token of the rewritten query is skipped (presumably a control prefix
# rather than a term. TODO confirm); the rest print as term^weight pairs.
for s in expanded_query.split()[1:]:
  print(s)

print("\n" + query)

problem^0.037500001
relev^0.037500001
involv^0.037500001
physic^0.027977895
automat^0.055471912
pattern^0.019037995
journal^0.063031383
articl^0.153639466
difficulti^0.037500001
approxim^0.037500001
content^0.057388879
retriev^0.037500001
concern^0.037500001
catalog^0.022908255
descript^0.037500001
usual^0.037500001
scatter^0.020215601
word^0.025646402
titl^0.217182189

problem concern make descript titl difficulti involv automat retriev articl approxim titl usual relev content articl titl


In [None]:
# Search again with the expanded query: drop its first token and keep the weighted
# term^weight tokens as the new query string.
expanded_query_formatted = ' '.join(expanded_query.split()[1:])

results_wqe = bm25.search(expanded_query_formatted)

# Side-by-side top 5: columns suffixed _1 come from the original query, _2 from the
# expanded one.
print("   Before Expansion    After Expansion")
print(pd.concat([answer[['docid','score']][0:5].add_suffix('_1'),
            results_wqe[['docid','score']][0:5].add_suffix('_2')], axis=1).fillna(''))



   Before Expansion    After Expansion
   docid_1    score_1  docid_2    score_2
0      428  11.848740     1298  12.943719
1      721  10.638468       75  12.512497
2     1298  10.292926      721  12.089670
3       64  10.133325      758  11.912545
4      758   9.951263      428  11.700616


## **RM3**

In [None]:
def process_query(query,documents_df):
    """Retrieve documents for ``query`` using TF-IDF with RM3 query expansion.

    The query is preprocessed with ``process``, a first TF-IDF pass retrieves the
    feedback documents, RM3 rewrites the query from them and a second TF-IDF pass
    runs the expanded query.

    Args:
        query: Raw user query (``None`` is treated as an empty query).
        documents_df: Document frame with string ``docno`` and ``processed_text``
            columns.

    Returns:
        Series with the ``processed_text`` of the retrieved documents, ordered by
        retrieval rank (best first) and keeping ``documents_df``'s index labels.
        The Series is empty when the query is empty or nothing is retrieved;
        previously that case raised ``IndexError`` on ``.iloc[0]``.
    """
    no_results = documents_df["processed_text"].iloc[0:0]

    query = process(query or "")
    if not query:
        return no_results

    tfidf_retr = pt.BatchRetrieve(index, controls={"wmodel": "TF_IDF"}, num_results=10)
    rm3_expander = pt.rewrite.RM3(index, fb_terms=10, fb_docs=100)

    # First pass: feedback documents for RM3.
    initial_results = tfidf_retr.search(query)
    if initial_results.empty:
        return no_results

    # Rewrite the query from the first-pass results (no second first-pass retrieval).
    expanded = rm3_expander.transform(initial_results)
    if expanded.empty:
        return no_results
    expanded_query = expanded.iloc[0]["query"]
    # The first token of the rewritten query is dropped, as in the demo cells above.
    expanded_query_formatted = ' '.join(expanded_query.split()[1:])

    results_wqe = tfidf_retr.search(expanded_query_formatted)

    # isin() alone would return the hits in corpus order; re-order them by rank.
    ranked_docnos = results_wqe.sort_values("rank")["docno"].tolist()
    rank_of = {docno: position for position, docno in enumerate(ranked_docnos)}
    relevant_docs = documents_df[documents_df["docno"].isin(ranked_docnos)]
    order = relevant_docs["docno"].map(rank_of).to_numpy().argsort(kind="stable")

    return relevant_docs["processed_text"].iloc[order]


## **ELMo**

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# NOTE(review): numpy is already imported as np in the "Importing Libraries" cell.
import numpy as np

# Load the ELMo v3 model from TF Hub; the model is fetched from this URL.
Elmo = hub.load("https://tfhub.dev/google/elmo/3")

def cosine_similarity(v1, v2):
    """Return ``dot(v1, v2.T)`` divided by the product of the two norms.

    For 1-D vectors this is the usual cosine similarity. ``np.linalg.norm`` is
    called without an axis, so for 2-D inputs the norms are the norms of the
    whole arrays and the result is a scaled matrix product.
    """
    magnitude = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2.T) / magnitude
def elmo(query):
    """Retrieve with TF-IDF, then re-rank the hits by ELMo cosine similarity.

    Fixes compared with the previous version:
    * the query is embedded as one sentence (``[query]``), not as a list of its
      individual characters (``list(query)``);
    * sentence vectors come from the model's mean-pooled ``"default"`` output, so
      each text is one vector instead of a padded token matrix;
    * documents are ordered by similarity (best first) instead of by docno string;
    * an empty retrieval result returns an empty frame instead of crashing;
    * ``documents_df`` is no longer modified as a side effect.

    Args:
        query: Query text.

    Returns:
        One-column DataFrame (``Text``) with the original text of the retrieved
        documents, most similar to the query first.
    """
    answer = tfidf_retr.search(query)
    docnos = documents_df["docno"].astype(str)
    relevant_docs = documents_df[docnos.isin(answer["docno"])]
    if relevant_docs.empty:
        return pd.DataFrame(relevant_docs["Text"])

    encode = Elmo.signatures["default"]
    relevant_texts = relevant_docs["processed_text"].tolist()
    doc_vectors = encode(tf.constant(relevant_texts, dtype=tf.string))["default"].numpy()
    query_vector = encode(tf.constant([query], dtype=tf.string))["default"].numpy()[0]

    # Cosine similarity of every document vector with the query vector; a zero
    # vector gets similarity 0 instead of a division by zero.
    norms = np.linalg.norm(doc_vectors, axis=1) * np.linalg.norm(query_vector)
    similarities = np.divide(
        doc_vectors @ query_vector, norms, out=np.zeros_like(norms), where=norms > 0
    )
    ranking = np.argsort(-similarities, kind="stable")

    return pd.DataFrame(relevant_docs["Text"].iloc[ranking])

In [None]:
# Example: documents returned by the ELMo cell above for a short query.
elmo("information systems")

Unnamed: 0,Text
457,"Information Retrieval Systems Lancaster, F.W. ..."
537,"Information Retrieval Systems Lancaster, F.W. ..."
590,The Cost-Effectiveness Analysis of Information...
614,A Cost Model for Evaluating Information Retrie...
688,"The GREMAS System, an Integral Part of the IDC..."
708,The Chemical Abstracts Service Chemical Regist...
871,The Shared Cataloging System of the Ohio Colle...
1037,"Management Misinformation Systems Ackoff, R.L...."
1135,Data Retrieval Systems: Specifics and Problem...
1340,Communication Nets in Science: Status and Cita...


## **Project User Interface**

In [None]:
# Flask serves the search UI.
# NOTE(review): flask-ngrok is installed here but never imported in this notebook;
# the tunnel is opened with pyngrok instead.
!pip install flask flask-ngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
# pyngrok opens the public tunnel to the local Flask server.
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


In [None]:
# SECURITY: never commit an ngrok authtoken to a notebook. The token that used to be
# written here is public now and should be revoked in the ngrok dashboard.
# The token is read from the NGROK_AUTHTOKEN environment variable, or asked for
# interactively (getpass does not echo it or store it in the notebook).
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from flask import Flask
from pyngrok import ngrok

In [None]:
# Local port shared by the Flask server and the ngrok tunnel.
port_no = 5000

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
###the final one

import os
import time
from flask import Flask, request, render_template_string
import pyterrier as pt

app = Flask(__name__)

# SECURITY: the ngrok authtoken must not be hard-coded in the notebook (the token
# that was written here is public now and should be revoked). When the
# NGROK_AUTHTOKEN environment variable is set it is used; otherwise pyngrok falls
# back to the token already saved in the ngrok configuration file.
_ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN")
if _ngrok_authtoken:
    ngrok.set_auth_token(_ngrok_authtoken)

# Open the public tunnel to the local Flask port.
public_url =  ngrok.connect(port_no).public_url

#

# Record the start time


def cosine_similarity(v1, v2):
    """Return ``dot(v1, v2.T)`` divided by the product of the two norms.

    For 1-D vectors this is the usual cosine similarity. ``np.linalg.norm`` is
    called without an axis, so for 2-D inputs the norms are the norms of the
    whole arrays and the result is a scaled matrix product.
    """
    magnitude = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2.T) / magnitude
def process_query(query,documents_df):
    """Retrieve documents for ``query`` using TF-IDF with RM3 query expansion.

    The query is preprocessed with ``process``, a first TF-IDF pass retrieves the
    feedback documents, RM3 rewrites the query from them and a second TF-IDF pass
    runs the expanded query.

    Args:
        query: Raw user query (``None`` is treated as an empty query).
        documents_df: Document frame with string ``docno`` and ``Text`` columns.

    Returns:
        Series with the original ``Text`` of the retrieved documents, ordered by
        retrieval rank (best first) and keeping ``documents_df``'s index labels.
        The Series is empty when the query is empty or nothing is retrieved;
        previously that case raised ``IndexError`` on ``.iloc[0]``.
    """
    no_results = documents_df["Text"].iloc[0:0]

    query = process(query or "")
    if not query:
        return no_results

    tfidf_retr = pt.BatchRetrieve(index, controls={"wmodel": "TF_IDF"}, num_results=10)
    rm3_expander = pt.rewrite.RM3(index, fb_terms=10, fb_docs=100)

    # First pass: feedback documents for RM3.
    initial_results = tfidf_retr.search(query)
    if initial_results.empty:
        return no_results

    # Rewrite the query from the first-pass results (no second first-pass retrieval).
    expanded = rm3_expander.transform(initial_results)
    if expanded.empty:
        return no_results
    expanded_query = expanded.iloc[0]["query"]
    # The first token of the rewritten query is dropped before searching again.
    expanded_query_formatted = ' '.join(expanded_query.split()[1:])

    results_wqe = tfidf_retr.search(expanded_query_formatted)

    # isin() alone would return the hits in corpus order; re-order them by rank so
    # the UI lists the best match first.
    ranked_docnos = results_wqe.sort_values("rank")["docno"].tolist()
    rank_of = {docno: position for position, docno in enumerate(ranked_docnos)}
    relevant_docs = documents_df[documents_df["docno"].isin(ranked_docnos)]
    order = relevant_docs["docno"].map(rank_of).to_numpy().argsort(kind="stable")

    return relevant_docs["Text"].iloc[order]

HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Ahmed Ibrahim Search Engine</title>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css">
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
            background-color: #f5f5f5;
            color: #333;
        }
        .container {
            max-width: 800px;
            margin: 20px auto;
            padding: 20px;
            background-color: #fff;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
        }
        h1 {
            text-align: center;
            color: #333;
        }
        form {
            text-align: center;
            margin-bottom: 20px;
        }
        input[type="text"] {
            padding: 10px;
            width: 70%;
            border: 1px solid #ccc;
            border-radius: 5px;
            font-size: 16px;
        }
        input[type="submit"] {
            padding: 10px 20px;
            background-color: #007bff;
            color: #fff;
            border: none;
            border-radius: 5px;
            cursor: pointer;
            font-size: 16px;
        }
        input[type="submit"]:hover {
            background-color: #0056b3;
        }
        #searchResults {
            margin-top: 20px;
            padding: 10px;
            background-color: #f9f9f9;
            border-radius: 5px;
            box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
        }
        ul {
            list-style-type: none;
            padding: 0;
        }
        li {
            margin-bottom: 10px;
            padding: 10px;
            background-color: #fff;
            border-radius: 5px;
            box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>Ahmed Ibrahim Search Engine</h1>
        <form id="searchForm">
            <input type="text" id="query" name="query" placeholder="Search Here">
            <input type="submit" value="Search">
        </form>
        <div id="searchResults"></div>
    </div>

    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
    <script>
        // Function to handle form submission
        $('#searchForm').submit(function(event) {
            // Prevent default form submission
            event.preventDefault();

            // Get the query from the input field
            var query = $('#query').val();

            // Send the query to the server using AJAX
            $.ajax({
                type: 'POST',
                url: '/search',
                data: { query: query },
                success: function(response) {
                    // Update the search results div with the response
                    $('#searchResults').html(response);
                },
                error: function(xhr, status, error) {
                    // Handle errors
                    console.error(error);
                }
            });
        });
    </script>
</body>
</html>

"""

@app.route("/", methods=["GET"])
def home():
    """Serve the search page; its form posts the query to ``/search`` via AJAX."""
    return render_template_string(HTML_TEMPLATE)

@app.route("/search", methods=["POST"])
def search():
    """Handle a search request and return the results as an HTML fragment.

    Reads the ``query`` form field, runs ``process_query`` and returns the number
    of results, the elapsed time and the result list. The page inserts this
    fragment with ``.html()``, so every document text is HTML-escaped first.

    Returns:
        HTML string. A missing or blank query yields a short hint instead of an
        exception.
    """
    from html import escape  # standard library; local import keeps the cell self-contained

    start_time = time.time()
    query = (request.form.get("query") or "").strip()
    if not query:
        return "<p>Please enter a search query.</p>"

    # Perform search operation and retrieve search results
    try:
        search_results = process_query(query,documents_df)
    except IndexError:
        # Presumably raised by process_query when the first retrieval pass returns
        # no documents; it is reported as "no results" instead of a 500 error.
        app.logger.warning("No documents retrieved for query %r", query)
        search_results = []

    # Calculate the elapsed time
    elapsed_time = time.time() - start_time

    # Format search results and include elapsed time
    result_items = "".join(f"<li>{escape(str(result))}</li>" for result in search_results)
    results_html = f"<p>Effectiveness: {len(search_results)}</p>"
    results_html += f"<p>Efficiency: {elapsed_time:.2f} seconds</p>"
    results_html += "<h2>Search Results</h2>"
    results_html += "<ul>"
    results_html += result_items
    results_html += "</ul>"

    return results_html

# Print the public ngrok URL that forwards to the local server.
print(f"To acces the Gloable link please click {public_url}")

# Start the Flask server. This call blocks the cell until it is interrupted, so the
# cells below only run after the server has been stopped.
app.run(port=port_no)

To acces the Gloable link please click https://bf00-104-199-133-183.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [12/May/2024 10:35:53] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [12/May/2024 10:35:55] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [12/May/2024 10:35:57] "POST /search HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [12/May/2024 10:38:36] "POST /search HTTP/1.1" 200 -
ERROR:__main__:Exception on /search [POST]
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 1799, in dispatch_request
    return self.en

In [None]:
def process_query(query,documents_df):
    """Retrieve documents for ``query`` using TF-IDF with RM3 query expansion.

    The query is preprocessed with ``process``, a first TF-IDF pass retrieves the
    feedback documents, RM3 rewrites the query from them and a second TF-IDF pass
    runs the expanded query.

    NOTE(review): this redefines ``process_query`` from the UI cell and returns
    ``processed_text`` instead of ``Text``; the later definition is the one in
    effect afterwards.

    Args:
        query: Raw user query (``None`` is treated as an empty query).
        documents_df: Document frame with string ``docno`` and ``processed_text``
            columns.

    Returns:
        Series with the ``processed_text`` of the retrieved documents, ordered by
        retrieval rank (best first) and keeping ``documents_df``'s index labels.
        The Series is empty when the query is empty or nothing is retrieved;
        previously that case raised ``IndexError`` on ``.iloc[0]``.
    """
    no_results = documents_df["processed_text"].iloc[0:0]

    query = process(query or "")
    if not query:
        return no_results

    tfidf_retr = pt.BatchRetrieve(index, controls={"wmodel": "TF_IDF"}, num_results=10)
    rm3_expander = pt.rewrite.RM3(index, fb_terms=10, fb_docs=100)

    # First pass: feedback documents for RM3.
    initial_results = tfidf_retr.search(query)
    if initial_results.empty:
        return no_results

    # Rewrite the query from the first-pass results (no second first-pass retrieval).
    expanded = rm3_expander.transform(initial_results)
    if expanded.empty:
        return no_results
    expanded_query = expanded.iloc[0]["query"]
    # The first token of the rewritten query is dropped before searching again.
    expanded_query_formatted = ' '.join(expanded_query.split()[1:])

    results_wqe = tfidf_retr.search(expanded_query_formatted)

    # isin() alone would return the hits in corpus order; re-order them by rank.
    ranked_docnos = results_wqe.sort_values("rank")["docno"].tolist()
    rank_of = {docno: position for position, docno in enumerate(ranked_docnos)}
    relevant_docs = documents_df[documents_df["docno"].isin(ranked_docnos)]
    order = relevant_docs["docno"].map(rank_of).to_numpy().argsort(kind="stable")

    return relevant_docs["processed_text"].iloc[order]

# Example usage: processed text of the documents retrieved for a short query.
process_query("information systems",documents_df)



253     chemic inform system hyde e ash j e purpos che...
457     inform retriev system lancast f w thi book con...
458     inform retriev on-lin lancast f w thi book dea...
537     inform retriev system lancast f w 1972 thi boo...
669     chemic structur storag search system develop d...
689     experi mechan chemic biolog inform retriev sys...
706     french nation polici chemic inform darc system...
1037    manag misinform system ackoff r l five assumpt...
1091    method relat structur properti chemic compound...
1124    contribut theori system inform flow kozachkov ...
Name: processed_text, dtype: object

In [None]:
# NOTE(review): this evaluation runs on PyTerrier's built-in "vaswani" test
# collection, not on the CISI index built above, so the scores say nothing about
# the CISI search engine.
dataset = pt.get_dataset("vaswani")
# vaswani dataset provides an index, topics and qrels
tfidf = pt.BatchRetrieve(dataset.get_index(), wmodel="TF_IDF")
# NOTE(review): this rebinds the notebook-level `bm25`, which pointed at the CISI
# index until now; re-running earlier cells afterwards would silently use vaswani.
bm25 = pt.BatchRetrieve(dataset.get_index(), wmodel="BM25")

# Perform experiment: compare both retrievers on mean average precision and
# reciprocal rank. The result is bound to two names, `exp` and `experiment`.
exp = experiment = pt.Experiment(
    [tfidf, bm25],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=["map", "recip_rank"],

)
exp

Downloading vaswani index to /root/.pyterrier/corpora/vaswani/index


data.direct.bf:   0%|          | 0.00/388k [00:00<?, ?iB/s]

data.document.fsarrayfile:   0%|          | 0.00/234k [00:00<?, ?iB/s]

data.inverted.bf:   0%|          | 0.00/362k [00:00<?, ?iB/s]

data.lexicon.fsomapfile:   0%|          | 0.00/682k [00:00<?, ?iB/s]

data.lexicon.fsomaphash:   0%|          | 0.00/777 [00:00<?, ?iB/s]

data.lexicon.fsomapid:   0%|          | 0.00/30.3k [00:00<?, ?iB/s]

data.meta-0.fsomapfile:   0%|          | 0.00/725k [00:00<?, ?iB/s]

data.meta.idx:   0%|          | 0.00/89.3k [00:00<?, ?iB/s]

data.meta.zdata:   0%|          | 0.00/224k [00:00<?, ?iB/s]

data.properties:   0%|          | 0.00/4.29k [00:00<?, ?iB/s]

md5sums:   0%|          | 0.00/619 [00:00<?, ?iB/s]

Downloading vaswani topics to /root/.pyterrier/corpora/vaswani/query-text.trec


query-text.trec:   0%|          | 0.00/3.05k [00:00<?, ?iB/s]

Downloading vaswani qrels to /root/.pyterrier/corpora/vaswani/qrels


qrels:   0%|          | 0.00/6.63k [00:00<?, ?iB/s]

Unnamed: 0,name,map,recip_rank
0,BR(TF_IDF),0.290905,0.699168
1,BR(BM25),0.296517,0.725665


In [None]:
def elmo(query):
    """Retrieve with TF-IDF, then re-rank the hits by ELMo cosine similarity.

    Fixes compared with the previous version:
    * sentence vectors come from the model's mean-pooled ``"default"`` output, so
      each text is one vector instead of a padded token matrix;
    * documents are ordered by similarity (best first) instead of by docno string;
    * an empty retrieval result returns an empty Series instead of crashing.

    Args:
        query: Query text.

    Returns:
        Series with the original ``Text`` of the retrieved documents, most
        similar to the query first.
    """
    answer = tfidf_retr.search(query)
    relevant_docs = documents_df[documents_df["docno"].isin(answer["docno"])]
    if relevant_docs.empty:
        return relevant_docs["Text"]

    encode = Elmo.signatures["default"]
    relevant_texts = relevant_docs["processed_text"].tolist()
    doc_vectors = encode(tf.constant(relevant_texts, dtype=tf.string))["default"].numpy()
    # Wrap query in a list so it is embedded as one sentence
    query_vector = encode(tf.constant([query], dtype=tf.string))["default"].numpy()[0]

    # Cosine similarity of every document vector with the query vector; a zero
    # vector gets similarity 0 instead of a division by zero.
    norms = np.linalg.norm(doc_vectors, axis=1) * np.linalg.norm(query_vector)
    similarities = np.divide(
        doc_vectors @ query_vector, norms, out=np.zeros_like(norms), where=norms > 0
    )
    ranking = np.argsort(-similarities, kind="stable")

    return relevant_docs["Text"].iloc[ranking]

# NOTE(review): the type(...) result is discarded (it is not the last expression of
# the cell), and elmo() is executed twice, repeating retrieval and embedding.
type(elmo("information systems"))
# Print the full text of every returned document.
print([result for result in elmo("information systems")])

['Information Retrieval Systems Lancaster, F.W. This book is concerned primarily with those "intellectual" factors that significantly affect the performance of all information retrieval systems; namely,  - indexing policy and practice - vocabulary control - searching strategies - interaction between the system and its users  My viewpoint is that of the evaluator of information systems.  I have therefore paid considerable attention to a discussion of the requirements of users of information systems and the measurement of system performance in terms of the efficient and economical satisfaction of these requirements. The book does not concern itself, except indirectly, with equipment for the implementation of retrieval systems, a topic that is adequately covered by other volumes in this seris.  Moreover, it is my contention that the importance of "hardware" and "data processing" aspects of information systems has been exaggerated in the United States, with some detriment to the performanc

In [None]:
#
def elmo(query):
    """Retrieve with TF-IDF, then re-rank the hits by ELMo cosine similarity.

    This is the last definition of ``elmo`` in the notebook and therefore the one
    in effect. Fixes compared with the previous version:
    * the query is embedded as one sentence (``[query]``), not as a list of its
      individual characters (``list(query)``);
    * sentence vectors come from the model's mean-pooled ``"default"`` output, so
      each text is one vector instead of a padded token matrix;
    * documents are ordered by similarity (best first) instead of by docno string;
    * an empty retrieval result returns an empty Series instead of crashing;
    * ``documents_df`` is no longer modified as a side effect.

    Args:
        query: Query text.

    Returns:
        Series with the original ``Text`` of the retrieved documents, most
        similar to the query first.
    """
    answer = tfidf_retr.search(query)
    docnos = documents_df["docno"].astype(str)
    relevant_docs = documents_df[docnos.isin(answer["docno"])]
    if relevant_docs.empty:
        return relevant_docs["Text"]

    encode = Elmo.signatures["default"]
    relevant_texts = relevant_docs["processed_text"].tolist()
    doc_vectors = encode(tf.constant(relevant_texts, dtype=tf.string))["default"].numpy()
    query_vector = encode(tf.constant([query], dtype=tf.string))["default"].numpy()[0]

    # Cosine similarity of every document vector with the query vector; a zero
    # vector gets similarity 0 instead of a division by zero.
    norms = np.linalg.norm(doc_vectors, axis=1) * np.linalg.norm(query_vector)
    similarities = np.divide(
        doc_vectors @ query_vector, norms, out=np.zeros_like(norms), where=norms > 0
    )
    ranking = np.argsort(-similarities, kind="stable")

    return relevant_docs["Text"].iloc[ranking]