In [None]:
!pip install python-terrier --quiet
!pip install nltk --quiet
!pip install --ignore-installed blinker --quiet
!pip install git+https://github.com/experimaestro/experimaestro-ir.git --quiet
!pip install transformers --quiet
!pip install streamlit --quiet

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import tkinter as tk
import pandas as pd
import numpy as np
import re
import os
import pyterrier as pt
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])
import tensorflow as tf
import tensorflow_hub as hub
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import torch
from xpmir.models import AutoModel
pd.set_option('display.max_colwidth', 150)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
import ir_datasets
import streamlit as st

cuda:0


# **Data Collection**

In [None]:
# Load the WikiCLIR "en-simple" collection through ir_datasets.
dataset = ir_datasets.load("wikiclir/en-simple")

# Documents: materialise the iterator, then rename the id column to `docno`,
# the name the rest of the notebook uses.
docs_df = pd.DataFrame(list(dataset.docs_iter())).rename(columns={'doc_id': 'docno'})

# Queries: the `title` field becomes the query text and `query_id` becomes `qid`.
queries_df = pd.DataFrame(list(dataset.queries_iter())).rename(
    columns={'title': 'query', 'query_id': 'qid'}
)

# Relevance judgements, using the same `qid` / `docno` naming as the frames above.
qrels_df = pd.DataFrame(list(dataset.qrels_iter())).rename(
    columns={'doc_id': 'docno', 'query_id': 'qid'}
)

In [None]:
# Preview the documents table (columns: docno, title, text).
docs_df

Unnamed: 0,docno,title,text
0,1,April,"april is the fourth month of the year , and comes between march and may . it is one of four months to have 30 days . april always begins on the s..."
1,2,August,"august is the eighth month of the year in the gregorian calendar , coming between july and september . it has 31 days , the same number of days a..."
2,6,Art,art is a creative activity by people . the artist hopes that it affects the emotions of people who experience it . artists express themselves by ...
3,8,A,"a is the first letter of the english alphabet . the small letter , a , is used as a lower case vowel . however , the english long a ( ā ) is said..."
4,9,Air,air is the earth 's atmosphere . air around us is a mixture of many gases and dust particles . it is the clear gas in which living things live an...
...,...,...,...
127084,594690,Culture of Egypt,"the culture of egypt has thousands of years of recorded history . ancient egypt was among the earliest civilizations in africa . for millennia , ..."
127085,594691,Capsicum annuum,capsicum annum is a scientific name of capsicum
127086,594696,Binding energy,"a binding energy is the negative potential energy ( `` i.e . `` , energy debt ) pulling a bound system together . conversely , an unbinding energ..."
127087,594701,Fort Lauderdale Strikers,is a god like team whos has been working its way up to even beat fc barcelona


In [None]:
# Keep only the query id and query text. Reassigning (instead of inplace=True)
# together with errors='ignore' makes this cell safe to re-run: a second run
# is a no-op rather than a KeyError on the already-removed column.
queries_df = queries_df.drop(columns=['first_sent'], errors='ignore')
queries_df

Unnamed: 0,qid,query
0,12,Anarchism
1,25,Autism
2,39,Albedo
3,290,A
4,303,Alabama
...,...,...
114567,54933077,Vera Glagoleva
114568,54936329,2017 Barcelona attack
114569,54950592,2017 Turku stabbing
114570,54963595,Sirkka Selja


In [None]:
# Preview the relevance judgements (columns: qid, docno, relevance, iteration).
qrels_df

Unnamed: 0,qid,docno,relevance,iteration
0,12,4807,2,Q0
1,12,4080,1,Q0
2,12,60610,1,Q0
3,12,798,1,Q0
4,12,12446,1,Q0
...,...,...,...,...
250375,54933077,594407,2,Q0
250376,54936329,594463,2,Q0
250377,54950592,594606,2,Q0
250378,54963595,594686,2,Q0


# **Preprocessing**


In [None]:
# NLTK resources needed by preprocess_text:
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stop-word lists read by stopwords.words('english')
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# `df` is an alias of docs_df, not a copy: columns added to `df` later
# also appear on docs_df.
df = docs_df

In [None]:
# Lazily-filled cache so the stop-word set, lemmatizer and stemmer are built
# once instead of once per document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    '''Return the shared stop-word set, lemmatizer and stemmer, building them on first use.'''
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES


def preprocess_text(sen):
    '''Cleans text data, tokenizes, lemmatizes, and stems the text.

    Steps: lowercase, strip HTML tags, replace every non-letter character
    with a space, tokenize, drop English stop words, lemmatize, stem, and
    join the remaining tokens with single spaces.

    Args:
        sen: The raw text. None and NaN are treated as empty text; any other
            non-string value is converted with str().

    Returns:
        The cleaned text as a single space-separated string ('' when nothing
        survives the cleaning).
    '''
    # Missing values (None, or float NaN as pandas uses for empty cells)
    # have no text to clean.
    if sen is None or (isinstance(sen, float) and sen != sen):
        return ''

    # Convert to lowercase
    sentence = str(sen).lower()

    # Remove html tags
    sentence = re.sub(r'<[^>]+>', ' ', sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Tokenize
    words = word_tokenize(sentence)

    resources = _get_preprocess_resources()
    stop_words = resources['stop_words']
    lemmatizer = resources['lemmatizer']
    stemmer = resources['stemmer']

    # Remove stopwords, then lemmatize and stem each remaining token
    words = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in words
        if word not in stop_words
    ]

    # Join the words back into a single string
    return ' '.join(words)
# Store the cleaned text in a new column. The column name is spelled
# 'proccessed_Text' everywhere it is used in this notebook.
df['proccessed_Text'] = df['text'].apply(preprocess_text)

In [None]:
# Inspect the frame, which now includes the proccessed_Text column.
df

Unnamed: 0,docno,title,text,proccessed_Text
0,1,April,"april is the fourth month of the year , and comes between march and may . it is one of four months to have 30 days . april always begins on the s...",april fourth month year come march may one four month day april alway begin day week juli addit januari leap year april alway end day week decemb ...
1,2,August,"august is the eighth month of the year in the gregorian calendar , coming between july and september . it has 31 days , the same number of days a...",august eighth month year gregorian calendar come juli septemb day number day previou month juli name roman emperor augustu caesar august n begin d...
2,6,Art,art is a creative activity by people . the artist hopes that it affects the emotions of people who experience it . artists express themselves by ...,art creativ activ peopl artist hope affect emot peopl experi artist express art peopl find art relax mani peopl disagre defin art say peopl driven...
3,8,A,"a is the first letter of the english alphabet . the small letter , a , is used as a lower case vowel . however , the english long a ( ā ) is said...",first letter english alphabet small letter use lower case vowel howev english long said diphthong letter greek alphabet name alpha alpha omega las...
4,9,Air,air is the earth 's atmosphere . air around us is a mixture of many gases and dust particles . it is the clear gas in which living things live an...,air earth atmospher air around u mixtur mani ga dust particl clear ga live thing live breath indefinit shape volum color smell mass weight matter ...
...,...,...,...,...
127084,594690,Culture of Egypt,"the culture of egypt has thousands of years of recorded history . ancient egypt was among the earliest civilizations in africa . for millennia , ...",cultur egypt thousand year record histori ancient egypt among earliest civil africa millennium egypt maintain strikingli uniqu complex stabl cultu...
127085,594691,Capsicum annuum,capsicum annum is a scientific name of capsicum,capsicum annum scientif name capsicum
127086,594696,Binding energy,"a binding energy is the negative potential energy ( `` i.e . `` , energy debt ) pulling a bound system together . conversely , an unbinding energ...",bind energi neg potenti energi e energi debt pull bound system togeth convers unbind energi posit actual energi requir disassembl bound system sep...
127087,594701,Fort Lauderdale Strikers,is a god like team whos has been working its way up to even beat fc barcelona,god like team who work way even beat fc barcelona


# **Indexing**

In [None]:
# Build the Terrier index under ./DatasetIndex (overwrite=True is passed so an
# existing index at that path does not block a re-run).
indexer = pt.DFIndexer("./DatasetIndex", overwrite=True)
# index the text, record the docnos as metadata
index_ref = indexer.index(df["proccessed_Text"], df["docno"])
print(index_ref.toString())

# Open the freshly built index for retrieval and inspection.
index = pt.IndexFactory.of(index_ref)

07:53:20.367 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 194 empty documents
./DatasetIndex/data.properties


In [None]:
# Summary statistics of the index (documents, terms, postings, tokens).
print(index.getCollectionStatistics().toString())

Number of documents: 127089
Number of terms: 200764
Number of postings: 4766562
Number of fields: 0
Number of tokens: 6605886
Field names: []
Positions:   false



In [None]:
# Map every indexed term to {doc id: term frequency}.
# Ids and frequencies are kept as strings, the format the query-processing
# cell expects.
term_to_doc = {}

# Fetch the index structures once rather than once per term.
lexicon = index.getLexicon()
inverted_index = index.getInvertedIndex()

# Iterate over the lexicon
for kv in lexicon:
    term = kv.getKey()
    pointer = lexicon[term]
    # Read the id and frequency through the posting accessors instead of
    # regex-parsing the posting's string representation.
    term_to_doc[term] = {
        str(posting.getId()): str(posting.getFrequency())
        for posting in inverted_index.getPostings(pointer)
    }

# Show a small sample only; the full mapping holds every term in the index.
dict(list(term_to_doc.items())[:3])

{'a': {'36788': '1',
  '66551': '1',
  '66552': '2',
  '70569': '1',
  '83732': '1',
  '104970': '1',
  '116519': '2',
  '116535': '2'},
 'aa': {'1179': '1',
  '2445': '1',
  '3477': '1',
  '6374': '1',
  '7741': '2',
  '8329': '1',
  '8845': '1',
  '9790': '1',
  '9832': '1',
  '10711': '1',
  '11286': '1',
  '13262': '1',
  '14763': '1',
  '17436': '1',
  '17441': '1',
  '22027': '1',
  '22088': '1',
  '25682': '1',
  '26090': '1',
  '26879': '1',
  '28554': '1',
  '28885': '4',
  '30324': '1',
  '30567': '1',
  '36155': '1',
  '36937': '1',
  '41797': '1',
  '41970': '1',
  '58844': '2',
  '61073': '1',
  '62090': '1',
  '62138': '1',
  '63423': '1',
  '64197': '1',
  '71919': '2',
  '73447': '1',
  '77430': '1',
  '77759': '1',
  '83170': '1',
  '86741': '4',
  '86744': '5',
  '90503': '1',
  '95901': '1',
  '98105': '1',
  '103913': '1',
  '107006': '2',
  '108632': '1',
  '109417': '1',
  '112538': '1',
  '117938': '1',
  '121556': '1',
  '121978': '3',
  '124872': '2',
  '126272

In [None]:
# Number of distinct terms collected from the lexicon.
len(term_to_doc)

200764

# **Query Processing:**

In [None]:
# query = input("Your query: ")
query = "machine learning"
query = preprocess_text(query)
query_terms = query.split()

# Doc-id sets of the query terms that occur in the index.
matched_doc_sets = [set(term_to_doc[term]) for term in query_terms if term in term_to_doc]

# OR semantics: documents containing at least one query term.
relevant_docs = set().union(*matched_doc_sets)

# AND semantics: documents containing every query term. If any query term is
# missing from the index, no document can contain all of them.
if matched_doc_sets and len(matched_doc_sets) == len(query_terms):
    final_relevant_docs = set.intersection(*matched_doc_sets)
else:
    final_relevant_docs = set()

print("Most relevant documents with all query terms:")
# Sort numerically so the output order is stable between runs (set order is not).
for doc_id in sorted(final_relevant_docs, key=int):
  # doc_id is the id stored in the posting lists and is used as a row position;
  # assumes documents were indexed in df row order — TODO confirm.
  document = df.iloc[int(doc_id)]['text']
  print("Document ID:", doc_id)
  print(document)
  print()

# Print the relevant document IDs
print("Other less relevant documents that doesnt necessarily include all query terms:", relevant_docs - final_relevant_docs)

Most relevant documents with all query terms:
Document ID: 121496
 reinforcement learning is teaching a `` software agent '' how to behave in an environment by telling it how good it 's doing . it is an area of machine learning inspired by behaviorist psychology . reinforcement learning is different from supervised learning because the correct inputs and outputs are never shown . also , reinforcement learning usually learns as it goes ( online learning ) unlike supervised learning . this means an agent has to choose between exploring and sticking with what it knows best . a reinforcement learning system is made of a `` policy '' ( formula_1 ) , a `` reward function '' ( formula_2 ) , a `` value function '' ( formula_3 ) , and an optional `` model '' of the environment . a `` policy '' tells the agent what to do in a certain situation . it can be a simple table of rules , or a complicated search for the correct action . policies can even be stochastic , which means instead of rules the 

In [None]:
# Rank documents for the preprocessed query with the TF_IDF weighting model,
# asking for at most 20 results.
tfidf = pt.BatchRetrieve(
    index,
    controls={"wmodel": "TF_IDF"},
    num_results=20,
)
results = tfidf.search(query)
results

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,104528,457032,0,11.890873,machin learn
1,1,121501,564928,1,11.777265,machin learn
2,1,82605,359370,2,10.332579,machin learn
3,1,121496,564854,3,9.878682,machin learn
4,1,1910,6360,4,9.468201,machin learn
5,1,104525,457009,5,9.003193,machin learn
6,1,98414,430252,6,8.495885,machin learn
7,1,7653,24970,7,8.305298,machin learn
8,1,98878,432031,8,7.774418,machin learn
9,1,2724,8584,9,7.518307,machin learn


In [None]:
# Print the original text of every ranked document, best first.
results_ids = results['docno']
print("Most relevant documents with all query terms ranked: \n")
for docno in results_ids:
    # First row of df whose docno matches the ranked result.
    document = df.loc[df['docno'] == docno, 'text'].iloc[0]
    print("Document ID:", docno)
    print(document)
    print()

Most relevant documents with all query terms ranked: 

Document ID: 457032
 teaching machines were originally mechanical devices . they presented educational materials and taught students . they were first invented by sidney l. pressey . his machine originally administered multiple-choice questions . when the machine was set so it moved on only when the student got the right answer , tests showed that learning had taken place . much later , norman crowder developed the pressey idea much further . b.f. skinner was responsible for a different type of machine which used his ideas on how learning should be directed with positive reinforcement . there is extensive experience that both methods worked well , and so did programmed learning in other forms , such as books . the ideas of teaching machines and programmed learning provided the basis for later ideas such as open learning and computer-assisted instruction . quotes . - edward l. thorndike in 1912 : `` if , by a miracle of mechanical i

# **Query expansion**

In [None]:
# RM3 query expansion: 10 expansion terms from the top 100 feedback documents.
rm3_expander = pt.rewrite.RM3(index, fb_terms=10, fb_docs=100)

# The TF-IDF ranking is fed into the RM3 expander, which rewrites the query.
rm3_qe = tfidf >> rm3_expander
rewritten_query = rm3_qe.search(query).iloc[0]["query"]

# Drop the first whitespace-separated token and keep the weighted terms.
# NOTE(review): presumably the first token is a control directive emitted by
# RM3 rather than a query term — confirm.
expanded_query = ' '.join(rewritten_query.split()[1:])
expanded_query

'idea^0.015170929 machin^0.461723030 gun^0.029044874 mechan^0.018255448 ture^0.022017332 function^0.015327577 comput^0.051576175 learn^0.350152999 work^0.021869784 system^0.014861900'

In [None]:
# Second-pass retrieval: rank with TF-IDF again, using the expanded query string.
results_wqe = tfidf.search(expanded_query)

In [None]:
# Side-by-side view of the top 10 (docid, score) pairs before and after expansion.
print("   Before Expansion    After Expansion")
before_top10 = results[['docid', 'score']].iloc[:10].add_suffix('_1')
after_top10 = results_wqe[['docid', 'score']].iloc[:10].add_suffix('_2')
comparison = pd.concat([before_top10, after_top10], axis=1).fillna('')
print(comparison)

   Before Expansion    After Expansion
   docid_1    score_1  docid_2    score_2
0   104528  11.890873   104528  11.217305
1   121501  11.777265   121501  11.070423
2    82605  10.332579    82605   9.297954
3   121496   9.878682     1910   9.067845
4     1910   9.468201    88374   8.769728
5   104525   9.003193   104525   8.688466
6    98414   8.495885   121496   8.492095
7     7653   8.305298     8535   8.224648
8    98878   7.774418     2117   7.993225
9     2724   7.518307    72520   7.959353


In [None]:
# Print the original text of every document retrieved with the expanded query.
expanded_results_ids = results_wqe['docno']
print("Most relevant documents with all expanded query terms ranked: \n")
for docno in expanded_results_ids:
    # First row of df whose docno matches the ranked result.
    document = df.loc[df['docno'] == docno, 'text'].iloc[0]
    print("Document ID:", docno)
    print(document)
    print()

Most relevant documents with all expanded query terms ranked: 

Document ID: 457032
 teaching machines were originally mechanical devices . they presented educational materials and taught students . they were first invented by sidney l. pressey . his machine originally administered multiple-choice questions . when the machine was set so it moved on only when the student got the right answer , tests showed that learning had taken place . much later , norman crowder developed the pressey idea much further . b.f. skinner was responsible for a different type of machine which used his ideas on how learning should be directed with positive reinforcement . there is extensive experience that both methods worked well , and so did programmed learning in other forms , such as books . the ideas of teaching machines and programmed learning provided the basis for later ideas such as open learning and computer-assisted instruction . quotes . - edward l. thorndike in 1912 : `` if , by a miracle of mec

# **Evaluation**

In [None]:
# Apply the same cleaning to the queries as to the documents.
# NOTE: this overwrites the raw query text in place, so re-running the cell
# applies preprocess_text to already-processed text.
queries_df['query'] = queries_df['query'].apply(preprocess_text)

In [None]:
# Run the TF-IDF retriever over every preprocessed query.
# This rebinds `results`, which earlier held the single-query ranking.
# NOTE(review): the saved output shows weighted, expanded query strings in the
# `query` column, which does not look like output of plain `tfidf` — the saved
# output may be stale; re-run to confirm.
results = tfidf.transform(queries_df)
results

  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s"

Unnamed: 0,qid,docid,docno,rank,score,query
0,12,74640,321567,0,13.880701,list^0.021499535 libertarian^0.019903978 anarch^0.698418021 anarcho^0.024219347 spanish^0.023784906 anarchist^0.110012643 famou^0.022762064 person...
1,12,1358,4807,1,11.916231,list^0.021499535 libertarian^0.019903978 anarch^0.698418021 anarcho^0.024219347 spanish^0.023784906 anarchist^0.110012643 famou^0.022762064 person...
2,12,1107,4080,2,11.667629,list^0.021499535 libertarian^0.019903978 anarch^0.698418021 anarcho^0.024219347 spanish^0.023784906 anarchist^0.110012643 famou^0.022762064 person...
3,12,34064,135800,3,10.503840,list^0.021499535 libertarian^0.019903978 anarch^0.698418021 anarcho^0.024219347 spanish^0.023784906 anarchist^0.110012643 famou^0.022762064 person...
4,12,34077,135830,4,10.068251,list^0.021499535 libertarian^0.019903978 anarch^0.698418021 anarcho^0.024219347 spanish^0.023784906 anarchist^0.110012643 famou^0.022762064 person...
...,...,...,...,...,...,...
2280695,54964009,79852,347549,15,6.630679,music^0.040880475 russian^0.042891111 deynekin^0.300000012 princ^0.037255518 ilyich^0.035001669 ballet^0.044583552 great^0.025815921 nutcrack^0.02...
2280696,54964009,13513,49756,16,6.487320,music^0.040880475 russian^0.042891111 deynekin^0.300000012 princ^0.037255518 ilyich^0.035001669 ballet^0.044583552 great^0.025815921 nutcrack^0.02...
2280697,54964009,119299,547263,17,6.477348,music^0.040880475 russian^0.042891111 deynekin^0.300000012 princ^0.037255518 ilyich^0.035001669 ballet^0.044583552 great^0.025815921 nutcrack^0.02...
2280698,54964009,1046,3978,18,6.433031,music^0.040880475 russian^0.042891111 deynekin^0.300000012 princ^0.037255518 ilyich^0.035001669 ballet^0.044583552 great^0.025815921 nutcrack^0.02...


In [None]:
# Score the retrieved results against the relevance judgements.
# Named `eval_metrics` so the builtin `eval` is not shadowed for the rest of
# the session.
eval_metrics = pt.Evaluate(results, qrels_df)
eval_metrics

{'map': 0.6908845814556839, 'ndcg': 0.7768932204273412}

# **Final Model with Neural Re-ranking (MonoT5)**

In [None]:
# Load the "xpmir/monot5" checkpoint from the Hugging Face Hub through xpmir's AutoModel.
model = AutoModel.load_from_hf_hub("xpmir/monot5", as_instance=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

definition.json:   0%|          | 0.00/18.1k [00:00<?, ?B/s]

path:   0%|          | 0.00/990M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
# Attach the processed document text to the expanded-query ranking and keep
# the highest TF-IDF scores first.
result_merged = (
    results_wqe
    .merge(df, on="docno")
    .loc[:, ["score", "proccessed_Text", "docno"]]
    .sort_values(by="score", ascending=False)
)
result_merged

Unnamed: 0,score,proccessed_Text,docno
0,11.217305,teach machin origin mechan devic present educ materi taught student first invent sidney l pressey machin origin administ multipl choic question ma...,457032
1,11.070423,machin learn give comput abil learn without explicitli program arthur samuel subfield comput scienc idea came work artifici intellig machin learn ...,564928
2,9.297954,machin learn supervis learn task infer function label train data result train known beforehand system simpli learn get result correctli usual syst...,359370
3,9.067845,artifici intellig ai abil comput program machin think learn also field studi tri make comput smart john mccarthi came name artifici intellig gener...,6360
4,8.769728,ture machin idea comput scienc tri describ comput work determinist ture machin use function given current state ture machin select anoth state tur...,384351
5,8.688466,program learn program instruct research base system help learner work success method guid research done varieti appli psychologist educ learn mate...,457009
6,8.492095,reinforc learn teach softwar agent behav environ tell good area machin learn inspir behaviorist psycholog reinforc learn differ supervis learn cor...,564854
7,8.224648,ture machin term comput scienc ture machin system rule state transit rather real machin first describ english mathematician alan ture two purpos t...,29010
8,7.993225,machin thing creat peopl make work easier devic invent multipli effect human effort machin produc mechan advantag machin mani part move exampl bic...,7240
9,7.959353,light machin gun lmg type machin gun design use one soldier light machin gun often use squad automat weapon calib cartridg modern light machin gun...,310822


In [None]:
# Score every candidate's processed text against the expanded query with the loaded model.
# NOTE(review): both inputs are stemmed / weighted strings ("term^weight"), not
# natural-language text — confirm this is what the model should receive.
output = model.rsv(expanded_query, result_merged["proccessed_Text"].values)

# Pair each scored item with the docno at the same position in result_merged.
# Assumes model.rsv returns its results in input order — TODO confirm.
data = [(result_merged.iloc[i]["docno"], list(obj.document.items.values())[0].text, obj.score) for i, obj in enumerate(output)]
# Highest model score first.
reviews_result_v2 = pd.DataFrame(data, columns=['docno', 'text', "score"]).sort_values(by="score", ascending=False)
reviews_result_v2

Unnamed: 0,docno,text,score
1,564928,machin learn give comput abil learn without explicitli program arthur samuel subfield comput scienc idea came work artifici intellig machin learn ...,-0.055864
5,457009,program learn program instruct research base system help learner work success method guid research done varieti appli psychologist educ learn mate...,-0.237351
4,384351,ture machin idea comput scienc tri describ comput work determinist ture machin use function given current state ture machin select anoth state tur...,-0.239694
9,310822,light machin gun lmg type machin gun design use one soldier light machin gun often use squad automat weapon calib cartridg modern light machin gun...,-0.438359
0,457032,teach machin origin mechan devic present educ materi taught student first invent sidney l pressey machin origin administ multipl choic question ma...,-0.575946
7,29010,ture machin term comput scienc ture machin system rule state transit rather real machin first describ english mathematician alan ture two purpos t...,-0.822606
15,3925,machin code machin languag name command directli execut processor usual order tell comput code lowest level softwar kind softwar need translat mac...,-1.303871
2,359370,machin learn supervis learn task infer function label train data result train known beforehand system simpli learn get result correctli usual syst...,-1.427224
12,126869,virtual machin program comput work like separ comput insid main comput program control virtual machin call hypervisor comput run virtual machin ca...,-2.921323
19,311022,gener purpos machin gun gpmg machin gun get ammunit belt use mani differ role support infantri bipod tripod put onto helicopt armour vehicl provid...,-3.150856


In [None]:
# Join the re-ranked results back to df to recover the original text. Both
# frames have a `text` column, so the merge suffixes them: text_x is the text
# carried in reviews_result_v2 and text_y is df's original text.
merged_df = reviews_result_v2.merge(df, on="docno", how="left")

print("Most relevant documents with all expanded query terms ranked using bert : \n")
# The loop variable is deliberately not called `index`: that name holds the
# Terrier index object, and rebinding it here would break any retrieval cell
# that is run (or re-run) afterwards.
for row_label, row in merged_df.iterrows():
    print("Document ID:", row['docno'])
    print("Score:", row['score'])
    print(row['text_y'])
    print()

Most relevant documents with all expanded query terms ranked using bert : 

Document ID: 564928
Score: -0.05586378276348114
 machine learning gives computers the ability to learn without being explicitly programmed ( arthur samuel , 1959 ) . it is a subfield of computer science . the idea came from work in artificial intelligence . machine learning explores the study and construction of algorithms which can learn and make predictions on data . such algorithms follow programmed instructions , but can also make predictions or decisions based on data . they build a model from sample inputs . machine learning is done where designing and programming explicit algorithms can not be done . examples include spam filtering , detection of network intruders or malicious insiders working towards a data breach , optical character recognition ( ocr ) , search engines and computer vision .

Document ID: 457009
Score: -0.23735098540782928
 programmed learning ( or 'programmed instruction ' ) is a resea

# **GUI**

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Target CSV files, one per frame.
file_path, file_path2, file_path3 = 'docs_df.csv', 'queries_df.csv', 'qrels_df.csv'

# Write all three frames first ...
for frame, csv_path in ((docs_df, file_path), (queries_df, file_path2), (qrels_df, file_path3)):
    frame.to_csv(csv_path, index=False)

from google.colab import files

# ... then trigger the browser downloads in the same order.
for csv_path in (file_path, file_path2, file_path3):
    files.download(csv_path)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
%%writefile main.py
import tkinter as tk
import pandas as pd
import numpy as np
import re
import os
import pyterrier as pt
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])
import tensorflow as tf
import tensorflow_hub as hub
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import torch
from xpmir.models import AutoModel
pd.set_option('display.max_colwidth', 150)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
import ir_datasets
import streamlit as st
nltk.download('punkt')
nltk.download('stopwords')



# Directory holding the CSV exports. Defaults to the previous hard-coded
# location and can be overridden through the DATA_DIR environment variable.
DATA_DIR = os.environ.get("DATA_DIR", "/content")
# Only the first MAX_ROWS rows of each file are used by the app.
MAX_ROWS = 10000

# nrows stops parsing after MAX_ROWS rows instead of reading the whole file
# and slicing afterwards.
docs_df = pd.read_csv(os.path.join(DATA_DIR, "docs_df.csv"), nrows=MAX_ROWS)
# NOTE: queries_df and qrels_df are loaded but not referenced by search() or main().
queries_df = pd.read_csv(os.path.join(DATA_DIR, "queries_df.csv"), nrows=MAX_ROWS)
qrels_df = pd.read_csv(os.path.join(DATA_DIR, "qrels_df.csv"), nrows=MAX_ROWS)
# Lazily-filled cache so the stop-word set and stemmer are built once instead
# of once per document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    '''Return the shared stop-word set and stemmer, building them on first use.'''
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES


def preprocess_text(sen):
    '''Cleans text data, tokenizes, and stems the text.

    Steps: lowercase, strip HTML tags, replace every non-letter character
    with a space, tokenize, drop English stop words, stem, and join the
    remaining tokens with single spaces. No lemmatization is performed here.

    Args:
        sen: The raw text. None and NaN (e.g. an empty CSV cell) are treated
            as empty text; any other non-string value is converted with str().

    Returns:
        The cleaned text as a single space-separated string ('' when nothing
        survives the cleaning).
    '''
    # Missing values have no text to clean.
    if sen is None or (isinstance(sen, float) and sen != sen):
        return ''

    # Convert to lowercase
    sentence = str(sen).lower()

    # Remove html tags
    sentence = re.sub(r'<[^>]+>', ' ', sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Tokenize
    words = word_tokenize(sentence)

    resources = _get_preprocess_resources()
    stop_words = resources['stop_words']
    stemmer = resources['stemmer']

    # Remove stopwords, then stem each remaining token
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    return ' '.join(words)
@st.cache_resource
def _build_search_resources():
    '''Preprocess the documents and build the index and retrieval components once.

    Streamlit re-executes this script on every widget interaction; caching the
    result keeps the corpus from being re-processed and re-indexed on every
    search.

    Returns:
        A tuple (df, index, tfidf, rm3_expander).
    '''
    # Work on a copy so the new columns are not assigned onto a slice of
    # another frame.
    frame = docs_df.copy()
    # Empty text cells come back from the CSV as NaN; treat them as empty text.
    frame['proccessed_Text'] = frame['text'].fillna('').apply(preprocess_text)
    # docno is stored as a string so it can be compared with the docno values
    # returned by retrieval.
    frame['docno'] = frame['docno'].astype(str)

    indexer = pt.DFIndexer("./DatasetIndex", overwrite=True)
    index_ref = indexer.index(frame["proccessed_Text"], frame["docno"])

    index = pt.IndexFactory.of(index_ref)
    tfidf = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"},num_results=20)

    rm3_expander = pt.rewrite.RM3(index,fb_terms=10, fb_docs=100)
    return frame, index, tfidf, rm3_expander


df, index, tfidf, rm3_expander = _build_search_resources()

def search(query):
    '''Retrieve documents for a free-text query using TF-IDF with RM3 expansion.

    The query is cleaned with preprocess_text (the same cleaning applied to the
    indexed text), expanded with RM3 on top of a first TF-IDF pass, and the
    expanded query is then used for the final TF-IDF ranking.

    Args:
        query: The raw query string typed by the user.

    Returns:
        A tuple (documents, document_ids, document_titles) of three parallel
        lists in ranked order. All three are empty when the query has no
        usable terms or nothing is retrieved.
    '''
    documents = []
    document_ids = []
    document_titles = []

    # Clean the query exactly like the indexed text; this also removes
    # punctuation from the raw user input before it reaches the retriever.
    cleaned_query = preprocess_text(query)
    if not cleaned_query:
        return documents, document_ids, document_titles

    rm3_qe = tfidf >> rm3_expander
    expansion = rm3_qe.search(cleaned_query)
    if expansion.empty:
        # Nothing retrieved in the first pass, so there is no rewritten query to use.
        return documents, document_ids, document_titles

    # Skip the first whitespace-separated token and keep up to ten of the
    # following weighted terms.
    expanded_query = ' '.join(expansion.iloc[0]["query"].split()[1:11])
    results_wqe = tfidf.search(expanded_query)

    for docno in results_wqe['docno']:
        document_row = df[df['docno'] == docno]
        if document_row.empty:
            # No matching row for this docno; nothing to display.
            continue

        first_match = document_row.iloc[0]
        document_ids.append(first_match['docno'])
        document_titles.append(first_match['title'])
        documents.append(first_match['text'])

    return documents, document_ids, document_titles

def main():
    '''Render the search page: a query box, a button, and the ranked results.'''
    st.title("Google")
    st.text("Enter a search query")

    search_query = st.text_input("Search Google or type URL")
    submit = st.button("Search")

    # Nothing to show until the button is pressed with a non-empty query.
    if not (submit and search_query):
        return

    documents, document_ids, document_titles = search(search_query)

    # The three lists are parallel, so walk them together.
    for doc_text, doc_id, doc_title in zip(documents, document_ids, document_titles):
        st.write("Document ID:", doc_id)
        st.write("Document Title:", doc_title)
        st.write("Document Text:", doc_text)
        st.write("---")

if __name__ == "__main__":
    main()

Overwriting main.py


In [None]:
!streamlit run main.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.


  You can now view your Streamlit app in your browser.

  Network URL: http://172.28.0.12:8501
  External URL: http://35.187.239.218:8501

[?25hnpx: installed 22 in 4.216s
your url is: https://fine-ads-kiss.loca.lt
PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
2024-05-12 09:19:13.587495: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-12 09:19:13.587544: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-12 09:19:13.588936: E external/local_xla/xla/stream_executor/cuda/cuda_blas.