In [272]:
#install the Pyterrier framework
!pip install python-terrier
# install the nltk modules
!pip install nltk



Libraries

In [273]:
import pyterrier as pt
if not pt.started():
    # In this lab, we need to specify that we start PyTerrier with PRF enabled
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])
  pt.init()

In [274]:
import pandas as pd
import numpy as np
import os
import re  # used to clean the data

In [275]:
import nltk
from nltk.stem import *
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [276]:
import tensorflow as tf
import tensorflow_hub as hub

Data Collection

In [277]:
#Read CSV
df=pd.read_csv('/content/tweet_emotions.csv')
df

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will."
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!"
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!


In [278]:
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [279]:
# Download NLTK resources
stop_words=nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [280]:
stop_words = set(stopwords.words('english'))
print('list of stopwords:')
stop_words

list of stopwords:


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [237]:
# Initialize Porter stemmer
stemmer = PorterStemmer()

In [281]:
df['docno'] = df.index
df

Unnamed: 0,tweet_id,sentiment,content,docno
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,0
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...,1
2,1956967696,sadness,Funeral ceremony...gloomy friday...,2
3,1956967789,enthusiasm,wants to hang out with friends SOON!,3
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will.",4
...,...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor,39995
39996,1753919001,love,Happy Mothers Day All my love,39996
39997,1753919005,love,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!",39997
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!,39998


Preprocessing

In [282]:
#stopwords Removal
def remove_stop(text) :
  tokens = word_tokenize(str(text))
  sentence = []
  for i in tokens:
    if i not in stop_words:
      sentence.append(i)
  return' '.join(sentence)
df["processed_text"] = df["content"].apply(remove_stop)
df

Unnamed: 0,tweet_id,sentiment,content,docno,processed_text
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,0,@ tiffanylue know listenin bad habit earlier started freakin part = [
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...,1,Layin n bed headache ughhhh ... waitin call ...
2,1956967696,sadness,Funeral ceremony...gloomy friday...,2,Funeral ceremony ... gloomy friday ...
3,1956967789,enthusiasm,wants to hang out with friends SOON!,3,wants hang friends SOON !
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will.",4,"@ dannycastillo We want trade someone Houston tickets , one ."
...,...,...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor,39995,@ JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love,39996,Happy Mothers Day All love
39997,1753919005,love,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!",39997,"Happy Mother 's Day mommies , woman man long 're 'momma ' someone day !"
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!,39998,@ niariley WASSUP BEAUTIFUL ! ! ! FOLLOW ME ! ! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF . WAT U IN THE VIDEO ! !


In [283]:
#Stemming
def steeming(text) :
  tokens = word_tokenize(text)
  steemed_text = []
  j = 0
  for i in tokens:
    steemed_text.append(stemmer.stem(i))
    #print(f"tokens : {steemed_text[j]}")
    j = j+1
  return ' '.join(steemed_text)
df["processed_text"] = df["processed_text"].apply(steeming)
df

Unnamed: 0,tweet_id,sentiment,content,docno,processed_text
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,0,@ tiffanylu know listenin bad habit earlier start freakin part = [
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...,1,layin n bed headach ughhhh ... waitin call ...
2,1956967696,sadness,Funeral ceremony...gloomy friday...,2,funer ceremoni ... gloomi friday ...
3,1956967789,enthusiasm,wants to hang out with friends SOON!,3,want hang friend soon !
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will.",4,"@ dannycastillo we want trade someon houston ticket , one ."
...,...,...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor,39995,@ johnlloydtaylor
39996,1753919001,love,Happy Mothers Day All my love,39996,happi mother day all love
39997,1753919005,love,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!",39997,"happi mother 's day mommi , woman man long 're 'momma ' someon day !"
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!,39998,@ niariley wassup beauti ! ! ! follow me ! ! peep out my new hit singl www.myspace.com/ipsohot i def . wat u in the video ! !


In [284]:
#Cleaning
def clean(text):
   text = re.sub(r"http\S+", " ", str(text)) # remove urls
   text = re.sub(r"RT ", " ", str(text)) # remove rt
   text = re.sub(r"@[\w]*", " ", str(text)) # remove handles
   text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", str(text)) # remove special characters
   text = re.sub(r'\t', ' ', str(text)) # remove tabs
   text = re.sub(r'\n', ' ', str(text)) # remove line jump
   text = re.sub(r"\s+", " ", str(text)) # remove extra white space
   text = str(text).strip()
   return str(text)
df["processed_text"] = df["processed_text"].apply(clean)
df

Unnamed: 0,tweet_id,sentiment,content,docno,processed_text
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,0,tiffanylu know listenin bad habit earlier start freakin part [
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...,1,layin n bed headach ughhhh waitin call
2,1956967696,sadness,Funeral ceremony...gloomy friday...,2,funer ceremoni gloomi friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,3,want hang friend soon !
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will.",4,dannycastillo we want trade someon houston ticket one
...,...,...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor,39995,johnlloydtaylor
39996,1753919001,love,Happy Mothers Day All my love,39996,happi mother day all love
39997,1753919005,love,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!",39997,happi mother 's day mommi woman man long 're 'momma ' someon day !
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!,39998,niariley wassup beauti ! ! ! follow me ! ! peep out my new hit singl www myspace com ipsohot i def wat u in the video ! !


In [285]:
# Display the  processed DataFrames
print('dataFrame after processing:\n')
df

dataFrame after processing:



Unnamed: 0,tweet_id,sentiment,content,docno,processed_text
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,0,tiffanylu know listenin bad habit earlier start freakin part [
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...,1,layin n bed headach ughhhh waitin call
2,1956967696,sadness,Funeral ceremony...gloomy friday...,2,funer ceremoni gloomi friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,3,want hang friend soon !
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will.",4,dannycastillo we want trade someon houston ticket one
...,...,...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor,39995,johnlloydtaylor
39996,1753919001,love,Happy Mothers Day All my love,39996,happi mother day all love
39997,1753919005,love,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!",39997,happi mother 's day mommi woman man long 're 'momma ' someon day !
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!,39998,niariley wassup beauti ! ! ! follow me ! ! peep out my new hit singl www myspace com ipsohot i def wat u in the video ! !


Indexing

In [286]:
# convert type to string
df['docno'] = df['docno'].astype(str)
#indexing
indexer = pt.DFIndexer("./myFirstIndex", overwrite=True)
index_ref = indexer.index(df["processed_text"], df["docno"])
index = pt.IndexFactory.of(index_ref)

20:45:49.215 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 41 empty documents


In [287]:
print(index_ref.toString())
#we will first load the index
index = pt.IndexFactory.of(index_ref)
#we will call getCollectionStatistics() to check the stats
print(index.getCollectionStatistics().toString())

./myFirstIndex/data.properties
Number of documents: 40000
Number of terms: 39649
Number of postings: 279726
Number of fields: 0
Number of tokens: 288161
Field names: []
Positions:   false



In [288]:
for kv in index.getLexicon():
  print("%s -> %s " % (kv.getKey(), kv.getValue().toString().split('@')[0]))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
teg -> term22418 Nt=1 TF=1 maxTF=1  
tegan -> term22717 Nt=1 TF=1 maxTF=1  
tegs03 -> term37800 Nt=1 TF=1 maxTF=1  
teh -> term7832 Nt=7 TF=7 maxTF=1  
tehblu -> term17064 Nt=1 TF=1 maxTF=1  
tehcheapon -> term15257 Nt=1 TF=1 maxTF=1  
tehe -> term1291 Nt=3 TF=3 maxTF=1  
teheh -> term38624 Nt=1 TF=1 maxTF=1  
tehr -> term13515 Nt=1 TF=1 maxTF=1  
tei -> term35574 Nt=1 TF=1 maxTF=1  
teifion -> term29042 Nt=1 TF=1 maxTF=1  
teignmouth -> term27398 Nt=1 TF=1 maxTF=1  
teiisha -> term34026 Nt=1 TF=1 maxTF=1  
tekenen -> term25425 Nt=1 TF=1 maxTF=1  
tekson -> term36096 Nt=1 TF=1 maxTF=1  
tekzilla -> term36070 Nt=1 TF=1 maxTF=1  
tel -> term21238 Nt=3 TF=3 maxTF=1  
tele -> term25943 Nt=2 TF=2 maxTF=1  
teleco -> term4422 Nt=1 TF=1 maxTF=1  
telecom -> term2347 Nt=2 TF=3 maxTF=2  
teleconf -> term6194 Nt=1 TF=1 maxTF=1  
telegraph -> term39161 Nt=1 TF=1 maxTF=1  
teleject -> term22372 Nt=1 TF=1 maxTF=1  
telekinesi -> term2

In [289]:
index = pt.IndexFactory.of(index_ref)

Query Processing

In [290]:
def preprocess(sentence):
  sentence = remove_stop(sentence)
  sentence = clean(sentence)
  sentence = steeming(sentence)
  return sentence

In [291]:
docs = []
Q = "twitt"
pointer = index.getLexicon()[Q]
for posting in index.getInvertedIndex().getPostings(pointer):
  docs.append(posting.toString()[3:-7])

In [292]:
print("Docs are: ", docs)

Docs are:  ['3557', '4016', '5354', '6532', '10501', '13632', '13813', '21375', '22021', '26070', '28542', '34695', '36112', '36342', '36400', '36627', '37277']


In [293]:
len(docs)

17

In [294]:
#"TF_IDF"
tfidf_retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"})

In [295]:
res=tfidf_retr.search(Q)
res

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,4016,4016,0,6.557943,twitt
1,1,36400,36400,1,6.557943,twitt
2,1,13813,13813,2,6.181176,twitt
3,1,28542,28542,3,6.181176,twitt
4,1,26070,26070,4,5.845348,twitt
5,1,34695,34695,5,5.845348,twitt
6,1,36112,36112,6,5.544132,twitt
7,1,36342,36342,7,5.544132,twitt
8,1,5354,5354,8,5.272438,twitt
9,1,22021,22021,9,5.272438,twitt


Query expansion

In [296]:
pd.set_option('display.max_colwidth', 150)

In [297]:
# Need to install additional terrier package for PRF. It will take around 1 min
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven   #used for Java projects to manage project dependencies and build processes
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

fatal: destination path 'terrier-prf' already exists and is not an empty directory.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
maven is already the newest version (3.6.3-5).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
/content/terrier-prf
[[1;34mINFO[m] Scanning for projects...
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m----------------------< [0;36morg.terrier:terrier-prf[0;1m >-----------------------[m
[[1;34mINFO[m] [1mBuilding terrier-prf 0.2-SNAPSHOT[m
[[1;34mINFO[m] [1m--------------------------------[ jar ]---------------------------------[m
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mmaven-resources-plugin:2.6:resources[m [1m(default-resources)[m @ [36mterrier-prf[0;1m ---[m
[[1;34mINFO[m] Using 'UTF-8' encoding to copy filtered resources.
[[1;34mINFO[m] skip non existing resourceDirectory /content/terrier-prf/src/main/resources
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mma

In [298]:
df[['content']][df['docno'].isin(res['docno'].loc[0:4].tolist())]

Unnamed: 0,content
4016,Good Morning Twitts! Another GloOmy day in NYC!
13813,@adrenalynntoao I've been readin your last few twitts. I hope your ok
26070,Morning twitts heading home had a great sleep over w. my bf Tennille!
28542,it's after 3 AM.!! I think it's time to bed.!! have a good night twitts.! ;))
36400,@lmatechnologies Gotcha ! Let's meet up @ The Twitt Cafe


In [299]:
# Define our retrieval model
bm25 = pt.BatchRetrieve(index, wmodel="BM25",num_results=10)
res = bm25.search(Q)
res

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,4016,4016,0,11.976709,twitt
1,1,36400,36400,1,11.976709,twitt
2,1,13813,13813,2,11.288622,twitt
3,1,28542,28542,3,11.288622,twitt
4,1,26070,26070,4,10.675304,twitt
5,1,34695,34695,5,10.675304,twitt
6,1,36112,36112,6,10.125196,twitt
7,1,36342,36342,7,10.125196,twitt
8,1,5354,5354,8,9.629004,twitt
9,1,22021,22021,9,9.629004,twitt


In [300]:
# "rewrite" function from PyTerrier will be used to expand queries specifying RM3 as the model
# fb_docs ==> no. expansion documents
# fb_terms ==> no. expansion terms
rm3_expander = pt.rewrite.RM3(index,fb_terms=10, fb_docs=100)

#output of the BM25 will be fed into the RM3 expander for Q expansion.
rm3_qe = bm25 >> rm3_expander
expanded_query = rm3_qe.search(Q).iloc[0]["query"]

expanded_query

'applypipeline:off meet^0.018588183 cafe^0.018588183 dai^0.057313565 love^0.022305820 gotcha^0.018588183 time^0.028324850 bed^0.029873865 ve^0.029873865 morn^0.032529321 twitt^0.744014144'

In [301]:
# Just print the expanded Q with term scores
for s in expanded_query.split()[1:]:
  print(s)
print("\n" + Q)

meet^0.018588183
cafe^0.018588183
dai^0.057313565
love^0.022305820
gotcha^0.018588183
time^0.028324850
bed^0.029873865
ve^0.029873865
morn^0.032529321
twitt^0.744014144

twitt


In [302]:
# After that you can search using the expanded query
expanded_query_formatted = ' '.join(expanded_query.split()[1:])
results_wqe = bm25.search(expanded_query_formatted)
print("   Before Expansion    After Expansion")
print(pd.concat([res[['docid','score']][0:5].add_suffix('_1'),
            results_wqe[['docid','score']][0:5].add_suffix('_2')], axis=1).fillna(''))
#Let's check the tweets content for the top 5 retrieved tweets
df[['content']][df['docno'].isin(results_wqe['docno'].loc[0:5].tolist())]

   Before Expansion    After Expansion
   docid_1    score_1  docid_2    score_2
0     4016  11.976709    36400  12.901027
1    36400  11.976709     4016  12.583890
2    13813  11.288622    28542  11.772914
3    28542  11.288622    13813  11.557250
4    26070  10.675304    34695  11.480847


Unnamed: 0,content
4016,Good Morning Twitts! Another GloOmy day in NYC!
13813,@adrenalynntoao I've been readin your last few twitts. I hope your ok
26070,Morning twitts heading home had a great sleep over w. my bf Tennille!
28542,it's after 3 AM.!! I think it's time to bed.!! have a good night twitts.! ;))
34695,Okay Im going to bed..Toodles twitts i've had my fun for the day
36400,@lmatechnologies Gotcha ! Let's meet up @ The Twitt Cafe


User Interface

In [303]:
!pip install flask_ngrok



In [304]:
doc = df.head(70)
doc = doc.to_dict()
doc

{'tweet_id': {0: 1956967341,
  1: 1956967666,
  2: 1956967696,
  3: 1956967789,
  4: 1956968416,
  5: 1956968477,
  6: 1956968487,
  7: 1956968636,
  8: 1956969035,
  9: 1956969172,
  10: 1956969456,
  11: 1956969531,
  12: 1956970047,
  13: 1956970424,
  14: 1956970860,
  15: 1956971077,
  16: 1956971170,
  17: 1956971206,
  18: 1956971473,
  19: 1956971586,
  20: 1956971981,
  21: 1956972097,
  22: 1956972116,
  23: 1956972270,
  24: 1956972359,
  25: 1956972444,
  26: 1956972557,
  27: 1956972884,
  28: 1956973598,
  29: 1956973690,
  30: 1956974706,
  31: 1956975441,
  32: 1956975860,
  33: 1956975876,
  34: 1956975927,
  35: 1956976187,
  36: 1956976312,
  37: 1956976371,
  38: 1956976557,
  39: 1956976681,
  40: 1956977084,
  41: 1956977187,
  42: 1956977618,
  43: 1956977624,
  44: 1956978276,
  45: 1956978410,
  46: 1956978668,
  47: 1956979150,
  48: 1956979437,
  49: 1956979756,
  50: 1956979894,
  51: 1956979900,
  52: 1956979917,
  53: 1956980788,
  54: 1956980883,
  55: 19

In [305]:
def data(df2 , que):
 j = 0
 quer = preprocess(que)
 docs = []
 for key, value in df2.items():
   if key == 'processed_text':
         val = value.values()
         for doc in val:
           terms = doc.split()
           for term in terms:
             if term == quer and i not in docs:
               docs.append(f'''Document number {j} -----> \n{df["content"][j]}''')
           j = j + 1
 return docs

In [306]:
df

Unnamed: 0,tweet_id,sentiment,content,docno,processed_text
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,0,tiffanylu know listenin bad habit earlier start freakin part [
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...,1,layin n bed headach ughhhh waitin call
2,1956967696,sadness,Funeral ceremony...gloomy friday...,2,funer ceremoni gloomi friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,3,want hang friend soon !
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will.",4,dannycastillo we want trade someon houston ticket one
...,...,...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor,39995,johnlloydtaylor
39996,1753919001,love,Happy Mothers Day All my love,39996,happi mother day all love
39997,1753919005,love,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!",39997,happi mother 's day mommi woman man long 're 'momma ' someon day !
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!,39998,niariley wassup beauti ! ! ! follow me ! ! peep out my new hit singl www myspace com ipsohot i def wat u in the video ! !


In [307]:
Query = "school"
i = data(doc , Query)
i

["Document number 28 -----> \nFudge.... Just BS'd that whole paper.... So tired.... Ugh I hate school.....  time to sleep!!!!!!!!!!!",
 'Document number 44 -----> \n@creyes middle school and elem. High schools will remain open for those who need credits to graduate. Cali is broken',
 'Document number 44 -----> \n@creyes middle school and elem. High schools will remain open for those who need credits to graduate. Cali is broken',
 "Document number 45 -----> \nBed!!!!!... its time,..... hope i go to school tomorrow, all though i don't feel very well right now"]

In [308]:
from google.colab.output import eval_js
print (eval_js("google.colab.kernel.proxyPort(5000)"))

https://od5pewi44e-496ff2e9c6d22116-5000-colab.googleusercontent.com/


In [271]:
from flask import Flask, request
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)

@app.route("/")
def home():
    return """
    <style>
        body {
            background-color: white;
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
        }

        .header {
            background-color: black;
            color: white;
            padding: 20px 0;
        }

        .container {
            text-align: center;
        }

        h1 {
            text-align: center;
            margin: 0;
            padding: 10px 0;
        }

        #searchInput {
            padding: 10px;
            border: 1px solid #ccc;
            border-radius: 20px; /* Increased border-radius for a rounded appearance */
            margin-bottom: 10px;
            width: 300px; /* Adjust the width as needed */
            box-sizing: border-box; /* Include padding and border in the element's total width */
            transition: border-color 0.3s; /* Smooth transition for border color change */
        }

        #searchInput:focus {
            border-color: #007bff; /* Change border color on focus */
        }

        button {
            padding: 10px 20px;
            background-color: #007bff;
            color: white;
            border: none;
            border-radius: 20px; /* Increased border-radius for a rounded appearance */
            cursor: pointer;
            transition: background-color 0.3s; /* Smooth transition for background color change */
        }

        button:hover {
            background-color: #0056b3; /* Change background color on hover */
        }
    </style>

    <div class="header">
        <h1>Search Engine</h1>
    </div>
    <div class="container">
        <input type="text" id="searchInput" placeholder="Enter your query...">
        <button onclick="search()">Search</button>
    </div>
    <div id="searchResult"></div>

    <script>
        function search() {
            var searchTerm = document.getElementById("searchInput").value;
            fetch('/search', {
                method: 'POST',
                body: JSON.stringify({ query: searchTerm }),
                headers:{
                    'Content-Type': 'application/json'
                }
            })
            .then(response => response.json())
            .then(data => {
                console.log("Received data:", data); // Debug: Check if data is received
                var resultDiv = document.getElementById("searchResult");
                resultDiv.innerHTML = "<h2>Relevant Documents IDs:</h2>";
                if (data.results.length === 0) {
                    resultDiv.innerHTML += "<p>No documents found</p>";
                } else {
                    data.results.forEach(doc => {
                        console.log("Displaying document:", doc); // Debug: Check if document is displayed
                        resultDiv.innerHTML += "<p>" + doc + "</p>";
                    });
                }
            })
            .catch(error => {
                console.error('Error occurred during fetch:', error); // Debug: Log fetch errors
            });
        }
    </script>
    """

@app.route("/search", methods=['POST'])
def search():
    query = request.json['query']
    print("Received query:", query)
    results = data(doc, query)
    print("Search results:", results)
    return {'results': results}

app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [11/May/2024 20:43:20] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/May/2024 20:43:20] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
Exception in thread Thread-38:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 203, in _new_conn
    sock = connection.create_connection(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 791, in urlopen
    response = self._make_request(
  File "

Received query: school
Search results: ["Document number 28 -----> \nFudge.... Just BS'd that whole paper.... So tired.... Ugh I hate school.....  time to sleep!!!!!!!!!!!", 'Document number 44 -----> \n@creyes middle school and elem. High schools will remain open for those who need credits to graduate. Cali is broken', 'Document number 44 -----> \n@creyes middle school and elem. High schools will remain open for those who need credits to graduate. Cali is broken', "Document number 45 -----> \nBed!!!!!... its time,..... hope i go to school tomorrow, all though i don't feel very well right now"]


Evaluation

In [309]:
vaswani_dataset = pt.datasets.get_dataset("vaswani")
dff = vaswani_dataset.get_topics()
dff['docno'] = dff.index
dff = dff.rename(columns={'query': 'content'})
qrels = vaswani_dataset.get_qrels()
qrels['docno']=qrels['docno'].astype(str)
dff

Unnamed: 0,qid,content,docno
0,1,measurement of dielectric constant of liquids by the use of microwave techniques,0
1,2,mathematical analysis and design details of waveguide fed microwave radiations,1
2,3,use of digital computers in the design of band pass filters having given phase and attenuation characteristics,2
3,4,systems of data coding for information transfer,3
4,5,use of programs in engineering testing of computers,4
...,...,...,...
88,89,tunnel diode construction and its electrical characteristics explained,88
89,90,electronic density of states at the surface of a semiconductor compared with that at depth,89
90,91,resistivity of metallic thin films related to surface roughness,90
91,92,the phenomenon of radiation caused by charged particles moving in varying electric and magnetic fields,91


In [310]:
indexref = vaswani_dataset.get_index()
index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())

Number of documents: 11429
Number of terms: 7756
Number of postings: 224573
Number of fields: 1
Number of tokens: 271581
Field names: [text]
Positions:   false



In [311]:
retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"})
res = retr.search("characteristics")
res

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,422,423,0,3.536179,characteristics
1,1,1167,1168,1,3.486181,characteristics
2,1,7557,7558,2,3.394005,characteristics
3,1,1543,1544,3,3.303070,characteristics
4,1,11101,11102,4,3.303070,characteristics
...,...,...,...,...,...,...
821,1,7110,7111,821,0.991624,characteristics
822,1,9694,9695,822,0.983712,characteristics
823,1,6816,6817,823,0.968260,characteristics
824,1,6708,6709,824,0.917801,characteristics


In [312]:
dff = vaswani_dataset.get_topics()
dff.head(5)

Unnamed: 0,qid,query
0,1,measurement of dielectric constant of liquids by the use of microwave techniques
1,2,mathematical analysis and design details of waveguide fed microwave radiations
2,3,use of digital computers in the design of band pass filters having given phase and attenuation characteristics
3,4,systems of data coding for information transfer
4,5,use of programs in engineering testing of computers


In [314]:
qrels = vaswani_dataset.get_qrels()
print(qrels)
print(dff)

     qid  docno  label
0      1   1239      1
1      1   1502      1
2      1   4462      1
3      1   4569      1
4      1   5472      1
...   ..    ...    ...
2078  93   9875      1
2079  93   9956      1
2080  93  10497      1
2081  93  11191      1
2082  93  11318      1

[2083 rows x 3 columns]
   qid  \
0    1   
1    2   
2    3   
3    4   
4    5   
..  ..   
88  89   
89  90   
90  91   
91  92   
92  93   

                                                                                                             query  
0                                 measurement of dielectric constant of liquids by the use of microwave techniques  
1                                   mathematical analysis and design details of waveguide fed microwave radiations  
2   use of digital computers in the design of band pass filters having given phase and attenuation characteristics  
3                                                                  systems of data coding for information tran

In [313]:
eval = pt.Evaluate(res,qrels)
eval

{'map': 8.574711460959339e-07, 'ndcg': 0.0001684744004750219}