# IMPORTS

In this section we are importing and installing all necessary libraries:

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/Shareddrives/INFO RETREIVAL/PROJECT/Part 3

/content/drive/Shareddrives/INFO RETREIVAL/PROJECT/Part 3


In [3]:
data_path = './data/'

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
!pip install demoji

Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl.metadata (9.2 kB)
Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [7]:
!pip install sentence_transformers



In [8]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import pandas as pd
import json
import re
import demoji
import time
from array import array

Importing 'Build terms' function from previous part: <br>

Note: We will use it to pre-process the input query

In [9]:
def build_terms(line):
    """
    Description:
    Preprocess the text (tweet) by removing stop words, punctuation (but keeping numbers), URLs,
    stemming, transforming to lowercase, and extracting hashtags. The hashtags are excluded
    from the preprocessed text.

    Input:
    line -- string (text) to be preprocessed

    Output:
    preprocessed_line -- a list of tokens corresponding to the input text after preprocessing
    hashtags -- a list of extracted hashtags
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    # Transform to lowercase
    line = line.lower()

    # HASHTAGS

    # Extract hashtags (keeping the # symbol)
    hashtags = re.findall(r'#\w+', line)
    # Remove punctuation but keep numbers
    hashtags = [re.sub(r'[^a-z0-9\s]', '', word) for word in hashtags]
    # Remove empty strings (in case stemming or other operations result in an empty hashtag)
    hashtags = [word for word in hashtags if word]

    # LINE

    # Remove emojis using demoji
    line = demoji.replace(line, " ")
    # Remove hashtags from the text --> remove all the word
    line = re.sub(r'#\w+', '', line)
    # Remove URLs
    line = re.sub(r'http\S+|www\S+|https\S+', '', line)
    # Remove punctuation but keep numbers (and hashtags are already removed)
    line = re.sub(r'[^a-z0-9\s]', '', line)
    # Tokenize the text to get a list of terms
    line = line.split()
    # Remove the word amp since & it is parsed as &amp and it is one of the most frequent words!
    line = [word for word in line if word != 'amp']
    # Remove stopwords
    line = [word for word in line if word not in stop_words]
    # Perform stemming
    line = [stemmer.stem(word) for word in line]

    # Return preprocessed text and hashtags
    return line, hashtags # return the preprocessed tweet content and hashtags

# Load the original and processed tweets json files

If data is not loaded from json files, then it will need to be imported farmer's protest tweets file, create the dictionaries and apply build terms function. With the already created dictionaries (in json files), data is loaded faster!

Note: Data from json files came from the dictionaries created in the first part of the project.

First let's load processed tweets dictionary:

In [10]:
with open(data_path + 'processed_tweets.json', 'r') as f:
    processed_tweets_dict = json.load(f)

In [11]:
print(f"We have {len(processed_tweets_dict)} processed tweets!")

We have 48429 processed tweets!


Print a sample:

In [12]:
processed_tweets_dict["doc_37"]

{'tweet': ['remain',
  'unit',
  'dont',
  'fall',
  'govt',
  'tactic',
  'tri',
  'hard',
  'creat',
  'divis',
  'within',
  'protest'],
 'hashtags': ['farmersprotest',
  'modiignoringfarmersdeaths',
  'farmersprotests',
  'modidontsellfarmers']}

Now let's load the original tweets dictionary:

In [13]:
with open(data_path + 'original_tweets.json', 'r') as f:
    original_tweets_dict = json.load(f)

Print same sample as before:

In [14]:
original_tweets_dict["doc_37"]

{'tweet': "Remain United 👇🏾👇🏾 Dont fall into govt's tactics they are trying hard to create a division within the protests #FarmersProtest #ModiIgnoringFarmersDeaths #FarmersProtests #ModiDontSellFarmers https://t.co/4eX8ckNLJT",
 'hashtags': ['#FarmersProtest',
  '#ModiIgnoringFarmersDeaths',
  '#FarmersProtests',
  '#ModiDontSellFarmers'],
 'date': '2021-02-24T08:57:35+00:00',
 'likes': 1,
 'retweets': 1,
 'url': 'https://twitter.com/B56Ricky/status/1364499704147374082',
 'comment_count': 0}

# <b>RANKING 1: </b> TF-IDF & COSINE SIMILARITY

## Inverted Index

We are using the same full inverted index as the one used in the previous part:

In [15]:
def create_index(tweets):
    """
    Description:
    Creates a full inverted index based on the content of the dictionary

    Input:
    tweets -- dictionary with 'tweet' variable that contains the processed tweet content.

    Output:
    full inverted index with all the corpus processed terms, documents where they appear and its corresponding positions
    """
    # init the dictionary
    index = defaultdict(list)

    for tweet_id, tweet_data in tweets.items():
        # get current tweet content (pre-processed)
        terms = tweet_data['tweet']
        # inverted index for current tweet
        current_tweet_index = defaultdict(lambda: [tweet_id, array('I')])  # defaultdict with list of positions
        #  Counts occurrences of each term in the tweet
        term_counts = defaultdict(int)

        # Build term frequency for this tweet and position lists
        for position, term in enumerate(terms):
            current_tweet_index[term][1].append(position)
            term_counts[term] += 1
        # Update index
        for term, (id, positions) in current_tweet_index.items():
            index[term].append([tweet_id, positions])
    return index

Inverted Index for all tweets.

In [16]:
index = create_index(processed_tweets_dict)

Here is the inverted intex for 'rain' word:

In [17]:
print(index['rain'])

[['doc_555', array('I', [4])], ['doc_4049', array('I', [0])], ['doc_5773', array('I', [10])], ['doc_6354', array('I', [5])], ['doc_10384', array('I', [9])], ['doc_11880', array('I', [4])], ['doc_12218', array('I', [1])], ['doc_14404', array('I', [16])], ['doc_16995', array('I', [15])], ['doc_18469', array('I', [13])], ['doc_24960', array('I', [9])], ['doc_25326', array('I', [3])], ['doc_25871', array('I', [6])], ['doc_26069', array('I', [2])], ['doc_32517', array('I', [7, 25])], ['doc_35314', array('I', [1])], ['doc_38132', array('I', [13])], ['doc_39395', array('I', [1])], ['doc_39695', array('I', [3])], ['doc_39879', array('I', [8])], ['doc_41832', array('I', [8])]]


## TF-IDF Implementation

In [18]:
def create_tfidf(tweets, num_documents):
    """
    Optimized inverted index creation with TF, DF, and IDF calculations.

    Input:
    tweets -- doctionary of preprocessed tweets
    num_documents -- total number of tweets in the corpus

    Output:
    tf - normalized term frequency for each term in each document
    df - number of documents each term appear in
    idf - inverse document frequency of each term
    """

    tf = defaultdict(list)  # term frequencies of terms in documents
    df = defaultdict(int)    # document frequencies of terms in the corpus
    idf = {}

    for tweet_id, tweet_data in tweets.items():
        terms = tweet_data['tweet']
        current_tweet_index = defaultdict(lambda: [tweet_id, array('I')])  # defaultdict with list of positions
        term_counts = defaultdict(int)  # Counts occurrences of each term in the tweet

        # Build term frequency for this tweet and position lists
        for position, term in enumerate(terms):
            current_tweet_index[term][1].append(position)
            term_counts[term] += 1

        # Calculate L2 norm for the tweet (once per document)
        norm = math.sqrt(sum(count ** 2 for count in term_counts.values()))

        # Update index, TF, and DF
        for term, (id, positions) in current_tweet_index.items():
            # Normalized term frequency
            tf_value = term_counts[term] / norm
            tf[term].append(np.round(tf_value, 4))

            # Increment DF count (only once per document per term)
            df[term] += 1
    # Compute IDF for each term
    for term, doc_count in df.items():
        idf[term] = np.round(np.log(num_documents / doc_count), 4)

    return tf, df, idf


In [19]:
start_time = time.time()
num_documents = len(processed_tweets_dict.keys())
tf, df, idf = create_tfidf(processed_tweets_dict, num_documents)
print("Total time to create the tf - idf - df dictionaries: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the tf - idf - df dictionaries: 11.42 seconds


## Bert Similarity

Import a BERT model to compute the similarity between queries and hashtags: <br>

Note: This is part of our score implementation.

In [20]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # multi-language model

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
def get_embeddings(name_list):

    """
    Generates embeddings for a list of names using a pre-trained model.

    Input:
    name_list -- list of names (strings) for which embeddings are to be generated

    Output:
    embedding -- list of embeddings for each name in name_list
    """

    embedding = model.encode(name_list, convert_to_tensor=False)
    #print("Embedding shape: ", embedding.shape)
    return embedding

def bert_similarity(target_name, embedding, name_list, threshold=0.5):

    """
    Computes the cosine similarity between a target name and a list of names
    based on their embeddings. Returns the highest similarity score if it
    exceeds a threshold, otherwise returns 0.

    Input:
    target_name -- string, the name to compare against name_list
    embedding -- list of embeddings for each name in name_list (output of get_embeddings)
    name_list -- list of names (strings) corresponding to the embeddings
    threshold -- float, minimum similarity score to consider as a match (default 0.5)

    Output:
    best_match[1] -- float, the highest similarity score above the threshold
                     or 0 if no scores exceed the threshold
    """

    if pd.isna(target_name): return None

    target_embedding = model.encode(target_name, convert_to_tensor=False)

    # Compute cosine similarity scores
    cosine_scores = util.cos_sim(target_embedding, embedding)

    # Create a dictionary to store university names and their similarity scores
    similarity_scores = {}
    for i, name in enumerate(name_list):
        similarity_scores[name] = cosine_scores[0][i].item()

    # Sort the scores in descending order
    sorted_scores = dict(sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True))

    best_match = list(sorted_scores.items())[0]

    if best_match[1] > threshold:
        return best_match[1]
    else:
        return 0

Compute the best hashtags scores based on a given query:

In [22]:
def best_score (list_hashtags, query,threshold=0.5):

    """
    Finds the highest similarity score between a query and a list of hashtags
    using BERT embeddings, returning the score if it exceeds the threshold (0.5).

    Input:
    list_hashtags -- list of hashtags (strings) for which embeddings are to be generated
    query -- string, the query hashtag to compare against list_hashtags
    threshold -- decision point

    Output:
    float -- highest similarity score if it exceeds 0.5, otherwise 0
    """

    embeddings = get_embeddings(list_hashtags)
    return bert_similarity(query, embeddings, list_hashtags, threshold=threshold)

## Ranking Functions

Now let's rank documents as follows:
- Given a query, we are creating a final weight with weighted sum of the cosine similarity between query and tweet content, and the BERT similarity between the query combinations and tweet hashtags. That is, if we have two same tweets but the first one has a hashtag relevant to the query and the other one no, then we are giving more score to the second tweet for the ranking.

Note 1: We are creating unions of two words of the query so that we're creating different hashtags from our input query. That is, if the input is "Support farmers of India", we are computing the similarity of hashtags vs "supportfarmer", "supportindia" & "farmerindia" (note that preprocessed query is "support farmer india"). <br>

Note 2: If flag = False, the score is computed as TF-IDF + cosine similarity. For flag = True, the bert similarity is added for an additional weight based on the hashtags similarity with the given query.


In [23]:
def rank_documents(query, docs, index,processed_tweets, tf, idf, alpha=1, beta=3, flag=False):
    """
    Ranks documents based on their relevance (in descending order) to the query using TF-IDF scores.

    Input:
        query: The search query as a list of terms (preprocessed)
        docs: A list of document IDs to be ranked.
        index: The inverted index for tweets.
        processed_tweets: processed tweets dictionary
        tf: term frequency dictionary
        idf: The inverse document frequency dictionary.
        alpha: Weight for the TF-IDF score.
        beta: Weight for the similarity score between hashtags.
        flag: boolean which indicates whether to use or not the hashtag similarity score (additional to the tf-idf)

    Output:
        A list of document IDs sorted by their relevance to the query and their scores
    """
    doc_vectors = defaultdict(lambda: [0] * len(query))  # Initialize all scores to 0
    query_vector = [0] * len(query)

    query_terms_count = collections.Counter(query)

    query_norm = la.norm(list(query_terms_count.values()))


    for termIndex, term in enumerate(query):
        if term not in index:
        # as the term does not exist, return anything
            return []

        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in docs:
            #if doc in tf and term in tf[doc] and doc in tfidf and term in tfidf[doc]:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]

    # array d'arrays
    scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]

    if flag:
        # compute the similarity between query and hashtags --> we are joining query together as hashtags are one-word terms
        for doc_id in docs:
            # perform combinations of 2 words
            score_tweet = 0
            for i in range(len(query)):
                for j in range(i+1,len(query)):
                    temp = best_score(processed_tweets_dict[doc_id]['hashtags'], query[i]+query[j])
                    score_tweet = max(score_tweet,temp)

            for i in range(len(scores)):
                if scores[i][1] == doc_id:
                    scores[i][0] = (alpha * scores[i][0] + beta * score_tweet)
                    break  # Exit the loop once the doc_id is found

    scores.sort(reverse=True)
    ranked_tweets = [x[1] for x in scores]

    # Sort documents by relevance score (descending)
    return ranked_tweets, scores

def search_tf_idf(query, index, tf, idf, processed_tweets,flag=False):
    """
    Searches for documents relevant to the query using TF-IDF scores.

    Input:
        query: The search query as a string.
        index: The inverted index.
        tf: term frequency term - document
        idf: The inverse document frequency dictionary per term
        processed_tweets: processed tweets dictionary
        flag: boolean which indicates whether to use or not the hashtag similarity score (additional to the tf-idf)

    Output:
        A list of document IDs sorted by their relevance to the query.
    """
    query_terms = build_terms(query)[0]  # Preprocess the query

    docs = set(processed_tweets.keys())
    for term in query_terms:
        try:
            # store in term_docs the ids of the docs that contain "term"
            term_docs=[posting[0] for posting in index[term]]

            # docs = docs Intersection term_docs --> applying AND method for tweets seach
            docs = docs.intersection(set(term_docs))
        except:
            #term is not in index
            pass
    docs = list(docs)
    print("Number of documents found: {}".format(len(docs)))

    #query_hash = process_query(query)

    ranked_tweets,scores = rank_documents(query_terms, docs, index, processed_tweets, tf, idf,flag = flag)
    return ranked_tweets,scores

## Queries test for TF-IDF + COSINE SIMILARITY

Let's try of search method with the query “Support Farmers of India”:

Note: Second search may last for 1 minute.

In [24]:
query = "Support Farmers of India"
# first without hashtag similarity weight
flag = False
ranked_tweets,scores = search_tf_idf(query, index, tf, idf, processed_tweets_dict,flag)
top = 20
print(f"Top {top} documents without hashtag similarity:")
print(ranked_tweets[:top])


Number of documents found: 367
Top 20 documents without hashtag similarity:
['doc_23455', 'doc_1922', 'doc_26563', 'doc_22269', 'doc_21005', 'doc_27466', 'doc_7053', 'doc_7052', 'doc_7048', 'doc_7046', 'doc_7043', 'doc_7041', 'doc_7040', 'doc_7038', 'doc_7035', 'doc_7034', 'doc_7031', 'doc_7028', 'doc_7025', 'doc_7021']


In [25]:
# Now with hashtag similarity weight
flag = True
ranked_tweets,scores = search_tf_idf(query, index, tf, idf, processed_tweets_dict,flag)
print(f"Top {top} documents with hashtag similarity:")
print(ranked_tweets[:top])

Number of documents found: 367
Top 20 documents with hashtag similarity:
['doc_26563', 'doc_21005', 'doc_19136', 'doc_25316', 'doc_23455', 'doc_1922', 'doc_42666', 'doc_967', 'doc_32483', 'doc_17175', 'doc_22625', 'doc_26453', 'doc_22269', 'doc_25609', 'doc_47063', 'doc_5482', 'doc_27466', 'doc_22096', 'doc_27735', 'doc_7053']


Here are the top 1 tweet WITHOUT hashtag similarity for query = "Support Farmers of India":

In [26]:
print("Best result for search WITHOUT hashtag similarity:")
original_tweets_dict['doc_23455']

Best result for search WITHOUT hashtag similarity:


{'tweet': 'Supporting farmers in India! #FarmersProtest https://t.co/o9I0uENPe4',
 'hashtags': ['#FarmersProtest'],
 'date': '2021-02-17T14:53:00+00:00',
 'likes': 1,
 'retweets': 0,
 'url': 'https://twitter.com/PreetiPMenon/status/1362052432071708675',
 'comment_count': 0}


Here are the top 1 tweet WITH hashtag similarity for query = "Support Farmers of India":

In [27]:
print("Best result for search WITH hashtag similarity:")
original_tweets_dict['doc_26563']

Best result for search WITH hashtag similarity:


{'tweet': 'Wake up India Support farmers 🙏😞  #FarmersMakeIndia #FarmersProtest https://t.co/aNybKF9EGs',
 'hashtags': ['#FarmersMakeIndia', '#FarmersProtest'],
 'date': '2021-02-17T01:34:06+00:00',
 'likes': 3,
 'retweets': 1,
 'url': 'https://twitter.com/Gursewak1991/status/1361851385742913542',
 'comment_count': 0}

# <b>RANKING 2:</b> Our Score + Cosine Similarity

Implementation for our score as:


- <b>Step 1:</b> popularity_score= (γ⋅likes + δ⋅retweets + ϵ⋅comments)*date<br>
where <b>date</b> is a recency factor

- <b>Step 2:</b> final_score= α⋅tf_idf_score + β⋅hashtag_similarity_score + Ɣ*popularity_score

- <b>Step 3:</b> Bert Similarity between hashtags and query (done in project part 2)




## Popularity score function

First we are creating a function to obtain the most recent tweet date so that it will be useful for the popularity score computation:

In [28]:
from datetime import datetime, timezone
import math

def get_newest_tweet(tweets):
    """
    Finds the newest tweet based on the date.

    Input:
        tweets: list - list of tweet JSON objects, each with a 'date' field in ISO format.

    Output:
        newest_tweet: dict - the tweet JSON object with the most recent date.
    """
    # Convert dates and find the tweet with the maximum date
    newest_tweet = max(tweets, key=lambda tweet: datetime.fromisoformat(tweet['date'].replace("Z", "+00:00")))
    return newest_tweet

In [29]:
most_recent = get_newest_tweet(original_tweets_dict.values())['date']
most_recent = datetime.fromisoformat(most_recent.replace("Z", "+00:00"))
print(f"The most recent tweet date is {most_recent}")

The most recent tweet date is 2021-02-24 09:23:35+00:00


In [30]:
def popularity_score(tweet,recent_date): # compute the popularity score defined above based on the tweet characteristics
    """
    Computes a popularity score based on tweets characteristics

    Input:
        tweet: tweet json object
        recent_date: date of the most recent tweet

    Output:
        Popularity Score (number)
    """
    likes = tweet['likes']
    retweets = tweet['retweets']
    comments = tweet['comment_count']
    date = tweet['date']
    date = datetime.fromisoformat(date.replace("Z", "+00:00"))


    # Calculate recency factor
    days_since_tweet = (recent_date - date).days
    recency_factor = math.exp(-0.05 * days_since_tweet)  # Decay factor based on days

    # Calculate popularity score
    score = (0.5 * likes + 0.5 * retweets + 0.3 * comments) * recency_factor

    return score

## Ranking function

In [31]:
def rank_documents(query, docs, index,processed_tweets,original_tweets_dict, tf, idf, alpha=1, beta=3, flag=False, recent_date=most_recent):
    """
    Ranks documents based on their relevance (in descending order) to the query using TF-IDF scores.

    Input:
        query: The search query as a list of terms (preprocessed)
        docs: A list of document IDs to be ranked.
        index: The inverted index for tweets.
        processed_tweets: processed tweets dictionary
        original_tweets_dict: original tweets dictionary
        tf: term frequency dictionary
        idf: The inverse document frequency dictionary.
        alpha: Weight for the TF-IDF score.
        beta: Weight for the similarity score between hashtags.
        flag: boolean which indicates whether to use or not the hashtag similarity score (additional to the tf-idf)
        recent_date: date of the most recent tweet

    Output:
        A list of document IDs sorted by their relevance to the query and their scores
    """
    doc_vectors = defaultdict(lambda: [0] * len(query))  # Initialize all scores to 0
    query_vector = [0] * len(query)

    query_terms_count = collections.Counter(query)

    query_norm = la.norm(list(query_terms_count.values()))


    for termIndex, term in enumerate(query):
        if term not in index:
        # as the term does not exist, return anything
            return []

        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in docs:
            #if doc in tf and term in tf[doc] and doc in tfidf and term in tfidf[doc]:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]

    # array d'arrays
    scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]

    if flag:
        # compute the similarity between query and hashtags --> we are joining query together as hashtags are one-word terms
        for doc_id in docs:
            # perform combinations of 2 words
            score_tweet = 0
            for i in range(len(query)):
                for j in range(i+1,len(query)):
                    temp = best_score(processed_tweets_dict[doc_id]['hashtags'], query[i]+query[j])
                    score_tweet = max(score_tweet,temp)

            for i in range(len(scores)):
                if scores[i][1] == doc_id:
                    # for the BERT similarity between query terms and hashtags
                    scores[i][0] = (alpha * scores[i][0] + beta * score_tweet)
                    # for the popularity score
                    temp = popularity_score(original_tweets_dict[doc_id],recent_date)
                    scores[i][0] = scores[i][0] + temp

                    break  # Exit the loop once the doc_id is found

    scores.sort(reverse=True)
    ranked_tweets = [x[1] for x in scores]

    # Sort documents by relevance score (descending)
    return ranked_tweets, scores

def search_tf_idf(query, index, tf, idf, processed_tweets,original_tweets_dict,flag,most_recent):
    """
    Searches for documents relevant to the query using TF-IDF scores.

    Input:
        query: The search query as a string.
        index: The inverted index.
        tf: term frequency term - document
        idf: The inverse document frequency dictionary per term
        processed_tweets: processed tweets dictionary
        original_tweets_dict: processed tweets dictionary
        flag: boolean which indicates whether to use or not the hashtag similarity score (additional to the tf-idf)
        most_recent: date of the most recent tweet

    Output:
        A list of document IDs sorted by their relevance to the query.
    """
    query_terms = build_terms(query)[0]  # Preprocess the query

    docs = set(original_tweets_dict.keys())
    for term in query_terms:
        try:
            # store in term_docs the ids of the docs that contain "term"
            term_docs=[posting[0] for posting in index[term]]

            # docs = docs Intersection term_docs --> applying AND method for tweets seach
            docs = docs.intersection(set(term_docs))
        except:
            #term is not in index
            pass
    docs = list(docs)
    print("Number of documents found: {}".format(len(docs)))

    #query_hash = process_query(query)

    ranked_tweets,scores = rank_documents(query_terms, docs, index, processed_tweets,original_tweets_dict, tf, idf,flag = flag,recent_date=most_recent)
    return ranked_tweets,scores

## Queries for OUR SCORE method

This may take 1 minute, depending on the number of results for the given query:

In [32]:
query = "Support Farmers of India"
flag = True
ranked_tweets,scores = search_tf_idf(query, index, tf, idf, processed_tweets_dict,original_tweets_dict,flag,most_recent)
top = 20
print(f"Top {top} documents with hashtag similarity + popularity score:")
print(ranked_tweets[:top])

Number of documents found: 367
Top 20 documents with hashtag similarity + popularity score:
['doc_37099', 'doc_6462', 'doc_32035', 'doc_19518', 'doc_38015', 'doc_44945', 'doc_28694', 'doc_8439', 'doc_29886', 'doc_28348', 'doc_26453', 'doc_27922', 'doc_34789', 'doc_23250', 'doc_28457', 'doc_37497', 'doc_5169', 'doc_44218', 'doc_8153', 'doc_15303']


In [33]:
print("The top 1 tweet for the previous query:")
original_tweets_dict['doc_37099']

The top 1 tweet for the previous query:


{'tweet': 'Disha Ravi is 21 yrs\n\nA climate activist from India she campaigns for clean air, clean water and a liveable planet\n\nShe is now facing state sanctioned violence for peacefully supporting farmers\n\nSilence is not an option we must all condemn this act of suppression\n\n#FarmersProtest',
 'hashtags': ['#FarmersProtest'],
 'date': '2021-02-14T14:59:11+00:00',
 'likes': 2843,
 'retweets': 1525,
 'url': 'https://twitter.com/ClaudiaWebbe/status/1360966826298138629',
 'comment_count': 1065}

In [34]:
print("The top 2 tweet for the previous query:")
original_tweets_dict['doc_6462']

The top 2 tweet for the previous query:


{'tweet': "In US, cotton farmers receive a massive subsidy support of Rs 85.10 lakh ($1,17,494) per farmer/year. In India, it is barely Rs 1,455 ($27) [Source:IIFT]. This is how US has 'market competitiveness' in cotton. Our cotton growers are left to commit suicide. #FarmersProtest",
 'hashtags': ['#FarmersProtest'],
 'date': '2021-02-22T09:13:28+00:00',
 'likes': 1590,
 'retweets': 723,
 'url': 'https://twitter.com/Devinder_Sharma/status/1363778926670012416',
 'comment_count': 37}

As we can see now, the tweets shown are more relevant so that they may contain similar content (as shown in td-idf + cosine similarity), but now top tweets are more relevant.

# <b>RANKING 3:</b> BM25

In this section we are implementing BM25 algorithm for the search of tweets for a given query:

In [35]:
!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


## Algorithm Implementation

In [36]:
from collections import Counter

In [37]:
def bm25_score(query, doc, idf, avgdl, k1=1.5, b=0.75):
    """
    Compute the bm25 score for a tweet given a query

    Input:
      query - processed query
      doc - processed tweet content
      idf - inverted document frequency dictionary
      avgdl - average document length
      k1 - tuning parameter
      b - tuning parameter

    Output:
      bm25 score (number)
    """
    # document has to contain ALL QUERY TERMS
    if not all(term in doc for term in query):
        return 0.0  # 0 score for not containing all query terms


    score = 0.0
    doc_len = len(doc)
    doc_freq = Counter(doc)
    for term in query:
        if term in doc_freq:
            f = doc_freq[term]
            score += idf.get(term, 0) * ((f * (k1 + 1)) / (f + k1 * (1 - b + b * (doc_len / avgdl))))
    return score

tweet_ids = list(processed_tweets_dict.keys())
tokenized_tweets = [data['tweet'] for data in processed_tweets_dict.values()]
avgdl = sum(len(doc) for doc in tokenized_tweets) / len(tokenized_tweets)

## Ranking for BM25

In [38]:
original = "Support Farmers of India"
query = build_terms(original)[0]

#BM25 score for each document
results = []
for tweet_id, tweet_content in zip(tweet_ids, tokenized_tweets):
    score = bm25_score(query, tweet_content, idf, avgdl) # idf computed at the beginning of the notebook!
    results.append({'id': tweet_id, 'score': score})

results = sorted(results, key=lambda x: x['score'], reverse=True)
# top results
top = 20
print(results[:top])

[{'id': 'doc_1922', 'score': 8.369238535920596}, {'id': 'doc_23455', 'score': 8.369238535920596}, {'id': 'doc_22269', 'score': 7.895476736112643}, {'id': 'doc_26563', 'score': 7.895476736112643}, {'id': 'doc_27466', 'score': 7.863373058556089}, {'id': 'doc_21005', 'score': 7.666260851366665}, {'id': 'doc_11660', 'score': 7.472478304943371}, {'id': 'doc_15369', 'score': 7.472478304943371}, {'id': 'doc_19136', 'score': 7.472478304943371}, {'id': 'doc_21596', 'score': 7.472478304943371}, {'id': 'doc_22101', 'score': 7.472478304943371}, {'id': 'doc_25316', 'score': 7.472478304943371}, {'id': 'doc_32785', 'score': 7.472478304943371}, {'id': 'doc_42666', 'score': 7.472478304943371}, {'id': 'doc_4323', 'score': 7.4505931147254145}, {'id': 'doc_4325', 'score': 7.4505931147254145}, {'id': 'doc_4333', 'score': 7.4505931147254145}, {'id': 'doc_4334', 'score': 7.4505931147254145}, {'id': 'doc_4336', 'score': 7.4505931147254145}, {'id': 'doc_4337', 'score': 7.4505931147254145}]


In [39]:
print(f"QUERY = {original}")

QUERY = Support Farmers of India


In [40]:
print("Top 1 tweet for the previous query:")
original_tweets_dict['doc_1922']

Top 1 tweet for the previous query:


{'tweet': '#FarmersProtest  I support farmers in india',
 'hashtags': ['#FarmersProtest'],
 'date': '2021-02-23T18:31:49+00:00',
 'likes': 0,
 'retweets': 0,
 'url': 'https://twitter.com/nowTekTalk1/status/1364281828283187205',
 'comment_count': 0}

In [41]:
print("Top 2 tweet for the previous query:")
original_tweets_dict['doc_23455']

Top 2 tweet for the previous query:


{'tweet': 'Supporting farmers in India! #FarmersProtest https://t.co/o9I0uENPe4',
 'hashtags': ['#FarmersProtest'],
 'date': '2021-02-17T14:53:00+00:00',
 'likes': 1,
 'retweets': 0,
 'url': 'https://twitter.com/PreetiPMenon/status/1362052432071708675',
 'comment_count': 0}

As we can see here, the length of the document is normalized so that we have as top results short tweets.  

# **RANKING 4**: Word2vec + cosine similarity

In this section, we are implementing a ranking model from the embeddings of the tweets generated with the word2vec model from gensim (the library we used in the previous lab for the visualization part). Once we have the vectors, we apply the cosine similarity to retrieve the top-20 relevant tweets.

## Implementation

In [42]:
from gensim.models.word2vec import Word2Vec
from numpy import dot
from numpy.linalg import norm
import numpy as np

In [43]:
# State the model from our tweet corpus
corpus = [tweet_data['tweet'] for tweet_data in processed_tweets_dict.values()]
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)

In [46]:
# Helper function to compute the average embedding for a list of words
def average_vector(words, model):

    """
    Compute the average embedding vector for a list of words.

    Parameters:
        words (list of str): List of words for which the average embedding is computed.
        model (gensim.models.KeyedVectors): Trained word embedding model used to retrieve the vector for each word.

    Returns:
        The average embedding vector for the words provided, with the same dimension
            as the model's word vectors. If none of the words are in the model's vocabulary,
            returns a zero vector.
    """

    # Only use words that are in the vocabulary
    vectors = [model.wv[word] for word in words if word in model.wv.key_to_index]
    # Return the mean vector, or a zero vector if no words are in the model vocabulary
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Function to retrieve the top 20 most similar tweets to a given query
def get_top_20_tweets(query, model, processed_tweets_dict, original_tweets_dict):

    # Tokenize the query (assumes a simple whitespace split; adjust if needed)
    query_words = build_terms(query)

    # Calculate the vector for the query
    query_vector = average_vector(query_words[0], model)

    cosine_similarities = {}

    for tweet_id, tweet_data in processed_tweets_dict.items():
        tweet_words = tweet_data['tweet']

        # Ensure the tweet contains all terms from the query
        if all(term in tweet_words for term in query_words[0]):
            # Convert the tweet words into a vector embedding
            tweet_vector = average_vector(tweet_words, model)

            # Compute cosine similarity between the query and tweet vectors
            if np.any(query_vector) and np.any(tweet_vector):  # Ensure vectors are not all zeros
                cosine_similarity = dot(query_vector, tweet_vector) / (norm(query_vector) * norm(tweet_vector))
                cosine_similarities[tweet_id] = cosine_similarity

    # Sort tweets by similarity and get the top 20
    top_20_tweets = sorted(cosine_similarities, key=cosine_similarities.get, reverse=True)[:20]

    # Collect the results as a list of tuples (tweet_id, similarity, tweet_text)
    results = [(tweet_id, cosine_similarities[tweet_id], original_tweets_dict[tweet_id]) for tweet_id in top_20_tweets]

    return results

## Query 1: Are farmers being respected in India?

In [47]:
query = "Are farmers being respected in India?"
top_20_tweets = get_top_20_tweets(query, model, processed_tweets_dict, original_tweets_dict)

# Print the top 20 tweets
print(f"\nTop 20 most similar tweets for query {query}:")
for tweet_id, similarity, tweet_text in top_20_tweets:
    print(f"\nTweet ID: {tweet_id}, Similarity: {similarity}")
    print(f"Tweet text: {tweet_text}")


Top 20 most similar tweets for query Are farmers being respected in India?:

Tweet ID: doc_43729, Similarity: 0.9656084179878235
Tweet text: {'tweet': 'In India everything on sale\nBut not our respected Farmers \nSalute to them\n\n#RepealOnlyWayAhead \n#FarmersProtest https://t.co/LmU8NeZmNd', 'hashtags': ['#RepealOnlyWayAhead', '#FarmersProtest'], 'date': '2021-02-13T02:19:59+00:00', 'likes': 9, 'retweets': 13, 'url': 'https://twitter.com/_BrAr93/status/1360413380196982795', 'comment_count': 0}

Tweet ID: doc_26131, Similarity: 0.9315985441207886
Tweet text: {'tweet': 'Respect farmers #FarmersMakeIndia \ncountless human rights orgs, farmers &amp; activists stand in solidarity with farmers protesting in India \n#FarmersProtest https://t.co/xgUd8rCVMo', 'hashtags': ['#FarmersMakeIndia', '#FarmersProtest'], 'date': '2021-02-17T02:31:18+00:00', 'likes': 0, 'retweets': 0, 'url': 'https://twitter.com/Mkaur_pb/status/1361865779914756097', 'comment_count': 0}

Tweet ID: doc_25499, Similarity

## Query 2: Are people supporting Farmers?

In [48]:
query = "Are people supporting Farmers?"
top_20_tweets = get_top_20_tweets(query, model, processed_tweets_dict, original_tweets_dict)

# Print the top 20 tweets
print(f"\nTop 20 most similar tweets for query {query}:")
for tweet_id, similarity, tweet_text in top_20_tweets:
    print(f"\nTweet ID: {tweet_id}, Similarity: {similarity}")
    print(f"Tweet text: {tweet_text}")


Top 20 most similar tweets for query Are people supporting Farmers?:

Tweet ID: doc_25223, Similarity: 1.0
Tweet text: {'tweet': 'Support farmers people #FarmersProtest #FarmersMakelndia https://t.co/m6IJvMdHFt', 'hashtags': ['#FarmersProtest', '#FarmersMakelndia'], 'date': '2021-02-17T04:46:26+00:00', 'likes': 2, 'retweets': 1, 'url': 'https://twitter.com/Be1Benipal/status/1361899786807386115', 'comment_count': 0}

Tweet ID: doc_41400, Similarity: 0.9976512789726257
Tweet text: {'tweet': 'TN PEOPLE SUPPORTING FARMERS #FarmersProtest #StandWithFarmers #FarmersProtest', 'hashtags': ['#FarmersProtest', '#StandWithFarmers', '#FarmersProtest'], 'date': '2021-02-13T16:29:39+00:00', 'likes': 0, 'retweets': 0, 'url': 'https://twitter.com/karthikchella/status/1360627204107894787', 'comment_count': 0}

Tweet ID: doc_31432, Similarity: 0.9752092361450195
Tweet text: {'tweet': 'This is a farmers protest for farmers by farmers , common people !! Please keep supporting farmers if these people can 

## Query 3: Fight of farmers in India

In [49]:
query = "Fight of farmers in India"
top_20_tweets = get_top_20_tweets(query, model, processed_tweets_dict, original_tweets_dict)

# Print the top 20 tweets
print(f"\nTop 20 most similar tweets for query {query}:")
for tweet_id, similarity, tweet_text in top_20_tweets:
    print(f"\nTweet ID: {tweet_id}, Similarity: {similarity}")
    print(f"Tweet text: {tweet_text}")


Top 20 most similar tweets for query Fight of farmers in India:

Tweet ID: doc_11657, Similarity: 0.9981187582015991
Tweet text: {'tweet': 'How the Diaspora in Britain Is Fighting for India’s Farmers\n\n#FarmersProtest \nhttps://t.co/2EUnfVl6b1', 'hashtags': ['#FarmersProtest'], 'date': '2021-02-20T18:32:52+00:00', 'likes': 1, 'retweets': 2, 'url': 'https://twitter.com/sikhbeard/status/1363194927761551362', 'comment_count': 0}

Tweet ID: doc_17718, Similarity: 0.9845056533813477
Tweet text: {'tweet': 'Farmers are fighting for not only their future but also for India\n#FarmersProtest \n#ReleaseDetainedFarmers https://t.co/abfKP8hduX', 'hashtags': ['#FarmersProtest', '#ReleaseDetainedFarmers'], 'date': '2021-02-19T04:00:22+00:00', 'likes': 3, 'retweets': 2, 'url': 'https://twitter.com/Jass_k_G/status/1362612968257695751', 'comment_count': 0}

Tweet ID: doc_26322, Similarity: 0.9730579853057861
Tweet text: {'tweet': 'Farmer feeds India. Let\'s fight for the "FARMER RIGHTS INDIA"\n\n #Far

## Query 4: Impact of the Protests in India

In [50]:
query = "Impact of the Protests in India"
top_20_tweets = get_top_20_tweets(query, model, processed_tweets_dict, original_tweets_dict)

# Print the top 20 tweets
print(f"\nTop 20 most similar tweets for query {query}:")
for tweet_id, similarity, tweet_text in top_20_tweets:
    print(f"\nTweet ID: {tweet_id}, Similarity: {similarity}")
    print(f"Tweet text: {tweet_text}")


Top 20 most similar tweets for query Impact of the Protests in India:

Tweet ID: doc_45973, Similarity: 0.9275013208389282
Tweet text: {'tweet': "The farmers' protest in India is impacting Bollywood with likely after-effects - find out more:\n\nhttps://t.co/YQB7fWVkWQ\n\n#bollywood #FarmersProtest #farmersrprotest #Indian #IndiaTogether", 'hashtags': ['#bollywood', '#FarmersProtest', '#farmersrprotest', '#Indian', '#IndiaTogether'], 'date': '2021-02-12T11:05:02+00:00', 'likes': 1, 'retweets': 0, 'url': 'https://twitter.com/DESIblitz/status/1360183124404166656', 'comment_count': 0}

Tweet ID: doc_23037, Similarity: 0.923944354057312
Tweet text: {'tweet': "Why are farmers protesting in India, how is the Indian government responding, and what is the impact of the #FarmersProtest around the world? Listen via @thecarolinadesi's latest podcast episode --&gt; https://t.co/bkjuCleFMX. Take action --&gt; https://t.co/wQt69t7gOe https://t.co/kGe8IjfRWU", 'hashtags': ['#FarmersProtest'], 'date':

## Query 5: Protest against Indian Government

In [51]:
query = "Protest against Indian Government"
top_20_tweets = get_top_20_tweets(query, model, processed_tweets_dict, original_tweets_dict)

# Print the top 20 tweets
print(f"\nTop 20 most similar tweets for query {query}:")
for tweet_id, similarity, tweet_text in top_20_tweets:
    print(f"\nTweet ID: {tweet_id}, Similarity: {similarity}")
    print(f"Tweet text: {tweet_text}")


Top 20 most similar tweets for query Protest against Indian Government:

Tweet ID: doc_30422, Similarity: 0.9772749543190002
Tweet text: {'tweet': 'Why are Indian farmers protesting against the government?\n#FarmersProtest  https://t.co/eMUGoXtabZ', 'hashtags': ['#FarmersProtest'], 'date': '2021-02-16T03:39:08+00:00', 'likes': 1, 'retweets': 1, 'url': 'https://twitter.com/manjitghuman58/status/1361520461381738496', 'comment_count': 0}

Tweet ID: doc_17324, Similarity: 0.9623017311096191
Tweet text: {'tweet': 'The Indian Government immediately stop its escalating crackdown on protesters!! #FarmersProtest #ReleaseDetainedFarmers https://t.co/rnv7Lr7z0P', 'hashtags': ['#FarmersProtest', '#ReleaseDetainedFarmers'], 'date': '2021-02-19T06:34:04+00:00', 'likes': 3, 'retweets': 2, 'url': 'https://twitter.com/reet_kaurz/status/1362651648234029056', 'comment_count': 0}

Tweet ID: doc_18061, Similarity: 0.9623017311096191
Tweet text: {'tweet': 'The Indian Government immediately stop its escalat

# Other word embedding tests

In [52]:
import numpy as np
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
#from sentence_transformers import SentenceTransformer  # for SBERT
#import tensorflow_hub as hub
from scipy import linalg
from sklearn.metrics.pairwise import cosine_similarity

# Load or initialize each model

# Create the Doc2vec model
corpus = [TaggedDocument(words=tweet_data['tweet'], tags=[str(i)])
          for i, tweet_data in enumerate(processed_tweets_dict.values())] # Create corpus of TaggedDocument objects, each with a unique ID

model = Doc2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4) # Initialize and train the Doc2Vec model

In [53]:
# Helper function to compute cosine similarity
def compute_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (linalg.norm(vec1) * linalg.norm(vec2))

# Testing function for USE, Doc2Vec, and SBERT
def test_similarity_models(query, processed_tweets_dict, original_tweets_dict):
    # Convert query to embeddings
    query_doc2vec_vector = model.infer_vector(query.split())

    # Initialize similarity dictionaries
    similarities_doc2vec = {}

    # Loop through tweets and calculate similarity for each model
    for tweet_id, tweet_data in processed_tweets_dict.items():
        tweet_words = tweet_data['tweet']
        tweet_text = " ".join(tweet_words)  # Convert list of words to a single string for USE/SBERT

        # Doc2Vec similarity
        tweet_doc2vec_vector = model.infer_vector(tweet_words)
        similarities_doc2vec[tweet_id] = compute_similarity(query_doc2vec_vector, tweet_doc2vec_vector)

    # Get top 5 similar tweets for each model
    top_5_doc2vec = sorted(similarities_doc2vec, key=similarities_doc2vec.get, reverse=True)[:5]

    # Print results

    print("\nTop 5 tweets using Doc2Vec:")
    for tweet_id in top_5_doc2vec:
        print(f"Tweet ID: {tweet_id}, Similarity: {similarities_doc2vec[tweet_id]}")
        print(f"Tweet Text: {original_tweets_dict[tweet_id]}")

# Example usage
query = "Support Farmers of India"
test_similarity_models(query, processed_tweets_dict, original_tweets_dict)



Top 5 tweets using Doc2Vec:
Tweet ID: doc_6635, Similarity: 0.3356837091736434
Tweet Text: {'tweet': 'These are same you beg every #election for votes. Are they humans?\n#FarmersProtest \n#ModiIgnoringFarmersDeaths https://t.co/W3PD088GQF', 'hashtags': ['#election', '#FarmersProtest', '#ModiIgnoringFarmersDeaths'], 'date': '2021-02-22T07:47:11+00:00', 'likes': 0, 'retweets': 0, 'url': 'https://twitter.com/juspreetsra/status/1363757212389179392', 'comment_count': 0}
Tweet ID: doc_23115, Similarity: 0.3065656721115552
Tweet Text: {'tweet': 'Every one wants repeal of 3 Farm Acts and legal guarantee of MSP. #FarmersProtest #Revoke3FarmActs #FarmersMakeIndia @hrw @cnn @UNHumanRights @BBCWorld @AlJazeera_World @JoeBiden @TimUppal @amnesty @TanDhesi @JustinTrudeau @BorisJohnson @jacindaardern @Australia @nytimesworld https://t.co/sTnPzgrV3l', 'hashtags': ['#FarmersProtest', '#Revoke3FarmActs', '#FarmersMakeIndia'], 'date': '2021-02-17T16:51:48+00:00', 'likes': 2, 'retweets': 0, 'url': 'https