# Project part 3 - IRWA

Bernat Quintilla - 254530

Eugeni Soler - 253566

Roger Viader - 252282

In [1]:
from google.colab import drive
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

import pandas as pd
import numpy as np
from numpy import linalg as la
from array import array
import collections
from collections import defaultdict

import pickle
import time
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import preprocess_string
from sklearn.manifold import TSNE
import matplotlib.cm as cm
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
drive.mount('/content/drive')
nltk.download('stopwords')
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 0. RECOVER

In [3]:
# Dictionary of doc id to document info (from P1)
docs_df = pd.read_pickle('/content/drive/Shareddrives/IRWA/Practica3/pickles/docs_df.pkl')
docs_dict = docs_df.reset_index().set_index('doc_id').T.to_dict()
print("-"*15)
print("Document ids and document original text")
print(docs_df.head())

# Dictionary of doc id to processed text (fron P1)
docs_processed_df = pd.read_pickle('/content/drive/Shareddrives/IRWA/Practica3/pickles/docs_processed_df.pkl')
docs_processed = docs_processed_df.T.to_dict()
print("-"*15)
print("Document ids and document text processed")
print(docs_processed_df.head())

# DataFrame with the metrics of the tweets (rt and like, normalized) and their hashtags
# Generated from:
"""
  mapping_df = pd.read_csv('/content/drive/Shareddrives/IRWA/Practica 1/IRWA_data_2024/data/tweet_document_ids_map.csv')
  mapping_dict = dict(zip(mapping_df['id'], mapping_df['docId']))

  tweets_metrics = {}
  for t_id, tweet in tweets_dict.items():
    tweets_metrics[mapping_dict[t_id]] = {'rt':tweet['retweets'],'likes':tweet['likes'], 'hashtags':tweet['hashtags']}

  tweets_metrics_df = pd.DataFrame.from_dict(tweets_metrics, orient='index')
  tweets_metrics_df.rt = (tweets_metrics_df.rt - tweets_metrics_df.rt.mean())/ tweets_metrics_df.rt.std()
  tweets_metrics_df.likes = (tweets_metrics_df.likes - tweets_metrics_df.likes.mean())/ tweets_metrics_df.likes.std()

  with open('tweets_metrics.pickle', 'wb') as handle:
      pickle.dump(tweets_metrics_df, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""
docs_metrics = pickle.load(open('/content/drive/Shareddrives/IRWA/Practica3/pickles/tweets_metrics.pickle', 'rb'))
print("-"*15)
print("Document metrics for our own score")
print(docs_metrics.head())

# term, doc and inverse doc frequency (from P1&2)
with open('/content/drive/Shareddrives/IRWA/Practica3/pickles/tfidf_index.pkl', 'rb') as file:
    index, tf, df, idf = pickle.load(file)

# inverted index (from P1&2)
with open('/content/drive/Shareddrives/IRWA/Practica3/pickles/inverted_index.pkl', 'rb') as file:
    inverted_index = pickle.load(file)

# eliminate none values and modify the format of the dictionaries
for doc_id, terms in docs_processed.items():
    final_terms = []
    for term in terms.values():
      if term is not None:
        final_terms.append(term)
    docs_processed[doc_id] = final_terms
    docs_dict[doc_id] = docs_dict[doc_id]['text']

query_list = ["violent protest", "inflation price", "strike continuous", "human rights", "modi disaster"]

# print to see the final format to adapt it for the practice
print("-"*15)
print("Exemple with doc_0")
print(docs_processed['doc_0'])
print(docs_dict['doc_0'])

---------------
Document ids and document original text
                                                     text
doc_id                                                   
doc_0   The world progresses while the Indian police a...
doc_1   #FarmersProtest \n#ModiIgnoringFarmersDeaths \...
doc_2   @ReallySwara @rohini_sgh watch full video here...
doc_3   @mandeeppunia1 watch full video here https://t...
doc_4   @mandeeppunia1 watch full video here https://t...
---------------
Document ids and document text processed
           0           1        2      3           4      5       6      7   \
doc_0   world    progress   indian  polic        govt  still     tri   take   
doc_1  farmer  constantli  distroy   crop  throughout  india  realli  heart   
doc_2   watch        full    video   None        None   None    None   None   
doc_3   watch        full    video   None        None   None    None   None   
doc_4   watch        full    video   None        None   None    None   None   

      

In [4]:
# Code from part 1
def build_terms(line):
    line= line.lower()
    line= line.split()
    line= [token for token in line if token not in stop_words]
    line= [token for token in line if not (token.startswith("http") or token.startswith("#") or token.startswith("@"))]
    line = ' '.join(line)
    line = re.sub(r'[^a-zA-Z\s]', ' ', line)
    line= line.split()
    line= [stemmer.stem(token) for token in line]
    return line

def get_tokens(docs, preprocess=preprocess_string, verbose=10000):
    processed_docs = []
    for i, doc in enumerate(docs):
        yield preprocess(doc) # preprocess

        # print progress if needed
        if verbose > 0 and (i + 1) % verbose == 0:
            print(f"Progress: {i + 1}")
    return processed_docs

# Function from lab 1
def search(query, index):
    query = build_terms(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"
            term_docs=[posting[0] for posting in index[term]]
            docs = docs.union(term_docs)
        except:
            pass
    docs = list(docs)
    return docs

# 1. Rank results

## 1.1 TF-IDF + cosine similarity

In [5]:
def rank_documents(terms, docs, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    Arguments:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    Returns:
    List of ranked document IDs, sorted by relevance score (cosine similarity).
    """

    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue

        query_vector[termIndex] = (query_terms_count[term] / query_norm) * idf.get(term, 0)

        for doc_id, postings in index[term]:
            if doc_id in docs:
                doc_vectors[doc_id][termIndex] = tf[term].get(doc_id, 0) * idf.get(term, 0)

    doc_scores = [[np.dot(doc_vec, query_vector), doc_id] for doc_id, doc_vec in doc_vectors.items()]
    doc_scores.sort(reverse=True, key=lambda x: x[0])

    ranked_docs = [doc_id for score, doc_id in doc_scores]
    ranked_scores = [score for score, doc_id in doc_scores]

    if not ranked_docs:
        print("No results found. Please try again.")
        return []

    return ranked_docs, ranked_scores

In [6]:
n = 5
for query in query_list:
    print("TOP",n,"RELEVANT DOCS USING TF-IDF and COSINE SIMILARITY for query : '"+query+"'")
    docs = search(query, inverted_index)
    ranked_docs, ranked_scores = rank_documents(query, docs, index, idf, tf)
    for ind, d_id in enumerate(ranked_docs[:n]):
        print("-"*15+" "*4+str(ind+1)+" (SCORE: " +str(ranked_scores[ind])+") " + "-"*15)
        print("doc_id = {} - tweet: {}\n".format(d_id, docs_dict[d_id].replace("\n", " ")))
    print("=" * 50 + "\n")

TOP 5 RELEVANT DOCS USING TF-IDF and COSINE SIMILARITY for query :  violent protest
---------------    1(0.005635362582400001) ) ) ) ---------------
doc_id = doc_31123 - tweet: @sunilgadavi @GanduTraveler @pranav_whatever @zoo_bear How about you read them!! Clearly people who have know why farmers are protesting #farmersprotest. https://t.co/1mmWJ8v3UY

---------------    2(0.005333683740000001) ) ) ) ---------------
doc_id = doc_48223 - tweet: “Those who make peaceful revolution impossible will make violent revolution inevitable.” — John F. Kennedy  #MahapanchayatRevolution #FarmersProtest https://t.co/Jxgcg1NRhN

---------------    3(0.005332071972400001) ) ) ) ---------------
doc_id = doc_41980 - tweet: 👇👇 Those who make peaceful revolution impossible will make violent revolution inevitable. #FarmersProtest #NoFarmersNoFood

---------------    4(0.005319926685799999) ) ) ) ---------------
doc_id = doc_20856 - tweet: Everyone pls Attend this protest. #FarmersProtest  #isupportfarmers

## 1.2 Own Metric (query expansion, regarding likes, comments...)

In [37]:
# We will use the cosine similarity and add 3 factors based on the popularity of the tweets:
#   Factor1: likes, normalized on the corpus
#   Factor2: rt, normalized on the dataset
#   Factor3: percentage of the hashtags that contain each query words
# So our ranking method will be: Score = cos(θ) + Factor2/Factor1 + Factor3

# This will balance popular tweets but still enhance the shared ones and with meaningful hashtags

def own_ranking(query, docs: list, docs_metrics: pd.DataFrame, index, idf, tf):
    # Initialize vectors and factors dictionary
    doc_vectors = defaultdict(lambda: [0] * len(query))
    query_terms_count = collections.Counter(query)
    query_vector = [0] * len(query)
    query_norm = la.norm(list(query_terms_count.values()))
    factors_dict = {}

    # Normalize metrics
    tweets_metrics_df = docs_metrics.copy()
    tweets_metrics_df['rt'] = (tweets_metrics_df['rt'] - tweets_metrics_df['rt'].mean()) / tweets_metrics_df['rt'].std()
    tweets_metrics_df['likes'] = (tweets_metrics_df['likes'] - tweets_metrics_df['likes'].mean()) / tweets_metrics_df['likes'].std()
    metrics_dict = tweets_metrics_df.loc[docs].to_dict('index')

    for termIndex, term in enumerate(query):
        if term in index:
            query_vector[termIndex] = (query_terms_count[term] / query_norm) * idf.get(term, 0)

    # Calculate document vectors and factors
    for termIndex, term in enumerate(query):
        if term not in index:
            continue
        for doc_id, _ in index[term]:
            if doc_id in docs:
                # Populate doc_vectors for term weight
                doc_vectors[doc_id][termIndex] = tf[term].get(doc_id, 0) * idf.get(term, 0)

                # Fetch metrics for the document
                if doc_id not in factors_dict:
                    tweet_metrics = metrics_dict[doc_id]
                    hashtags = tweet_metrics.get('hashtags', [])
                    factor_3 = sum(1 for hashtag in hashtags if term in hashtag) / max(len(hashtags), 1)
                    # Store factors for the document
                    factors_dict[doc_id] = [tweet_metrics['likes'],tweet_metrics['rt'],factor_3]
                else:
                    # Increment factor_3 if term is in hashtag (avoids re-fetching DataFrame)
                    hashtags = metrics_dict[doc_id]['hashtags']
                    factors_dict[doc_id][2] += sum(1 for hashtag in hashtags if term in hashtag) / max(len(hashtags), 1)

    # Calculate scores using factors
    doc_scores = []
    for doc_id, doc_vec in doc_vectors.items():
        if doc_id in factors_dict:
            likes, rt, factor_3 = factors_dict[doc_id]
            score = np.dot(doc_vec, query_vector) + (0 if likes==0 else rt/likes) + factor_3
            doc_scores.append([score, doc_id])

    # Sort scores and extract ranked documents and scores
    doc_scores.sort(reverse=True, key=lambda x: x[0])
    ranked_docs = [doc_id for score, doc_id in doc_scores]
    ranked_scores = [score for score, doc_id in doc_scores]

    if not ranked_docs:
        print("No results found. Please try again.")
        return []

    return ranked_docs, ranked_scores

### ⚠️ WARNING!
The last execution we did lasted 6 min, so we copy the results in this cell and comment out the next one.
```
TOP 5 RELEVANT DOCS USING TF-IDF+COSINE SIMILARITY + OUR OWN METRICS for query : 'violent protest'
---------------    1 (SCORE: 12.20356916516103) ---------------
doc_id = doc_46688 - tweet: Farmers Protest awareness and solidarity ad, running through Oilers match! ✊✊✊#FarmersProtest https://t.co/77iBYcOUzM #MahapanchayatRevolution

---------------    2 (SCORE: 11.926429860478043) ---------------
doc_id = doc_29008 - tweet: Looks like the farmers protests are shutting down slowly..... funds drying up I guess...#FarmersProtest #KhalistaniTerrorists also on the back foot....only some liberal women strutting around...

---------------    3 (SCORE: 11.261375090146176) ---------------
doc_id = doc_38842 - tweet: Is protest is only solution to any problem in India? They have forget, India became India due to a protest. #FarmersProtest #FarmersAboveReligiousHate #Indialandofprotest

---------------    4 (SCORE: 11.178038010487445) ---------------
doc_id = doc_48404 - tweet: @ANI Who will be file charge sheet aginst government for the death of 200 farmers in protest #Stop_Killing_Farmers #FarmersProtest #FarmersProtestDelhi2020 #Shamelessgovernment https://t.co/O8lQmVXjj9

---------------    5 (SCORE: 11.09772418099425) ---------------
doc_id = doc_30735 - tweet: In solidarity with India’s farmers, and those who have passed away during their peaceful protest, a solidarity protest was held before Ontario’s legislature and at Nathan Phillips Square in Toronto 🙏🏽  #farmersprotest https://t.co/gwUzE2EwPC

==================================================

TOP 5 RELEVANT DOCS USING TF-IDF+COSINE SIMILARITY + OUR OWN METRICS for query : 'inflation price'
---------------    1 (SCORE: 10.273878312174215) ---------------
doc_id = doc_22465 - tweet: May your happiness increase like petrol prices,your sorrows fall like Indian rupees and joy fill your heart like corruption in India ..#FarmersProtest #FarmersMakelndia #corruptioninindia #punjabiinmelbourne

---------------    2 (SCORE: 9.927486758268458) ---------------
doc_id = doc_12652 - tweet: Nobody bought Arjun Tendulkar during the IPL auctions so, Mumbai Indians (Ambani) bought him for his base price....now someone needs to explain to Sachin Tendulkar that his son was sold at the MSP....and that’s what farmers in India are demanding.#FarmersProtest #SachinTendulkar

---------------    3 (SCORE: 9.594792555600183) ---------------
doc_id = doc_32201 - tweet: Everybody can not be Sardaar Manmohan Singh ji Who can make strong economy  Increased value of rupees controlled  Prices #farmersprotest #ReleaseDetainedFarmers  #stanswithfarmersprotest https://t.co/LEGgrAWhqM

---------------    4 (SCORE: 8.940601014309118) ---------------
doc_id = doc_11185 - tweet: So after privatization of the oil sector Indian government saying they don’t control oil prices and they can’t force private companies to bring it down!! Just imagine what will happen once they privatize the agriculture sector! #FarmersProtest #worldsupportindianfarmers @jazzyb https://t.co/9sTQ1reG4b

---------------    5 (SCORE: 8.932230698588977) ---------------
doc_id = doc_3499 - tweet: Feeling ashamed hate won over #FarmersProtest, inflation and anti-corruption etc. #GujaratMunicipalElection2021

==================================================

TOP 5 RELEVANT DOCS USING TF-IDF+COSINE SIMILARITY + OUR OWN METRICS for query : 'strike continuous'
---------------    1 (SCORE: 11.434027295716565) ---------------
doc_id = doc_42922 - tweet: There cannot be continued occupation of public place affecting rights of others. #supremecourtofindia  On this ground it is duty of executive to get the roads blocked by #FarmersProtest , evicted. Clear message to executive. https://t.co/njaTzg1ujo

---------------    2 (SCORE: 11.427114111167795) ---------------
doc_id = doc_30338 - tweet: A salient fact that continues to be misreported. Navreet Singh was shot in the face by Indian police on 1/26, he lost control of his tractor after being shot. #farmersprotest #justicefornavreetsingh  https://t.co/PaQnNBrmjc

---------------    3 (SCORE: 10.679634901660597) ---------------
doc_id = doc_34947 - tweet: #FarmersProtest #PeacefulProtestContinues  #IndiaBeingSilenced  #ModiYouCantFailFarmers  Protest will continue until we repeal the new farm laws. Peaceful protest is our right. Speak up for farmers and their rights. Stay strong 💪 and united ✊✊✊✊

---------------    4 (SCORE: 10.602800704643233) ---------------
doc_id = doc_937 - tweet: We will fight for our rights and The struggle will continue till the day when the three black laws are withdrawn and laws are made on MSP. We will win  #ModiDontSellFarmers #ModiDontSellFarmers #FarmersProtest https://t.co/jIG6KgoxRi

---------------    5 (SCORE: 10.59429613431479) ---------------
doc_id = doc_3017 - tweet: "Intellectual liberals" continue to join the #FarmersProtest #ViolentProtest #KhalistaniProtest https://t.co/mxSnV8tLj9

==================================================

TOP 5 RELEVANT DOCS USING TF-IDF+COSINE SIMILARITY + OUR OWN METRICS for query : 'human rights'
---------------    1 (SCORE: 8.951860004579643) ---------------
doc_id = doc_43297 - tweet: “We need leaders not in love with money but in love with justice.Not in love with publicity but in love with humanity.”-Martin Luther King Jr. Couldn’t be more true for India.#RepealOnlyWayAhead Make new laws lucrative for the millions of farmers.#FarmersProtest #StandWithFarmers https://t.co/RvMKF7lqOE

---------------    2 (SCORE: 8.607316581129925) ---------------
doc_id = doc_5512 - tweet: @Champaignemania @rumsomal @jot__b @Kisanektamorcha If you did your job right as an Indian citizen and stood up against police brutality, illegal arrests, people outside the nation wouldn't get involved. #humanrights issues in one country is a issue for the whole world to speak about.  #ModiIgnoringFarmersDeaths #FarmersProtest

---------------    3 (SCORE: 8.432181703582009) ---------------
doc_id = doc_45789 - tweet: @AmandaCerny #FarmersProtest;the biggest protest in human history &amp;u have firmly entrenched your place in it😍Thankyou for amplifying their voicess Now,not just farmers,it’s Indians &amp; the world against Modi gov #HumanRightsViolations have u seen @rupikaur_ latest interview?its a must watch!

---------------    4 (SCORE: 8.277652278010933) ---------------
doc_id = doc_7831 - tweet: Let us LOVE not Hate Let us UNITE not Divide  Stand up for humanity Save Farmers lives.   #FarmersProtest  #ModiIgnoringFarmersDeaths        #ModiIgnoringFarmersDeaths https://t.co/sCAnCO7ZvT

---------------    5 (SCORE: 8.265595776680312) ---------------
doc_id = doc_7853 - tweet: Let us LOVE not Hate Let us UNITE not Divide  Stand up for humanity Save Farmers lives.   #FarmersProtest  #ModiIgnoringFarmersDeaths #ModiIgnoringFarmersDeaths

==================================================

TOP 5 RELEVANT DOCS USING TF-IDF+COSINE SIMILARITY + OUR OWN METRICS for query : 'modi disaster'
---------------    1 (SCORE: 11.126954520889083) ---------------
doc_id = doc_923 - tweet: #ModiDontSellFarmers  #FarmersProtest  #ModiDontSellFarmers #ModiDontSellFarmers #ModiDontSellFarmers  Modi u need this from farmers https://t.co/j1Kgu125UI

---------------    2 (SCORE: 10.926908231270623) ---------------
doc_id = doc_9298 - tweet: Modi bhagato why are you so blind .#FarmersProtest #modiwherearemissingfarmers #StandWithFarmers https://t.co/i0fK35OY9P

---------------    3 (SCORE: 10.632269190286141) ---------------
doc_id = doc_18632 - tweet: A Nation wide awakening against Modi.  #RailRokoForFarmers  #FarmersProtest @jazzyb  #kisaanmajdoorektazindabaad https://t.co/TKiTjVTzz3

---------------    4 (SCORE: 10.597679296648076) ---------------
doc_id = doc_14598 - tweet: @RaviSinghKA @DelhiPolice Delhi police is deaf and blind when it's about modi ji or any other minister of ruling party. They are getting paid to intimidate farmers at protest sites.. #DPstopIntimidatingFarmers #FarmersProtest  #DPstopIntimidatingFarmers

---------------    5 (SCORE: 10.593534444260506) ---------------
doc_id = doc_1188 - tweet: Tough situation for all Indians today, Modi Govt is to be blamed. Such high taxes on Petrol/Diesel  #ModiDontSellFarmers #FarmersProtest #ModiDontSellFarmers https://t.co/DN5ONMZAEV

==================================================
```

In [43]:
"""
n = 5
for query in query_list:
    print("TOP",n,"RELEVANT DOCS USING TF-IDF+COSINE SIMILARITY + OUR OWN METRICS for query : '"+query+"'")
    docs = search(query, inverted_index)
    ranked_docs, ranked_scores = own_ranking(query, docs, docs_metrics, index, idf, tf)
    for ind, d_id in enumerate(ranked_docs[:n]):
        print("-"*15+" "*4+str(ind+1)+" (SCORE: " +str(ranked_scores[ind])+") " + "-"*15)
        print("doc_id = {} - tweet: {}\n".format(d_id, docs_dict[d_id].replace("\n", " ")))
    print("=" * 50 + "\n")
  """

'\nn = 5\nfor query in query_list:\n    print("TOP",n,"RELEVANT DOCS USING TF-IDF+COSINE SIMILARITY + OUR OWN METRICS for query : \'"+query+"\'")\n    docs = search(query, inverted_index)\n    ranked_docs, ranked_scores = own_ranking(query, docs, docs_metrics, index, idf, tf)\n    for ind, d_id in enumerate(ranked_docs[:n]):\n        print("-"*15+" "*4+str(ind+1)+" (SCORE: " +str(ranked_scores[ind])+") " + "-"*15)\n        print("doc_id = {} - tweet: {}\n".format(d_id, docs_dict[d_id].replace("\n", " ")))\n    print("=" * 50 + "\n")\n  '

## 1.3 BM25

In [39]:
!pip3 install rank-bm25
from rank_bm25 import BM25Okapi



In [40]:
docs_id_list = [id for id,d in docs_processed.items()]
docs_list = [docs_processed[id] for id in docs_id_list]
bm25 = BM25Okapi(docs_list) # constructing a paragraph search index
def bm25_top_n(bm25, query, n=10):
    #apply preprocessing to the query using get_tokens and tranform it from string to list of terms
    query = query.split() # cast query from string to list
    query = list(get_tokens(query)) # apply preprocessing (through get_tokens)
    query = [item for sublist in query for item in sublist] # transform list of list to list
    # score docs using a specific function of bm25
    scores = np.array(bm25.get_scores(query))
    # get indices of top N scores
    idx = np.argpartition(scores, -n)[-n:]
    # sort top N scores and return their indices
    # if all the scores are 0 return empty list
    if np.sum(scores[idx]) == 0:
        return[]
    return idx[np.argsort(-scores[idx])]

In [41]:
n = 5
for query in query_list:
  print("TOP",n,"RELEVANT DOCS USING BM25 for query : '"+query+"'")
  for ind, i in enumerate(bm25_top_n(bm25, query, n=n)):
      print("-"*15+" "*4+str(ind+1)+" (SCORE: " +str(ranked_scores[ind])+") " + "-"*15)
      doc_id = docs_id_list[i]
      print("doc_id = {} - tweet: {}\n".format(d_id, docs_dict[d_id].replace("\n", " ")))
  print("=" * 50 + "\n")

TOP 5 RELEVANT DOCS USING BM25 for query : 'violent protest'
---------------    1 (SCORE: 11.126954520889083) ---------------
doc_id = doc_1188 - tweet: Tough situation for all Indians today, Modi Govt is to be blamed. Such high taxes on Petrol/Diesel  #ModiDontSellFarmers #FarmersProtest #ModiDontSellFarmers https://t.co/DN5ONMZAEV

---------------    2 (SCORE: 10.926908231270623) ---------------
doc_id = doc_1188 - tweet: Tough situation for all Indians today, Modi Govt is to be blamed. Such high taxes on Petrol/Diesel  #ModiDontSellFarmers #FarmersProtest #ModiDontSellFarmers https://t.co/DN5ONMZAEV

---------------    3 (SCORE: 10.632269190286141) ---------------
doc_id = doc_1188 - tweet: Tough situation for all Indians today, Modi Govt is to be blamed. Such high taxes on Petrol/Diesel  #ModiDontSellFarmers #FarmersProtest #ModiDontSellFarmers https://t.co/DN5ONMZAEV

---------------    4 (SCORE: 10.597679296648076) ---------------
doc_id = doc_1188 - tweet: Tough situation for al

#TODO: Explain differences, pros-cons of each ranking method

# 2. Return the top-20 ranked using word2vec for the 5 queries in ```query_list```

In [42]:
model = Word2Vec(sentences=docs_processed.values(), vector_size=100, window=5, min_count=1, workers=4)

def compute_average_vector(tokens, model):
    valid_tokens = [model.wv[word] for word in tokens if word in model.wv]
    if not valid_tokens:
        return np.zeros(model.vector_size)
    return np.mean(valid_tokens, axis=0)

doc_vectors = {doc_id: compute_average_vector(tokens, model) for doc_id, tokens in docs_processed.items()}

query_list = ["violent protest", "inflation price", "strike continuous", "human rights", "modi disaster"]

for query in query_list:
    query_tokens = build_terms(query)
    query_vector = compute_average_vector(query_tokens, model)

    similarities = {doc_id: cosine_similarity([query_vector], [doc_vec])[0][0] for doc_id, doc_vec in doc_vectors.items()}

    top_20_docs = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:20]

    print(f"TOP 20 RELEVANT DOCS USING Word2Vec and COSINE SIMILARITY for query: '"+query+"'")
    for rank, (doc_id, score) in enumerate(top_20_docs, 1):
        print("-"*15+" "*4+str(ind+1)+" (SCORE: " +str(ranked_scores[ind])+") " + "-"*15)
        print("doc_id = {} - tweet: {}\n".format(doc_id, docs_dict[doc_id].replace("\n", " ")))
    print("=" * 50 + "\n")

TOP 20 RELEVANT DOCS USING Word2Vec and COSINE SIMILARITY for query: 'violent protest'
---------------    5 (SCORE: 10.593534444260506) ---------------
doc_id = doc_17299 - tweet: Revolutionary Protest #FarmersProtest #FarmersAreIndia #ReleaseDetainedFarmers https://t.co/JxJHip0x1Z

---------------    5 (SCORE: 10.593534444260506) ---------------
doc_id = doc_8492 - tweet: #FarmersProtest agricuture protest in europe https://t.co/4mE3LOCZNb

---------------    5 (SCORE: 10.593534444260506) ---------------
doc_id = doc_28402 - tweet: Pic of telengana protest #FarmersProtest https://t.co/Av3OcdA3wp

---------------    5 (SCORE: 10.593534444260506) ---------------
doc_id = doc_16443 - tweet: @Anumanhas11 Its Protest *  #FarmersProtest

---------------    5 (SCORE: 10.593534444260506) ---------------
doc_id = doc_18120 - tweet: We Protest   #FarmersProtest #RailRokoForFarmers https://t.co/Y6L8uobXXF

---------------    5 (SCORE: 10.593534444260506) ---------------
doc_id = doc_18586 - twee

# 3. Can you imagine a better representation?

Full explanation in the report.