##Imports and Paths

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#imports
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
import time
import re
import json
import string
from numpy import linalg as la

In [None]:
#We create the path to our tweets file and store in the line variable the information of each tweet from the json file and we also create the path to the ids csv

tweets_path = "/content/drive/MyDrive/IRWA/FinalProject/IRWA_data_2023/Rus_Ukr_war_data.json"
with open(tweets_path) as fp:
    lines = fp.readlines()

id_to_docs_path = "/content/drive/MyDrive/IRWA/FinalProject/IRWA_data_2023/Rus_Ukr_war_data_ids.csv"

#Previous Functions

##Functions from part 1


In [None]:
#first of all we will convert our data into a dataframe, we will only take the variables we are interested in (columns of the df)

import pandas as pd
df = pd.DataFrame(columns = ['Id','Tweet','Date','Hashtags','Likes','Retweets','Url'])

#for each line we will read the variable we want and store it on the df
for l in lines:
  parsed_line = json.loads(l)
  id = parsed_line["id"]
  text = parsed_line["full_text"]
  date = parsed_line["created_at"]
  hashtags = parsed_line["entities"]["hashtags"][0]["text"]
  likes = parsed_line["favorite_count"]
  retweets = parsed_line["retweet_count"]
  try:
    url = parsed_line['entities']['media'][0]['url']
  except (KeyError, IndexError):
    url = None

  row = {'Id':id,'Tweet':text,'Date':date,'Hashtags':hashtags,'Likes':likes,'Retweets':retweets,'Url':url}
  new_row = pd.DataFrame([row])
  df = pd.concat([df, new_row], ignore_index=True)

#we also store the data from the csv file into a df
df_docs = pd.read_csv(id_to_docs_path, delimiter='\t', names=['Doc','Id'])

In [None]:
result_df = pd.merge(df, df_docs, on='Id', how='inner')
result_df = result_df[['Doc','Id','Tweet','Date','Hashtags','Likes','Retweets','Url']]

In [None]:
#now we will use the structure of build terms seen on the first lab to pre-process the text of each tweet
#we will add a few lines to eliminate from the tweets the urls, the @s, the numbers, the #s and the emojis
def build_terms(line):
  stemmer = PorterStemmer()
  stop_words = set(stopwords.words("english"))
  punctuations = string.punctuation

  line =  line.lower() ## Transform in lowercase
  line = re.sub(r'http.*', '',line)
  line = re.sub(r'@\w+', '', line)
  line = re.sub(r'\d+','', line)
  line = re.sub(r'#\w+|[^\x00-\x7F]+|['+ re.escape(punctuations) + ']', ' ',line)
  line=  line.split() ## Tokenize the text to get a list of terms
  line=[x for x in line if x not in stop_words]  ##eliminate the stopwords
  line=[stemmer.stem(word) for word in line] ## perform stemming

  return line

##Functions from part 2


In [None]:
#we create this function to create the inverted index
def create_index(lines):
    index = defaultdict(list)
    dictionary = {}
    term_id = 0

    for line in lines:
        result = result_df[result_df['Tweet'] == line]
        doc_number = result.iloc[0]['Doc']

        terms = build_terms(line)

        current_page_index = {}
        for position, term in enumerate(terms):
            if term not in dictionary:
              term_id +=1
              string_id = "term_id_"+str(term_id)
              dictionary[term] = string_id
            else:
              string_id = dictionary[term]
            try:
                current_page_index[string_id][1].append(position)
            except:
                current_page_index[string_id]=[doc_number, [position]]

        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

    return index, dictionary

In [None]:
start_time = time.time()
index, dictionary = create_index(result_df['Tweet'])
print("Total time to create the index: {} seconds".format(np.round(time.time() - start_time, 2)))
print(dictionary)

Total time to create the index: 12.69 seconds
{'wrong': 'term_id_1', 'dictat': 'term_id_2', 'putin': 'term_id_3', 'fascist': 'term_id_4', 'russia': 'term_id_5', 'intend': 'term_id_6', 'conquer': 'term_id_7', 'much': 'term_id_8', 'ukrainian': 'term_id_9', 'land': 'term_id_10', 'possibl': 'term_id_11', 'arm': 'term_id_12', 'forc': 'term_id_13', 'liber': 'term_id_14', 'villag': 'term_id_15', 'urban': 'term_id_16', 'territori': 'term_id_17', 'commun': 'term_id_18', 'region': 'term_id_19', 'alert': 'term_id_20', 'poland': 'term_id_21', 'prep': 'term_id_22', 'anti': 'term_id_23', 'radiat': 'term_id_24', 'tablet': 'term_id_25', 'nuclear': 'term_id_26', 'threat': 'term_id_27', 'still': 'term_id_28', 'wait': 'term_id_29', 'googl': 'term_id_30', 'map': 'term_id_31', 'updat': 'term_id_32', 'new': 'term_id_33', 'annex': 'term_id_34', 'take': 'term_id_35', 'bit': 'term_id_36', 'longer': 'term_id_37', 'thought': 'term_id_38', 'probabl': 'term_id_39', 'right': 'term_id_40', 'say': 'term_id_41', 'anyw

In [None]:
#we create this function to create the inverted index and compute the tf, df and idf values
def create_index_tfidf(lines, num_docs):
    index = defaultdict(list)
    dictionary = {}
    term_id = 0
    tf = defaultdict(list)  # term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  # document frequencies of terms in the corpus
    idf = defaultdict(float)

    for line in lines:

        result = result_df[result_df['Tweet'] == line]
        doc_number = result.iloc[0]['Doc']

        terms = build_terms(line)

        current_page_index = {}
        for position, term in enumerate(terms):
            if term not in dictionary:
              term_id +=1
              string_id = "term_id_"+str(term_id)
              dictionary[term] = string_id
            else:
              string_id = dictionary[term]
            try:
                current_page_index[string_id][1].append(position)
            except:
                current_page_index[string_id]=[doc_number, [position]]

        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

        #normalize term frequencies
        # norm is the same for all terms of a document.
        norm = 0
        for term, posting in current_page_index.items():
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        # calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, posting in current_page_index.items():

            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(len(posting[1])/norm,4))

            #increment the document frequency of current term (number of documents containing the current term)
            df[term] += 1 # increment DF for current term

        # Compute IDF
        for term in df:
            idf[term] = np.round(np.log(float(num_docs/df[term])), 4)

    return index, tf, df, idf, dictionary

In [None]:
start_time = time.time()
num_documents = len(result_df['Tweet'])
index_tf, tf, df, idf, dictionary_tf = create_index_tfidf(result_df['Tweet'], num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 183.34 seconds


#1. Two scores for ranking

##a. TF-IDF + cosine similarity

In [None]:
def rank_documents(terms, docs, index, idf, tf, dictionary):

    # I'm interested only on the element of the docVector corresponding to the query terms
    # The remaining elements would became 0 when multiplied to the query_vector
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    processed_docs = set()
    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query.

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        try:
          if dictionary[term] not in index:
              continue
        except:
          continue
        term_id = dictionary[term]
        processed_docs = set()

        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex]= query_terms_count[term] / query_norm*idf[term_id]
        # Generate doc_vectors for matching docs

        for doc_index, (doc, postings) in enumerate(index[term_id]):
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term_id][doc_index] * idf[term_id]

    # Calculate the score of each doc by doing the product of TF-IDF * cosine similarity
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_scores = [x[0] for x in doc_scores]

    return result_docs, result_scores

In [None]:
def search_tf_idf(query1, index, idf, tf, dictionary):
    """
    The output is the list of documents that contain all of the query terms.
    So, we will get the list of documents for each query term, and take the intersection of them.
    """
    query = build_terms(query1)
    first_term = query[0]
    id = dictionary[first_term]
    docs = set(posting[0] for posting in index[id])

    for term in query[1:]:
      id = dictionary[first_term]
      term_docs = set(posting[0] for posting in index[id])
      docs = docs.intersection(term_docs)

    docs = list(docs)
    ranked_docs, ranked_scores = rank_documents(query, docs, index, idf, tf, dictionary)
    return ranked_docs, ranked_scores

In [None]:
queries = ['Air bombs', 'UN Congress', "Putin's next step", 'Dead Soldiers', 'World War']

for query in queries:
  docs, ranked_scores = search_tf_idf(query, index_tf, idf, tf, dictionary_tf)
  top = 20

  print("\n======================\nSample of {} results out of {} for the searched query: '{}'".format(top, len(docs), query))
  for d_id in docs[:top]:
    print("Doc number: "+d_id)


Sample of 20 results out of 33 for the searched query: 'Air bombs'
Doc number: doc_3418
Doc number: doc_3688
Doc number: doc_1951
Doc number: doc_3300
Doc number: doc_3141
Doc number: doc_196
Doc number: doc_2019
Doc number: doc_1973
Doc number: doc_36
Doc number: doc_2746
Doc number: doc_1590
Doc number: doc_1423
Doc number: doc_3329
Doc number: doc_2896
Doc number: doc_2604
Doc number: doc_922
Doc number: doc_3537
Doc number: doc_3505
Doc number: doc_3196
Doc number: doc_62

Sample of 20 results out of 21 for the searched query: 'UN Congress'
Doc number: doc_2086
Doc number: doc_3769
Doc number: doc_3297
Doc number: doc_3647
Doc number: doc_1618
Doc number: doc_3262
Doc number: doc_2248
Doc number: doc_2152
Doc number: doc_299
Doc number: doc_563
Doc number: doc_489
Doc number: doc_1656
Doc number: doc_1494
Doc number: doc_3673
Doc number: doc_420
Doc number: doc_1029
Doc number: doc_1343
Doc number: doc_2079
Doc number: doc_3996
Doc number: doc_560

Sample of 20 results out of 477 

##b. Your score + cosine similarity

**OUR RANKING**

In [None]:
#we create this function to create the inverted index and compute our score
def create_index_score(lines, num_docs):
  index = defaultdict(list)
  dictionary = {}
  term_id = 0
  relevance = defaultdict(list)  # relevance of terms in documents (documents in the same order as in the main index)
  df = defaultdict(int)  # document frequencies of terms in the corpus
  idf = defaultdict(float)

  for line in lines:
    result = result_df[result_df['Tweet'] == line]
    doc_number = result.iloc[0]['Doc']
    terms = build_terms(line)

    current_page_index = {}
    for position, term in enumerate(terms):
      if term not in dictionary:
        term_id +=1
        string_id = "term_id_"+str(term_id)
        dictionary[term] = string_id
      else:
        string_id = dictionary[term]
      try:
        current_page_index[string_id][1].append(position)
      except:
        current_page_index[string_id]=[doc_number, [position]]

    for term_page, posting_page in current_page_index.items():
      index[term_page].append(posting_page)

    for term, posting in current_page_index.items():
      relevance[term].append(result.iloc[0]['Likes']+result.iloc[0]['Retweets'])
      df[term] += 1

    for term in df:
        idf[term] = np.round(np.log(float(num_docs/df[term])), 4)
  return index, relevance, idf, dictionary

In [None]:
start_time = time.time()
num_documents = len(result_df['Tweet'])
index_score, relevance, idf2, dictionary_score = create_index_score(result_df['Tweet'], num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 186.75 seconds


In [None]:
def rank_documents_score(query, docs, index, idf, relevance, dictionary):
    doc_vectors = defaultdict(lambda: [0] * len(query))
    query_vector = [0] * len(query)

    for termIndex, term in enumerate(query):  #termIndex is the index of the term in the query
        term_id = dictionary[term]

        popularity = 0
        for doc_index, (doc, postings) in enumerate(index[term_id]):
            if doc in docs:
              doc_vectors[doc][termIndex] = relevance[term_id][doc_index]
              popularity += relevance[term_id][doc_index]

        for doc_index, (doc, postings) in enumerate(index[term_id]):
            if doc in docs:
              doc_vectors[doc][termIndex] *= idf[term_id]

        query_vector[termIndex]= popularity * idf[term_id]

    # Calculate the score of each doc by doing the product of OurScore * cosine similarity
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_scores = [x[0] for x in doc_scores]

    return result_docs, result_scores

In [None]:
def search_score(query1, index, idf, relevance, dictionary):
    query = build_terms(query1)
    first_term = query[0]
    id = dictionary[first_term]
    docs = set(posting[0] for posting in index[id])

    for term in query[1:]:
      id = dictionary[first_term]
      term_docs = set(posting[0] for posting in index[id])
      docs = docs.intersection(term_docs)

    docs = list(docs)
    ranked_docs, ranked_scores = rank_documents_score(query, docs, index, idf, relevance, dictionary)
    return ranked_docs, ranked_scores

In [None]:
queries = ['Air bombs', 'UN Congress', "Putin's next step", 'Dead Soldiers', 'World War']

for query in queries:
  docs, ranked_scores = search_score(query, index_score, idf2, relevance, dictionary_score)
  top = 20
  print("\n======================\nSample of {} results out of {} for the searched query: '{}'".format(top, len(docs), query))
  for d_id in docs[:top]:
    print("Doc number: "+d_id)


Sample of 20 results out of 33 for the searched query: 'Air bombs'
Doc number: doc_2701
Doc number: doc_2203
Doc number: doc_1590
Doc number: doc_3537
Doc number: doc_3688
Doc number: doc_1951
Doc number: doc_3611
Doc number: doc_3329
Doc number: doc_36
Doc number: doc_1973
Doc number: doc_3418
Doc number: doc_2019
Doc number: doc_2324
Doc number: doc_196
Doc number: doc_3856
Doc number: doc_922
Doc number: doc_741
Doc number: doc_3196
Doc number: doc_1287
Doc number: doc_3472

Sample of 20 results out of 21 for the searched query: 'UN Congress'
Doc number: doc_1539
Doc number: doc_1343
Doc number: doc_1656
Doc number: doc_3996
Doc number: doc_3262
Doc number: doc_299
Doc number: doc_3647
Doc number: doc_3297
Doc number: doc_560
Doc number: doc_420
Doc number: doc_2248
Doc number: doc_1029
Doc number: doc_563
Doc number: doc_489
Doc number: doc_3769
Doc number: doc_3673
Doc number: doc_2152
Doc number: doc_2086
Doc number: doc_2079
Doc number: doc_1618

Sample of 20 results out of 477

#2. Top-20 list using word2vec + cosine similarity

In [None]:
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
#we take each tweet terms and apply Word2Vec
df_terms = pd.DataFrame(columns=['Doc','Tweet', 'Terms', 'Number_Terms', 'Tweet_Vector'])

for element in result_df['Tweet']:
  term_list = build_terms(element)

  result = result_df[result_df['Tweet'] == element]
  doc_number = result.iloc[0]['Doc']

  row = {'Doc':doc_number,'Tweet':element,'Terms':term_list, 'Number_Terms': len(term_list), 'Tweet_Vector': 0}
  new_row = pd.DataFrame([row])
  df_terms = pd.concat([df_terms, new_row], ignore_index=True)

In [None]:
#we transform the terms of each tweet into vectors
vectors = Word2Vec(df_terms['Terms'], workers=4, min_count=1)
for i, terms in enumerate(df_terms['Terms']):
    if len(terms) == 0:
      continue
    average_tweet = np.zeros_like(vectors.wv[terms[0]])  # Initialize with zeros

    for term in terms:
      vector = vectors.wv[term]
      average_tweet = average_tweet + (vector / len(terms))

    # Update the "Tweet_Vector" column
    df_terms.at[i, "Tweet_Vector"] = (average_tweet)

# Display the DataFrame
display(df_terms)

Unnamed: 0,Doc,Tweet,Terms,Number_Terms,Tweet_Vector
0,doc_1,@MelSimmonsFCDO Wrong. Dictator Putin's Fascis...,"[wrong, dictat, putin, fascist, russia, intend...",11,"[-0.21729761, 0.2562514, 0.09154901, 0.0173452..."
1,doc_2,🇺🇦❤️ The Armed Forces liberated the village of...,"[arm, forc, liber, villag, urban, territori, c...",8,"[-0.21675757, 0.25248462, 0.09563956, 0.020938..."
2,doc_3,ALERT 🚨Poland preps anti-radiation tablets ove...,"[alert, poland, prep, anti, radiat, tablet, nu...",8,"[-0.11908125, 0.13564947, 0.05198963, 0.007521..."
3,doc_4,I’m still waiting for my google map 🗺️ to upda...,"[still, wait, googl, map, updat, russia, new, ...",13,"[-0.20656309, 0.23867625, 0.08571902, 0.015250..."
4,doc_5,@EmmanuelMacron probably you're right or you h...,"[probabl, right, say, anyway, game, citizen, t...",9,"[-0.12440035, 0.1486448, 0.054969758, 0.011158..."
...,...,...,...,...,...
3995,doc_3996,🎥 Ukraine’s president has warned that Russia’s...,"[ukrain, presid, warn, russia, sham, referendu...",21,"[-0.23699413, 0.2746104, 0.09951452, 0.0193930..."
3996,doc_3997,Germany amusingly shares days old intelligense...,"[germani, amusingli, share, day, old, intellig...",15,"[-0.17325865, 0.20852198, 0.07515176, 0.013114..."
3997,doc_3998,The US Embassy in Moscow is urging Americans t...,"[us, embassi, moscow, urg, american, leav, rus...",14,"[-0.18250926, 0.21626262, 0.07469832, 0.010537..."
3998,doc_3999,After the staged fake referendum as of Septemb...,"[stage, fake, referendum, septemb, russian, fo...",24,"[-0.28356323, 0.3324743, 0.12452203, 0.0253430..."


In [None]:
def rank_tweet2vec(query, docs, df, vectors):
    doc_vectors = defaultdict(lambda: [0]) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary

    query_vector = np.zeros_like(vectors.wv[query[0]])

    for term in query:
      vector = vectors.wv[term]
      query_vector = query_vector + (vector / len(terms))

    for doc in docs:
      result = df[df['Doc'] == doc]
      doc2vec = result.iloc[0]['Tweet_Vector']

      doc_vectors[doc] = doc2vec

    # Calculate the score of each doc by doing the product of TF-IDF * cosine similarity
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    result_scores = [x[0] for x in doc_scores]

    return result_docs, result_scores

In [None]:
def search_tweet2vec(query1, df, vectors):

    query = build_terms(query1)
    first_term = query[0]
    id = dictionary[first_term]
    docs = set(posting[0] for posting in index[id])
    for term in query[1:]:
      id = dictionary[first_term]
      term_docs = set(posting[0] for posting in index[id])
      docs = docs.intersection(term_docs)

    docs = list(docs)
    ranked_docs, ranked_scores = rank_tweet2vec(query, docs, df, vectors)
    return ranked_docs, ranked_scores

In [None]:
queries = ['Air bombs', 'UN Congress', "Putin's next step", 'Dead Soldiers', 'World War']
for query in queries:

  docs, ranked_scores = search_tweet2vec(query, df_terms, vectors)
  top = 20
  print("\n======================\nSample of {} results out of {} for the searched query: '{}'".format(top, len(docs), query))
  for d_id in docs[:top]:
    print("Doc number: "+d_id)


Sample of 20 results out of 33 for the searched query: 'Air bombs'
Doc number: doc_3688
Doc number: doc_3141
Doc number: doc_1590
Doc number: doc_1973
Doc number: doc_2324
Doc number: doc_2604
Doc number: doc_3300
Doc number: doc_2203
Doc number: doc_3537
Doc number: doc_3505
Doc number: doc_922
Doc number: doc_3472
Doc number: doc_1278
Doc number: doc_1832
Doc number: doc_3611
Doc number: doc_1287
Doc number: doc_2379
Doc number: doc_2896
Doc number: doc_3196
Doc number: doc_1423

Sample of 20 results out of 21 for the searched query: 'UN Congress'
Doc number: doc_2248
Doc number: doc_2152
Doc number: doc_2079
Doc number: doc_1343
Doc number: doc_563
Doc number: doc_560
Doc number: doc_3996
Doc number: doc_3769
Doc number: doc_3297
Doc number: doc_1656
Doc number: doc_1539
Doc number: doc_2086
Doc number: doc_420
Doc number: doc_1618
Doc number: doc_3673
Doc number: doc_299
Doc number: doc_3262
Doc number: doc_1494
Doc number: doc_489
Doc number: doc_1029

Sample of 20 results out of