### Part 3: Ranking

In [112]:
import array
import collections
import csv
import json
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import numpy as np
from collections import defaultdict
import time
from array import array
import math
from numpy import linalg as la
import spacy

In [113]:
# Function to pre-process a tweet
def build_terms(line):

    filtered_line = line.lower() ## Transform in lowercase
    filtered_line = filtered_line.split() ## Tokenize the text to get a list of terms
    filtered_line = [re.sub(r'[^\w\s]', '', word) for word in filtered_line] # Removing non-words and non-whitespaces
    
    # Removing stop words
    stop_words = set(stopwords.words("english"))
    filtered_line = [word for word in filtered_line if word not in stop_words]  ## Eliminate the stopwords 

    # Stemming
    stemmer = PorterStemmer()
    filtered_line = [stemmer.stem(word) for word in filtered_line] ## Perform stemming

    return filtered_line

In [114]:
# Function to remove emoticons
def remove_emoticons(text):
    # Define a pattern to find all the emoticons
    emoticon_pattern = re.compile("[" u"\U0001F600-\U0001F64F" u"\U0001F300-\U0001F5FF" 
                                  u"\U0001F680-\U0001F6FF" u"\U0001F1E0-\U0001F1FF" 
                                  u"\U00002500-\U00002BEF" u"\U00002702-\U000027B0" 
                                  u"\U000024C2-\U0001F251" u"\U0001f926-\U0001f937" 
                                  u"\U00010000-\U0010ffff" u"\u2640-\u2642" 
                                  u"\u2600-\u2B55" u"\u200d" 
                                  u"\u23cf" u"\u23e9" 
                                  u"\u231a" u"\ufe0f" 
                                  u"\u3030" "]+", re.UNICODE)

    # Replace emoticons with an empty string
    text_without_emoticons = emoticon_pattern.sub('', text)

    return str(text_without_emoticons)

In [115]:
# Function to renove links
def remove_links(text):
    # Define a pattern to match URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    # Replace URLs with an empty string
    text_without_links = url_pattern.sub('', text)

    return str(text_without_links)  

# PRE-PROCESS OF THE DOCUMENT

In [116]:
# Load the JSON data

with open('IRWA_data_2023/Rus_Ukr_war_data.json', 'r') as fp:
    lines = fp.readlines()
lines = [l.strip().replace(' +', ' ') for l in lines]

tweet_information = {}
for line in lines:

        tweet_data = json.loads(line)

        # Clean the text
        tweet_text = tweet_data['full_text']
        tweet_text = remove_emoticons(tweet_text)
        tweet_text = remove_links(tweet_text)

        # Extract relevant information
        tweet_id = tweet_data['id_str']
        tweet_date = tweet_data['created_at']
        hashtags = [hashtag['text'] for hashtag in tweet_data['entities']['hashtags']]
        likes = tweet_data['favorite_count']
        retweets = tweet_data['retweet_count'] 
        twitter_username = tweet_data['user']['screen_name']
        tweet_url = f"https://twitter.com/{twitter_username}/status/{tweet_id}"

        processed_tweet = build_terms(tweet_text)

        # Store all the tweet information
        tweet_information[tweet_id] = {
            'Tweet ID': tweet_id,
            'Tweet Text': tweet_text,
            'Processed Tweet': processed_tweet,
            'Tweet Date': tweet_date,
            'Hashtags': hashtags,
            'Likes': likes,
            'Retweets': retweets,
            'Tweet_url': tweet_url
        }

# Map tweet IDs with document IDs for evaluation stage
tweet_document_ids_map = {}
tweet_document_ids_map1 = {}

with open('IRWA_data_2023/Rus_Ukr_war_data_ids.csv', 'r') as map_file:
    doc = csv.reader(map_file, delimiter='\t')
    for row in doc:
        doc_id, tweet_id = row
        tweet_document_ids_map[doc_id] = tweet_id
        tweet_document_ids_map1[tweet_id] = doc_id

# INDEXING

In [117]:
# Function to index creation
def create_index_tfidf(lines, num_documents):

    index = defaultdict(list)
    tf = defaultdict(list)  # term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  # document frequencies of terms in the corpus
    url_index = defaultdict(str)
    idf = defaultdict(float)

    for line in lines:
        tweet_data = json.loads(line)
        tweet_id = tweet_data['id_str']
        
        doc_id = tweet_document_ids_map1[tweet_id]
        terms = tweet_information[tweet_id]['Processed Tweet']
        url_index[doc_id] = tweet_information[tweet_id]['Tweet_url']

        current_page_index = {}

        for position, term in enumerate(terms):
            try:
                current_page_index[term][1].append(position)
            except:
                current_page_index[term] = [doc_id, array('I',[position])] #'I' indicates unsigned int (int in Python)

        norm = 0
        for term, posting in current_page_index.items():
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        for term, posting in current_page_index.items():
            tf[term].append(np.round(len(posting[1])/norm,4))
            df[term] += 1

        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

        for term in df:
            idf[term] = np.round(np.log(float(num_documents/df[term])), 4)

    return index, tf, df, idf, url_index

In [118]:
# Function to rank documents
def rank_documents_TF_IDF(terms, docs, index, idf, tf):
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query.
    
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        query_vector[termIndex] = query_terms_count[term]/query_norm * idf[term]

        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]

    # Cosine similarity
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]

    return result_docs, doc_scores

In [119]:
# Function to rank documents with custom score and cosine similarity
def rank_documents_with_custom_score(terms, docs, index):
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue

        query_vector[termIndex] = query_terms_count[term] / query_norm  # Using TF for the query

        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in docs:
                # Compute the TF/len(doc) for the term in the document
                tf_value = len([postings[0]])
                t_id = tweet_document_ids_map[doc]
                my_score = 0.25 * tweet_information[t_id]['Likes'] + 0.75 * tweet_information[t_id]['Retweets']
                doc_vectors[doc][termIndex] = (tf_value / len(tweet_information[t_id]['Processed Tweet'])) + my_score

    # Cosine similarity
    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]

    return result_docs, doc_scores

In [120]:
# Load spaCy model with word embeddings
nlp = spacy.load("en_core_web_md")  # You can replace "en_core_web_md" with other available models

# Function to calculate tweet representation using spaCy's word embeddings
def calculate_tweet_representation(tweet):
    tweet_vector = np.zeros(nlp.vocab.vectors.shape[1])
    word_count = 0

    for word in tweet:
        if nlp.vocab.has_vector(word):
            tweet_vector += nlp.vocab[word].vector
            word_count += 1

    if word_count > 0:
        tweet_vector /= word_count

    return tweet_vector

# Function to rank documents using spaCy's word embeddings + cosine similarity
def rank_documents_tweet2vec(terms, docs, index):
    doc_vectors = defaultdict(lambda: np.zeros(nlp.vocab.vectors.shape[1]))
    query_vector = np.zeros(nlp.vocab.vectors.shape[1])

    # Calculate the query vector using spaCy's word embeddings
    for term in terms:
        if nlp.vocab.has_vector(term):
            query_vector += nlp.vocab[term].vector

    # Normalize the query vector
    query_norm = np.linalg.norm(query_vector)
    if query_norm > 0:
        query_vector /= query_norm

    for doc_index, (doc, postings) in enumerate(index[term]):
        if doc in docs:
            # Calculate the tweet vector for the document using spaCy's word embeddings
            t_id = tweet_document_ids_map[doc]
            tweet_vector = calculate_tweet_representation(tweet_information[t_id]['Processed Tweet'])

            # Normalize the tweet vector
            tweet_norm = np.linalg.norm(tweet_vector)
            if tweet_norm > 0:
                tweet_vector /= tweet_norm

            # Cosine similarity
            cosine_similarity = np.dot(tweet_vector, query_vector)

            doc_vectors[doc] = cosine_similarity

    # Sort documents based on cosine similarity
    doc_scores = sorted(doc_vectors.items(), key=lambda x: x[1], reverse=True)
    result_docs = [doc[0] for doc in doc_scores]

    return result_docs, doc_scores

In [121]:
# Function to search docs for specific queries
def search(query, index):
    query = build_terms(query)
    docs = []
    try:
        term_docs=[posting[0] for posting in index[query[0]]]
        for d_id in term_docs:
            t_id = tweet_document_ids_map[d_id]
            intersection = set(tweet_information[t_id]['Processed Tweet']).intersection(set(query))
            if set(query) == intersection:
                docs.append(d_id)
    except:
        pass
    return query, docs

In [122]:
start_time = time.time()
num_documents = len(lines)
index, tf, df, idf, url_index = create_index_tfidf(lines, num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 180.07 seconds


In [124]:
q = True
while q == True:
    print("\nInsert your query or END to stop (i.e.: presidents visiting Kyiv):\n")
    query = input()
    print(query)
    if query == 'END':
        break
    query, docs = search(query, index)
    if len(docs) == 0:
        print("No results found, try again")
    else:
        ranked_docs, cosine_similarity = rank_documents_TF_IDF(query, docs, index, idf, tf)
        ranked_docs_custom, cosine_similarity2 = rank_documents_with_custom_score(query, docs, index)
        ranked_docs_tweet2vec, cosine_similarity3 = rank_documents_tweet2vec(query, docs, index)
        top = 20
        print("\n======================\nTop {} results out of {} for the searched query using TF-IDF:\n".format(top, len(ranked_docs)))
        ctr = 1
        for d_id in ranked_docs[:top]:
            t_id = tweet_document_ids_map[d_id]
            print("{}.\033[1mDOC_ID\033[0m = {} - \033[1mTWEET_ID\033[0m = {} - \033[1mLIKES\033[0m = {} - \033[1mRETWEETS\033[0m = {} ".format(ctr, d_id, t_id, tweet_information[t_id]['Likes'], tweet_information[t_id]['Retweets']))
            ctr += 1
            print("{}.\033[1mDOC_ID\033[0m = {} - \033[1mTWEET_ID\033[0m = {} - \033[1mTWEET_DATE\033[0m = {} - \033[1mHASHTAGS\033[0m = {} - \033[1mLIKES\033[0m = {} - \033[1mRETWEETS\033[0m = {} - \033[1mTWEET_URL\033[0m = {}".format(ctr, d_id, t_id, tweet_information[t_id]['Tweet Date'], tweet_information[t_id]['Hashtags'], tweet_information[t_id]['Likes'], tweet_information[t_id]['Retweets'], url_index[d_id]))
        print("\n======================\nTop {} results out of {} for the searched query using custom score:\n".format(top, len(ranked_docs)))
        ctr = 1
        for d_id in ranked_docs_custom[:top]:
            t_id = tweet_document_ids_map[d_id]
            print("{}.\033[1mDOC_ID\033[0m = {} - \033[1mTWEET_ID\033[0m = {} - \033[1mLIKES\033[0m = {} - \033[1mRETWEETS\033[0m = {} ".format(ctr, d_id, t_id, tweet_information[t_id]['Likes'], tweet_information[t_id]['Retweets']))
            ctr += 1
        print("\n======================\nTop {} results out of {} for the searched query using Tweet2Vec:\n".format(top, len(ranked_docs)))
        ctr = 1
        for d_id in ranked_docs_tweet2vec[:top]:
            t_id = tweet_document_ids_map[d_id]
            print("{}.\033[1mDOC_ID\033[0m = {} - \033[1mTWEET_ID\033[0m = {} - \033[1mLIKES\033[0m = {} - \033[1mRETWEETS\033[0m = {} ".format(ctr, d_id, t_id, tweet_information[t_id]['Likes'], tweet_information[t_id]['Retweets']))
            ctr += 1


Insert your query or END to stop (i.e.: presidents visiting Kyiv):



president in Kyiv

Top 20 results out of 5 for the searched query using TF-IDF:

1.[1mDOC_ID[0m = doc_654 - [1mTWEET_ID[0m = 1575827125159940096 - [1mLIKES[0m = 2 - [1mRETWEETS[0m = 2 
2.[1mDOC_ID[0m = doc_656 - [1mTWEET_ID[0m = 1575827101030064128 - [1mLIKES[0m = 22 - [1mRETWEETS[0m = 2 
3.[1mDOC_ID[0m = doc_87 - [1mTWEET_ID[0m = 1575908651658641408 - [1mLIKES[0m = 7 - [1mRETWEETS[0m = 3 
4.[1mDOC_ID[0m = doc_131 - [1mTWEET_ID[0m = 1575905170952982529 - [1mLIKES[0m = 12 - [1mRETWEETS[0m = 0 
5.[1mDOC_ID[0m = doc_2099 - [1mTWEET_ID[0m = 1575566317415178240 - [1mLIKES[0m = 0 - [1mRETWEETS[0m = 0 

Top 20 results out of 5 for the searched query using custom score:

1.[1mDOC_ID[0m = doc_656 - [1mTWEET_ID[0m = 1575827101030064128 - [1mLIKES[0m = 22 - [1mRETWEETS[0m = 2 
2.[1mDOC_ID[0m = doc_87 - [1mTWEET_ID[0m = 1575908651658641408 - [1mLIKES[0m = 7 - [1mRETWEETS[0m = 3 
3.[1mDOC_ID[0m = doc_131 - [1mTWEET_ID[0m = 1575905170952982