# Imports

In [37]:
# Import the libraries used in this notebook (tweepy, pandas, numpy, nltk, ...)
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import seaborn as sns
import csv
from collections import Counter
#from config import *
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import API
from tweepy import Cursor
import datetime

import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag

import time
from collections import defaultdict
import collections
from numpy import linalg as la
import re
import math
import ast

# Loading datasets

In [38]:
# Load the tweets table. The `text` column is stored as the string form of a Python
# literal (presumably the token list of each tweet), so every cell is parsed back.
original = pd.read_csv('originals.csv')
original['text'] = original.text.apply(ast.literal_eval)

# Loading inverted index structures

In [39]:
def read_dict_to_file(filename):
    """
    Read a JSON file and return its parsed content.

    Argument:
    filename -- path of the JSON file to read

    Returns:
    The Python object decoded from the file (a dict for the files used in this notebook)
    """
    with open(filename, 'r') as handle:
        return json.load(handle)

# Read index, tf and idf from files (paths are relative to the working directory).
# As used by the ranking code in this notebook:
#   index[term] -- list of postings; the first element of each posting is the document id
#   tf[term][i] -- term-frequency weight belonging to the i-th posting of index[term]
#   idf[term]   -- inverse-document-frequency weight of the term
index = read_dict_to_file('index.json')
tf = read_dict_to_file('tf.json')
idf =read_dict_to_file('idf.json')

In [40]:
# Characters stripped from tweets: ASCII punctuation except '#' (kept so hashtags survive)
# plus the single-character ellipsis.
punctuation = string.punctuation.replace('#','')+'…'

def preprocess_tweet(tweets_series):
    """
    Turn raw tweet texts into lists of stemmed tokens.

    Steps, in order: lowercase; remove URLs, @mentions and a leading "rt" marker;
    remove digits; remove punctuation (except '#') and curly quotes; drop every
    non-ASCII character (emojis included); split on whitespace; remove English
    stop words; apply Porter stemming.

    Argument:
    tweets_series -- pandas Series of strings

    Returns:
    pandas Series with the same index whose values are lists of tokens
    """
    # URLs: the colon after the scheme is optional because the stored tweets contain
    # links written as "https//t.co/..." while freshly typed text uses "https://".
    # "www\." needs a literal dot (otherwise words such as "wwwow" are deleted) and
    # "^rt\b" only matches a standalone leading "rt" (otherwise "rtx" becomes "x").
    noise = re.compile(r'https?:?//\S+|www\.\S+|@\w*|^rt\b')
    digits = re.compile(r"[0-9]")

    # Curly quotes are removed (not replaced) exactly like the ASCII ones in `punctuation`.
    # NOTE(review): apostrophes are stripped before stop-word removal, so contractions
    # such as "don't" become "dont" and are not filtered; kept as-is so that query
    # tokens stay consistent with the tokens of the pre-built index.
    removable = set(punctuation) | {"’", '“', '”'}

    # Set membership is O(1); the stop-word list is scanned for every token otherwise.
    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    def to_tokens(text):
        text = text.lower()
        text = noise.sub('', text)
        text = digits.sub('', text)
        text = "".join(char for char in text if char not in removable)
        # Removing emojis (and any other non-ASCII character)
        text = text.encode('ascii', 'ignore').decode('ascii')
        return [porter.stem(word) for word in text.split() if word not in stop_words]

    return tweets_series.apply(to_tokens)

# Search Engine Build 

In [41]:
def rankDocuments(query_terms, docs, index, idf, tf, method='tf-idf'):
    """
    Rank the documents matching a query with tf-idf or with the 'popular' (own) score.

    Argument:
    query_terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure: index[term] is a list of (doc, postings) entries
    idf -- inverted document frequencies, one weight per term
    tf -- term frequencies: tf[term][i] belongs to the i-th entry of index[term]
    method -- 'popular' blends the text score (weight 0.4) with log-scaled likes and
              retweets read from the notebook-level `original` DataFrame (weight 0.3
              each); any other value ranks by the text score alone

    Returns:
    List of document ids ordered from best to worst score. When no document can be
    ranked, the user is prompted for a new query and the result of `search_tf_idf`
    for that query is returned instead.
    """
    n_terms = len(query_terms)
    # Only the components of the document vectors that correspond to query terms matter:
    # every other component would be multiplied by 0 in the dot product with the query.
    docVectors = defaultdict(lambda: [0] * n_terms)
    queryVector = [0] * n_terms

    # Frequency of each term in the query, e.g. ["hello","hello","world"] -> {'hello': 2, 'world': 1}
    query_terms_count = collections.Counter(query_terms)
    # Norm used to normalise the query term frequencies
    query_norm = la.norm(list(query_terms_count.values()))

    # Set lookup instead of scanning the `docs` list for every posting
    candidate_docs = set(docs)

    for termIndex, term in enumerate(query_terms):  # termIndex is the position of the term in the query
        if term not in index:
            continue

        # Normalised tf of the term in the query times its idf
        queryVector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # Fill in the document vectors of the matching docs
        for docIndex, (doc, postings) in enumerate(index[term]):
            if doc in candidate_docs:
                docVectors[doc][termIndex] = tf[term][docIndex]

    # Score of each doc: dot product between its vector and the query vector,
    # optionally blended with popularity.
    if method == 'popular':
        # Normalisers are loop invariant: compute them once, not once per document.
        likes_denom = np.log(original['likes'].max() + 1)
        retweets_denom = np.log(original['retweets'].max() + 1)

        def weighted_log_share(weight, column, doc, denom):
            # When the column maximum is 0 the normaliser is log(1) == 0; the popularity
            # term then contributes nothing instead of turning every score into NaN.
            if not denom > 0:
                return 0.0
            return weight * np.log(original[column].loc[doc] + 1) / denom

        docScores = [
            [0.4 * np.dot(curDocVec, queryVector)
             + weighted_log_share(0.3, 'likes', doc, likes_denom)
             + weighted_log_share(0.3, 'retweets', doc, retweets_denom),
             doc]
            for doc, curDocVec in docVectors.items()
        ]
    else:
        docScores = [[np.dot(curDocVec, queryVector), doc] for doc, curDocVec in docVectors.items()]

    docScores.sort(reverse=True)
    resultDocs = [x[1] for x in docScores]

    if len(resultDocs) == 0:
        # Interactive fallback: ask for another query and search again with the same method
        print("No results found, try again\n")
        query = input("Insert your query:")
        resultDocs = search_tf_idf(query, index, method)
    return resultDocs

In [42]:
def search_tf_idf(query, index, method):
    '''
    Search the index for a free-text query and return the ranked matching documents.

    The candidate documents are those that contain all of the query terms found in the
    index: we get the list of documents for each query term and take the intersection
    of them. Query terms that are not in the index are ignored.

    Argument:
    query -- raw query string typed by the user
    index -- inverted index: index[term] is a list of postings whose first element is the doc id
    method -- ranking method forwarded to rankDocuments ('popular' or 'tf-idf')

    Returns:
    List of document ids as ordered by rankDocuments (which reads the notebook-level
    `idf` and `tf` structures passed from here).
    '''
    query = preprocess_tweet(pd.Series(query)).values[0]

    # None means "no indexed term seen yet". It must be told apart from an empty
    # intersection: testing len(docs) == 0 instead would let the next term refill the
    # set and return documents that do not contain all of the query terms.
    docs = None
    for term in query:
        if term not in index:
            # term is not in index
            continue
        # ids of the docs that contain "term"
        termDocs = {posting[0] for posting in index[term]}
        docs = termDocs if docs is None else docs.intersection(termDocs)

    docs = list(docs) if docs else []
    ranked_docs = rankDocuments(query, docs, index, idf, tf, method)

    return ranked_docs

# Try some queries with method 'popular' or 'tf-idf'

In [43]:
# Interactive demo: read a query from the user, rank the matching tweets and print the best ones.
query = input("Insert your query:")
# method: 'popular' blends the text score with likes/retweets; any other value
# (e.g. 'tf-idf') ranks by the text score alone
ranked_docs = search_tf_idf(query, index, method='popular')    
# Number of results to display
top = 10
# ranked_docs holds index labels of `original`; keep only the columns that are printed
results_df = original.loc[ranked_docs[:top]][['original_text','url','user','date','hashtags','likes','retweets']]

print('=======================================\n')
# `ind` is the row label of the tweet in `original`, printed as the tweet id
for ind, row in results_df.iterrows():
    print('Tweet ID:',ind,'  URL:',row.url)
    print('Date:',row.date)
    print('User:',row.user,'  Hashtags:',row.hashtags)
    print('Likes:',row.likes,'  Retweets:',row.retweets)
    print('Tweet:',row.original_text)
    print('\n=======================================\n')

Insert your query:covid lockdown

Tweet ID: 17127   URL: https://twitter.com/twitter/statuses/1331743887040376832
Date: Wed Nov 25 23:37:39 +0000 2020
User: 58108   Hashtags: []
Likes: 0   Retweets: 1
Tweet: This is why lockdowns are worse than COVID itself https//t.co/YdCHWlOD7q


Tweet ID: 9910   URL: https://twitter.com/twitter/statuses/1331749477342973952
Date: Wed Nov 25 23:59:52 +0000 2020
User: 103048   Hashtags: []
Likes: 1   Retweets: 0
Tweet: "What do think about all these Covid lockdowns?" https//t.co/MAdHm96la3


Tweet ID: 29135   URL: https://twitter.com/twitter/statuses/1331019880020242439
Date: Mon Nov 23 23:40:43 +0000 2020
User: 39359   Hashtags: []
Likes: 0   Retweets: 0
Tweet: https//t.co/JIH3qoK0LJ https//t.co/efu2qUA6OU we are both on covid lockdown so tune in


Tweet ID: 19437   URL: https://twitter.com/twitter/statuses/1331742094507925505
Date: Wed Nov 25 23:30:32 +0000 2020
User: 102448   Hashtags: []
Likes: 203   Retweets: 105
Tweet: 90-year-old woman opts for 

# Creating method with Word2Vec + cosine similarity score

In [44]:
# gensim provides the Word2Vec implementation used by the embedding-based ranking
from gensim.models import Word2Vec
# Loading Word2Vector model from "word2vec.model" (path relative to the working directory;
# presumably trained and saved elsewhere -- TODO confirm).
# NOTE: the functions defined in the following cells use `w2v_model` as a default
# argument value, which Python binds when the `def` runs: re-run those cells after
# reloading the model.
w2v_model = Word2Vec.load("word2vec.model")

In [45]:
def emb_vec(tweet,model=w2v_model):
    """
    Embed a tokenised tweet as the L2-normalised average of its word vectors.

    Argument:
    tweet -- iterable of tokens (strings)
    model -- Word2Vec model exposing its vectors through `model.wv`
             (defaults to the model loaded in the notebook)

    Returns:
    The normalised average of the vectors of the tokens found in the vocabulary, or
    np.zeros((1,1)) when none of the tokens is in the vocabulary.
    """
    # Use the model that was passed in (the global used to be read regardless of `model`).
    # `word in wv` and `wv[word]` work on both gensim 3.x and 4.x, unlike
    # `wv.vocab` / `wv.word_vec`, which were removed in gensim 4.
    keyed_vectors = model.wv
    query_vectors = [keyed_vectors[word] for word in tweet if word in keyed_vectors]

    # In case this list of vectors has length greater than 0 we compute the average value
    if len(query_vectors) > 0:
        vec = np.average(np.array(query_vectors), axis=0)
        norm = np.linalg.norm(vec)
        # A zero average cannot be normalised; return it unchanged instead of NaNs
        return vec / norm if norm > 0 else vec

    # Otherwise we return a 0.
    # NOTE(review): the (1,1) shape does not match the embedding size; it is kept
    # because callers outside this section may depend on it.
    return np.zeros((1,1))

In [46]:
# Same idea as emb_vec, but it also returns the list of words that were in vocabulary
def calculate_w2v_vector(lista,model=w2v_model):
    """
    Average the word vectors of the in-vocabulary words of a token list.

    Argument:
    lista -- list of tokens (strings); it is NOT modified
    model -- Word2Vec model exposing its vectors through `model.wv`
             (defaults to the model loaded in the notebook)

    Returns:
    (words, vec) where `words` is a new list with the tokens found in the vocabulary
    (original order kept) and `vec` is the L2-normalised average of their vectors.
    When no token is in the vocabulary `vec` is a zero vector of the embedding size,
    so that dot products with it are simply 0.
    """
    keyed_vectors = model.wv

    # Build a filtered copy: removing items from `lista` while iterating over it skips
    # the element that follows every removal, and it silently alters the caller's data.
    words = [word for word in lista if word in keyed_vectors]
    if not words:
        return words, np.zeros(keyed_vectors.vector_size)

    # `wv[word]` is available on both gensim 3.x and 4.x
    vec = np.average(np.array([keyed_vectors[word] for word in words]), axis=0)
    norm = np.linalg.norm(vec)
    # A zero average cannot be normalised; return it unchanged instead of NaNs
    return words, (vec / norm if norm > 0 else vec)

In [47]:
def rank_docs_w2v(query_terms,query_vec, docs, index, model=w2v_model):
    """
    Rank documents by the similarity between their embedding and the query embedding.

    Argument:
    query_terms -- list of query terms (currently not used by the ranking itself)
    query_vec -- vector representation of the query
    docs -- ids of the documents to rank (index labels of the notebook-level `original`)
    index -- inverted index, only forwarded to compute_w2v_rank when a new query is requested
    model -- Word2Vec model used to embed the documents

    Returns:
    List of document ids ordered from most to least similar. When there is nothing to
    rank, the user is prompted for a new query and the result of `compute_w2v_rank`
    for that query is returned instead.
    """
    docScores = []
    # dict.fromkeys drops repeated ids while keeping their first-seen order
    for doc in dict.fromkeys(docs):
        terms = original.loc[doc, 'text']
        # Hand over a copy so the token lists stored in `original` can never be altered
        _, doc_vec = calculate_w2v_vector(list(terms), model=model)

        # Both vectors are expected to be unit length, so the dot product is the cosine similarity
        score = np.dot(doc_vec, query_vec)
        # A document (or query) without any in-vocabulary word may yield NaN or a
        # non-scalar value; score it 0 so that sorting stays well defined.
        if np.ndim(score) != 0 or np.isnan(score):
            score = 0.0
        docScores.append([score, doc])

    # Sorting the results
    docScores.sort(reverse=True)
    resultDocs = [x[1] for x in docScores]

    if len(resultDocs) == 0:
        # Interactive fallback: ask for another query and search again with the same model
        print("No results found, try again\n")
        query = input("Insert your query:")
        resultDocs = compute_w2v_rank(query, index, model=model)
    return resultDocs

In [48]:
def compute_w2v_rank(query, index, model=w2v_model):
    '''
    Search the index for a free-text query and rank the matches by Word2Vec similarity.

    The candidate documents are those that contain all of the query terms that are
    both in the model vocabulary and in the index: we get the list of documents for
    each such term and take the intersection of them.

    Argument:
    query -- raw query string typed by the user
    index -- inverted index: index[term] is a list of postings whose first element is the doc id
    model -- Word2Vec model used to embed the query and the documents

    Returns:
    List of document ids as ordered by rank_docs_w2v.
    '''
    query = preprocess_tweet(pd.Series(query)).values[0]
    # `lista` holds the query terms known to the model, `query_vec` their embedding
    lista, query_vec = calculate_w2v_vector(query, model=model)

    # None means "no indexed term seen yet". It must be told apart from an empty
    # intersection: testing len(docs) == 0 instead would let the next term refill the
    # set and return documents that do not contain all of the query terms.
    docs = None
    for term in lista:
        if term not in index:
            # term is not in index
            continue
        # ids of the docs that contain "term"
        termDocs = {posting[0] for posting in index[term]}
        docs = termDocs if docs is None else docs.intersection(termDocs)

    docs = list(docs) if docs else []
    ranked_docs = rank_docs_w2v(lista, query_vec, docs, index, model=model)
    return ranked_docs

In [49]:
# Interactive demo: read a query from the user, rank the matching tweets by Word2Vec
# similarity and print the best ones.
query = input("Insert your query:")
ranked_docs = compute_w2v_rank(query, index, model=w2v_model)  
# Number of results to display
top = 10

# ranked_docs holds index labels of `original`; keep only the columns that are printed
results_df = original.loc[ranked_docs[:top]][['original_text','url','user','date','hashtags','likes','retweets']]

print('=======================================\n')
# `ind` is the row label of the tweet in `original`, printed as the tweet id
for ind, row in results_df.iterrows():
    print('Tweet ID:',ind,'  URL:',row.url)
    print('Date:',row.date)
    print('User:',row.user,'  Hashtags:',row.hashtags)
    print('Likes:',row.likes,'  Retweets:',row.retweets)
    print('Tweet:',row.original_text)
    print('\n=======================================\n')

Insert your query:covid lockdown

Tweet ID: 29135   URL: https://twitter.com/twitter/statuses/1331019880020242439
Date: Mon Nov 23 23:40:43 +0000 2020
User: 39359   Hashtags: []
Likes: 0   Retweets: 0
Tweet: https//t.co/JIH3qoK0LJ https//t.co/efu2qUA6OU we are both on covid lockdown so tune in


Tweet ID: 17127   URL: https://twitter.com/twitter/statuses/1331743887040376832
Date: Wed Nov 25 23:37:39 +0000 2020
User: 58108   Hashtags: []
Likes: 0   Retweets: 1
Tweet: This is why lockdowns are worse than COVID itself https//t.co/YdCHWlOD7q


Tweet ID: 13052   URL: https://twitter.com/twitter/statuses/1331746996735119360
Date: Wed Nov 25 23:50:01 +0000 2020
User: 20118   Hashtags: []
Likes: 0   Retweets: 0
Tweet: @26329 @13126 @15762 @15720 You asked why lockdown for covid when there is no suc… https//t.co/wMjgp948Xx


Tweet ID: 9910   URL: https://twitter.com/twitter/statuses/1331749477342973952
Date: Wed Nov 25 23:59:52 +0000 2020
User: 103048   Hashtags: []
Likes: 1   Retweets: 0
Tweet