
# TF-IDF + Cosine Similarity

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from array import array
import collections
from collections import defaultdict
import string
import pickle

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import Files

In [3]:
data = pd.read_csv(f"{PATH}final_Tweets.csv")
data.head()

Unnamed: 0,created_at,entities.hashtags,favorite_count,full_text,id,retweet_count,user.id,user.name
0,2020-11-11,[],4,International friendly roundup: Finland stun F...,1326667371730378753,1,16042794,Guardian US
1,2020-11-11,[],11,When Joe Biden formally takes over the preside...,1326666012142526466,5,16042794,Guardian US
2,2020-11-11,[],4,New Yorker fires Jeffrey Toobin after he repor...,1326663505454510081,1,16042794,Guardian US
3,2020-11-11,[],8,One week on: how has Donald Trump handled losi...,1326661105498796032,1,16042794,Guardian US
4,2020-11-11,[],13,France pays tribute to six-year-old resistance...,1326659924278046728,6,16042794,Guardian US


## Preprocessing

In [4]:

def getTerms(terms):

  """
  Preprocess step for tweet.
    1. transform all text in lowercase
    2. Tokenize by transforming string to list
    3. Remove stopwords
    4. Apply stemming, simplify word to root word.
  
  Argument:
    terms -- string (text)
  
  Returns:
    terms -- a list of preprocessed tokens corresponding to the input tweet 
  """
  # Init stemming and stopwords
  stemming = PorterStemmer()
  STOPWORDS = set(stopwords.words("english"))

  # Transform into lower case
  terms = terms.lower() 

  # Remove punctuation
  terms = terms.translate(str.maketrans("","", string.punctuation))

  # Tokenize
  terms = terms.split()

  # Remove stop words
  terms = [t for t in terms if t not in STOPWORDS]

  #Stemming
  terms = [stemming.stem(t) for t in terms]

  # Remove http links found in tweets last term
  try:
    if terms[-1][:4] == "http":
     terms = terms[:-1]
  except:
    pass

  return terms

# TF-IDF Inverted Index

In [5]:
def create_index_tfidf(data, numDocs):
  """
    Creates index and computes tf, idf and df for collection

    Arguments:
      data -- Pandas dataframe with cols "full_index" per text tweet and "id" per 
              tweet-
    Returns:
      index --  inverted list "term": [["id",[pos1,pos1,..]]..]
      tf -- normalized term frequency per doc 
      df -- document frequency per term
      idf -- inversed docuemnt frequency
  """

  index = defaultdict(list)

  # Term freq of terms in tweets
  tf = defaultdict(list)

  # Tweet freq of term in corpus
  df = defaultdict(int)

  # Inverse df
  idf = defaultdict(float)

  for i, row in data.iterrows():
    tweet = row["full_text"]
    tweet = getTerms(tweet)
    tweet_id = row["id"]

    # Page Index per doc
    termdictPage = {}
    for position, term in enumerate(tweet):
      try:
        termdictPage[term][1].append(position)
      except:
        termdictPage[term] = [tweet_id, array("I",[position])]
    
    # NORM frequencies per doc 
    norm = 0
    for term, posting in termdictPage.items():
      norm += len(posting[1])**2
    norm = np.sqrt(norm)

    # TF normalized per term and DF
    for term, posting in termdictPage.items():
      tf[term].append(np.round(len(posting[1])/norm,4))
      df[term] += 1
    
    # Merge doc page index with main index
    for termpage, postingpage in termdictPage.items():
      index[termpage].append(postingpage)

    # IDF per term
    for term in df:
      idf[term] = np.round(np.log(float(numDocs/df[term])),4)
  
  return index, tf, df, idf

In [6]:
%%time

numDocs = len(data)
index, tf, df, idf = create_index_tfidf(data, numDocs)

CPU times: user 17min 2s, sys: 2.07 s, total: 17min 4s
Wall time: 17min 6s


In [7]:
# Save results
pickle.dump(index,open(f"{PATH}utils/index.p", "wb"))
pickle.dump(tf, open(f"{PATH}utils/tf.p", "wb"))
pickle.dump(df,open(f"{PATH}utils/df.p", "wb"))
pickle.dump(idf,open(f"{PATH}utils/idf.p", "wb"))

# Ranking

In [8]:
def rankDocuments(terms, docs, index, idf, tf):
  """
  Computes ranking given query and collection of tweets.

  Arguments:
    terms -- query - str.
    docs -- ID list of docs - list.
    index -- invertex index. - dict
    idf -- inverse document frequency - dict
    tf -- term frequency - dict
  Returns:
    resultDocs -- Ordered list of matching docs based on cosine-sim - list
  """

  # Dict with vector per docID
  docVectors = defaultdict(lambda: [0]*len(terms))

  # Vector per query
  queryVector = [0]*len(terms)

  # TF of query
  query_terms_count = collections.Counter(terms)
 
  # Norm query
  query_norm = np.linalg.norm(list(query_terms_count.values()))
  
  for termIndex, term in enumerate(terms):
    # Check if term exist in collection

    if term not in index:
      continue

    # Score per term-query
    queryVector[termIndex] = query_terms_count[term]/query_norm * idf[term]

    for docIndex, (doc,postings) in enumerate(index[term]):
      # check if IDdoc is in list of IDdocs containg term
      if doc in docs:
        # Score per term-doc
        docVectors[doc][termIndex] = tf[term][docIndex] * idf[term]

  #Cosine similarity query-doc
  docScores = [[np.dot(curDocVec, queryVector), doc] for doc, curDocVec in docVectors.items()]

  #Sort by descending similarity
  docScores.sort(reverse=True)

  #Get IDs
  resultDocs = [x[1] for x in docScores]

  return resultDocs

In [9]:
def search_tf_idf(query, index, idf, tf, topn):
  """
Preprocess query and find docs with words in query
Arguments:
  query -- query - str.
  index -- inverted index - dict
  idf -- inverse document frequency - dict
  tf -- term frequency - dict
  topn -- N top ranked docs to be returned - int
Returns
  ranked_docs -- list of topn docs ranked by cosine-sim - list
  """
  # Preprocess query
  query = getTerms(query)
 
  # Init set of docs with terms in query
  docs = set()

  for term in query:
    try:
      # Get IDs of docs with term
      termDocs = [posting[0] for posting in index[term]]
      
      # Add new docsID
      docs = docs.union(termDocs)
    
    except:
      pass
    
  docs = list(docs)

  # Rank docs with rankDocuments
  ranked_docs = rankDocuments(query, docs, index, idf, tf)
  ranked_docs = ranked_docs[:topn]

  return ranked_docs

In [10]:
def parser_tweet_results(doc):
  """
Given a Pandas dataframe row formates the information por display
Arguments:
  docs -- pandas dataframe with unique row with tweet info.
Returns:
  tweet -- text tweet - str
  authors -- user name of tweet - str
  date -- of publication -- str
  retweets -- count of retweets - str
  favorites -- count of favourites - str
  """
  # Tweet
  tweet = str(doc["full_text"].values)
  tweet = tweet.replace("'","")
  tweet = tweet.replace("[","")
  tweet = tweet.replace("]","")

  # Author
  author = str(doc["user.name"].values)
  author = author.replace("[","")
  author = author.replace("]","")

  # Date
  date = str(doc["created_at"].values)
  date = date.replace("[","")
  date = date.replace("]","")
  date = date.replace("'","")

  # Retweets
  retweets = str(doc["retweet_count"].values)
  retweets = retweets.replace("[","")
  retweets = retweets.replace("]","")

  # Favorites
  favorites = str(doc["favorite_count"].values)
  favorites = favorites.replace("[","")
  favorites = favorites.replace("]","")

  # URL
  id = str(doc["id"].values)
  id = id.replace("[","")
  id = id.replace("]","")
  url = f"https://twitter.com/twitter/statuses/{id}"

  #Hashtags
  hashtags = str(doc["entities.hashtags"].values)

  return tweet, date, author, retweets, favorites, url, hashtags

In [11]:
def search(index, idf, tf, topn = 20):
  """
Search for tweets inputing a query and see displayed results.
Arguments:
  index -- inverted index - dict
  idf -- inverse document frequency - dict.
  tf -- term frequency - dict.
  topn -- default: 20 - Top N result to display - int.
  """
  print("######################################################")
  print("Insert query:")
  query = input()
  print("######################################################\n")
  
  # Get topn docs
  ranked_docs = search_tf_idf(query, index, idf, tf, topn)

  if len(ranked_docs) == 0:
    print("No results found !")
    return -1
  
  print("Results\n")

  for i, id in enumerate(ranked_docs):
    # Get tweet corresponding to id
    doc = data[data['id'] == id]
    tweet, date, author, retweets, favorites, url, hashtags = parser_tweet_results(doc)
    
    print("______________________________________________________")
    print(f"Tweet {i}")
    print(f"\t·Author: {author}")
    print(f"\t·Date: {date}")
    print(f"\t·Tweet: {tweet}")
    print(f"\t·Retweets: {retweets}")
    print(f"\t·Favorites: {favorites}")
    print(f"\t·Hashtags: {hashtags}")
    print(f"\t·URL: {url}")
    print("______________________________________________________\n")

In [None]:
search(index, idf, tf)

######################################################
Insert query:
