# Search Engine

In [55]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from array import array
import collections
from collections import defaultdict
import string

nltk.download('stopwords')

from google.colab import drive
drive.mount('/content/drive/')
PATH = '/content/drive/My Drive/Colab Notebooks/IR-WA_FinalProject/data/'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import Files

In [168]:
data = pd.read_csv(f"{PATH}politicians_Tweets.csv")
data.head()

Unnamed: 0,created_at,favorite_count,full_text,id,retweet_count,user.id
0,2020-11-11,292716,"This Veterans Day, I feel the full weight of t...",1326638331204038656,19728,939091
1,2020-11-11,364040,"Today, we honor the service of those who have ...",1326557341697839106,32485,939091
2,2020-11-11,437255,We are going to build a health care system tha...,1326344141446373376,24757,939091
3,2020-11-11,193667,"If you’re sick, if you’re struggling, if you’r...",1326325015290273792,14376,939091
4,2020-11-10,632528,"When I’m speaking to foreign leaders, I’m tell...",1326306141056364544,37748,939091


## Preprocessing

In [110]:
def getTerms(terms):

  """
  Preprocess step for tweet.
    1. transform all text in lowercase
    2. Tokenize by transforming string to list
    3. Remove stopwords
    4. Apply stemming, simplify word to root word.
  
  Argument:
    terms -- string (text)
  
  Returns:
    terms -- a list of preprocessed tokens corresponding to the input tweet 
  """
  # Init stemming and stopwords
  stemming = PorterStemmer()
  STOPWORDS = set(stopwords.words("english"))

  # Transform into lower case
  terms = terms.lower() 

  # Remove punctuation
  terms = terms.translate(str.maketrans("","", string.punctuation))

  # Tokenize
  terms = terms.split()

  # Remove stop words
  terms = [t for t in terms if t not in STOPWORDS]

  #Stemming
  terms = [stemming.stem(t) for t in terms]

  # Remove http links found in tweets last term
  if terms[-1][:4] == "http":
    terms = terms[:-1]


  return terms

# TF-IDF Inverted Index

In [100]:
def create_index_tfidf(data, numDocs):

  index = defaultdict(list)

  # Term freq of terms in tweets
  tf = defaultdict(list)

  # Tweet freq of term in corpus
  df = defaultdict(int)

  # Inverse df
  idf = defaultdict(float)

  for i, row in data.iterrows():
    tweet = row["full_text"]
    tweet = getTerms(tweet)
    tweet_id = row["id"]

    # Page Index per doc
    termdictPage = {}
    for position, term in enumerate(tweet):
      try:
        termdictPage[term][1].append(position)
      except:
        termdictPage[term] = [tweet_id, array("I",[position])]
    
    # NORM frequencies per doc 
    norm = 0
    for term, posting in termdictPage.items():
      norm += len(posting[1])**2
    norm = np.sqrt(norm)

    # TF normalized per term and DF
    for term, posting in termdictPage.items():
      tf[term].append(np.round(len(posting[1])/norm,4))
      df[term] += 1
    
    # Merge doc page index with main index
    for termpage, postingpage in termdictPage.items():
      index[termpage].append(postingpage)

    # IDF per term
    for term in df:
      idf[term] = np.round(np.log(float(numDocs/df[term])),4)
  
  return index, tf, df, idf

In [101]:
%%time

numDocs = len(data)
index, tf, df, idf = create_index_tfidf(data, numDocs)

CPU times: user 15.1 s, sys: 32.7 ms, total: 15.1 s
Wall time: 15.1 s


# Ranking

In [124]:
def rankDocuments(terms, docs, index, idf, tf):

  # Dict with vector per docID
  docVectors = defaultdict(lambda: [0]*len(terms))

  # Vector per query
  queryVector = [0]*len(terms)

  # TF of query
  query_terms_count = collections.Counter(terms)
 
  # Norm query
  query_norm = np.linalg.norm(list(query_terms_count.values()))

  for termIndex, term in enumerate(terms):
    # Check if term exist in collection
    if term not in index:
      continue

    # Score per term-query
    queryVector[termIndex] = query_terms_count[term]/query_norm * idf[term]

    for docIndex, (doc,postings) in enumerate(index[term]):
      # check if IDdoc is in list of IDdocs containg term
      if doc in docs:
        # Score per term-doc
        docVectors[doc][termIndex] = tf[term][docIndex] * idf[term]

  #Cosine similarity query-doc
  docScores = [[np.dot(curDocVec, queryVector), doc] for doc, curDocVec in docVectors.items()]

  #Sort by descending similarity
  docScores.sort(reverse=True)

  #Get IDs
  resultDocs = [x[1] for x in docScores]

  return resultDocs

In [125]:
def search_tf_idf(query, index, idf, tf, topn):
  # Preprocess query
  query = getTerms(query)
 
  # Init set of docs with terms in query
  docs = set()

  for term in query:
    try:
      # Get IDs of docs with term
      termDocs = [posting[0] for posting in index[term]]
      
      # Add new docsID
      docs = docs.union(termDocs)
    
    except:
      pass
    
  docs = list(docs)

  # Rank docs with rankDocuments
  ranked_docs = rankDocuments(query, docs, index, idf, tf)
  ranked_docs = ranked_docs[:topn]

  return ranked_docs

In [186]:
def parser_tweet_results(doc):
  # Tweet
  tweet = str(doc["full_text"].values)
  tweet = tweet.replace("'","")
  tweet = tweet.replace("[","")
  tweet = tweet.replace("]","")

  # Author
  author = str(doc["user.id"].values)
  author = author.replace("[","")
  author = author.replace("]","")

  # Date
  date = str(doc["created_at"].values)
  date = date.replace("[","")
  date = date.replace("]","")
  date = date.replace("'","")


  # Retweets
  retweets = str(doc["retweet_count"].values)
  retweets = retweets.replace("[","")
  retweets = retweets.replace("]","")

  # Favorites
  favorites = str(doc["favorite_count"].values)
  favorites = favorites.replace("[","")
  favorites = favorites.replace("]","")

  return tweet, date, retweets, favorites

In [187]:
def search(index, idf, tf, topn = 20):
  print("######################################################")
  print("Insert query:")
  query = input()
  print("######################################################\n")
  
  # Get topn docs
  ranked_docs = search_tf_idf(query, index, idf, tf, topn)

  if len(ranked_docs) == 0:
    print("No results found !")
    return -1
  
  print("Results\n")

  for index, id in enumerate(ranked_docs):
    # Get tweet corresponding to id
    doc = data[data['id'] == id]
    tweet, date, author, retweets, favorites = parser_tweet_results(doc)
    
    print("______________________________________________________")
    print(f"Tweet {index}")
    print(f"\t·Author: {author}")
    print(f"\t·Date: {date}")
    print(f"\t·Tweet: {tweet}")
    print(f"\t·Retweets: {retweets}")
    print(f"\t·Favorites: {favorites}")
    print("______________________________________________________\n")

In [188]:
search(index, idf, tf)

######################################################
Insert query:
winner elections
######################################################

Results

______________________________________________________
Tweet 0
	·Author: AUTHOR
	·Date: 2020-11-05
	·Tweet: It’s clear that when the count is finished, we believe we will be the winners. https://t.co/qVk0igZlrF
	·Retweets: 22922
	·Favorites: 278083
______________________________________________________

______________________________________________________
Tweet 1
	·Author: AUTHOR
	·Date: 2020-11-04
	·Tweet: As @JoeBiden said last night, it’s not up to us or Donald Trump to declare the winner of this election. The voters will decide.
	·Retweets: 24176
	·Favorites: 295225
______________________________________________________

______________________________________________________
Tweet 2
	·Author: AUTHOR
	·Date: 2020-11-04
	·Tweet: "Its not my place or Donald Trumps place to declare the winner of this election. Its the voters place."
	·

In [152]:
ranked_docs

[1324172079025934336,
 1324077589338689539,
 1323867320826109954,
 1323042009075097600,
 1325099845045071873,
 1326353226749386757,
 1323726954231701505,
 1323419193778188288,
 1325961114514038786,
 1324368202139357186,
 1322656293103013889,
 1321598661617397760,
 1326348926652477446,
 1325896369534607360,
 1324927403811024896,
 1323868327744253952,
 1323581437459734529,
 1321083729096413185,
 1323726205993033728,
 1326519025552265216]

In [149]:
w

NameError: ignored