In [115]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from array import array
import collections
from collections import defaultdict
import string
import pickle
import random

from google.colab import drive
drive.mount('/content/drive/')
PATH = '/content/drive/My Drive/Colab Notebooks/IR-WA_FinalProject/'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# RQ2 - Report Script

A notebook containing all the required scripts to answer some of the report questions.

---

In [95]:
# Open data
data = pd.read_csv(f"{PATH}data/final_Tweets.csv")

#Open labels
id_labels = pickle.load(open(f"{PATH}ResearchQuestions/id_cluster.p","rb"))
ids = list(id_labels.keys())

In [96]:
# Init labels
data = (data[ data["id"].isin(ids) ]).reset_index()
data["Label"] = -2

In [97]:
# Assign labels
for i, row in data.iterrows():
  row_id = row["id"]
  try:
    label = id_labels[row_id]
    data.loc[i,"Label"]= label
  except:
    pass

In [98]:
# Get labeled data
data = data[data["Label"] != -2]

# TF-IDF + Cosine Similarity

In [67]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [108]:
# Inverse Probability of each cluster
P_cluster = {}

#Unique clusters
clusters = set(id_labels.values())

numDocs = len(data)

for cluster in clusters:
  num_in_cluster = len(data[data["Label"] == cluster])
  P_cluster[cluster] = numDocs/num_in_cluster

In [99]:

def getTerms(terms):

  """
  Preprocess step for tweet.
    1. transform all text in lowercase
    2. Tokenize by transforming string to list
    3. Remove stopwords
    4. Apply stemming, simplify word to root word.
  
  Argument:
    terms -- string (text)
  
  Returns:
    terms -- a list of preprocessed tokens corresponding to the input tweet 
  """
  # Init stemming and stopwords
  stemming = PorterStemmer()
  STOPWORDS = set(stopwords.words("english"))

  # Transform into lower case
  terms = terms.lower() 

  # Remove punctuation
  terms = terms.translate(str.maketrans("","", string.punctuation))

  # Tokenize
  terms = terms.split()

  # Remove stop words
  terms = [t for t in terms if t not in STOPWORDS]

  #Stemming
  terms = [stemming.stem(t) for t in terms]

  # Remove http links found in tweets last term
  try:
    if terms[-1][:4] == "http":
     terms = terms[:-1]
  except:
    pass

  return terms

In [101]:
def create_index_tfidf(data, numDocs):
  """
    Creates index and computes tf, idf and df for collection

    Arguments:
      data -- Pandas dataframe with cols "full_index" per text tweet and "id" per 
              tweet-
    Returns:
      index --  inverted list "term": [["id",[pos1,pos1,..]]..]
      tf -- normalized term frequency per doc 
      df -- document frequency per term
      idf -- inversed docuemnt frequency
  """

  index = defaultdict(list)

  # Term freq of terms in tweets
  tf = defaultdict(list)

  # Tweet freq of term in corpus
  df = defaultdict(int)

  # Inverse df
  idf = defaultdict(float)

  for i, row in data.iterrows():
    tweet = row["full_text"]
    tweet = getTerms(tweet)
    tweet_id = row["id"]

    # Page Index per doc
    termdictPage = {}
    for position, term in enumerate(tweet):
      try:
        termdictPage[term][1].append(position)
      except:
        termdictPage[term] = [tweet_id, array("I",[position])]
    
    # NORM frequencies per doc 
    norm = 0
    for term, posting in termdictPage.items():
      norm += len(posting[1])**2
    norm = np.sqrt(norm)

    # TF normalized per term and DF
    for term, posting in termdictPage.items():
      tf[term].append(np.round(len(posting[1])/norm,4))
      df[term] += 1
    
    # Merge doc page index with main index
    for termpage, postingpage in termdictPage.items():
      index[termpage].append(postingpage)

    # IDF per term
    for term in df:
      idf[term] = np.round(np.log(float(numDocs/df[term])),4)
  
  return index, tf, df, idf

In [102]:
%%time

numDocs = len(data)
index, tf, df, idf = create_index_tfidf(data, numDocs)

CPU times: user 16min 59s, sys: 1.58 s, total: 17min 1s
Wall time: 17min 4s


In [247]:
def rankDocuments(terms, docs, index, idf, tf):
  """
  Computes ranking given query and collection of tweets.

  Arguments:
    terms -- query - str.
    docs -- ID list of docs - list.
    index -- invertex index. - dict
    idf -- inverse document frequency - dict
    tf -- term frequency - dict
  Returns:
    resultDocs -- Ordered list of matching docs based on cosine-sim - list
  """

  # Dict with vector per docID
  docVectors = defaultdict(lambda: [0]*len(terms))

  # Vector per query
  queryVector = [0]*len(terms)

  # TF of query
  query_terms_count = collections.Counter(terms)
 
  # Norm query
  query_norm = np.linalg.norm(list(query_terms_count.values()))
  
  for termIndex, term in enumerate(terms):
    # Check if term exist in collection
    if term not in index:
      continue

    # Score per term-query
    queryVector[termIndex] = query_terms_count[term]/query_norm * idf[term]

    for docIndex, (doc,postings) in enumerate(index[term]):
      # check if IDdoc is in list of IDdocs containg term
      if doc in docs:
        
        # Score per term-doc
        docVectors[doc][termIndex] = tf[term][docIndex] * idf[term]

  #Cosine similarity query-doc
  docScores = [[np.dot(curDocVec, queryVector), doc] for doc, curDocVec in docVectors.items()]

  #Sort by descending similarity
  docScores.sort(reverse=True)

  # Transform to dict
  docScore_dic = {}
  for d in docScores:
    docScore_dic[d[1]] = d[0]

  #Get IDs
  resultDocs = [x[1] for x in docScores]
  
  diversity = 0
  final_resultDocs = []

  if diversification == True:
    #Create permutation
    for i in range(20):
      # Get top 40 docs
      resultDocs_ = resultDocs[:50]

      # Permute
      random.shuffle(resultDocs_)
      
      resultDocs_ = resultDocs_[:20]

      # Get data in permutation
      docs_in_permutation = data[data["id"].isin(resultDocs_)]

      #Coverage
      clusters_in_permutation = docs_in_permutation["Label"]
      coverage = len(set(clusters_in_permutation))
    
      #Cosine Sim
      cosine_sim = sum([docScore_dic[id] for id in resultDocs_])*0.1
      
      #Inverse cluster prob
      lambda_ = 0.01
      P_clusters_in_permutation = sum([P_cluster[c] for c in clusters_in_permutation])*lambda_
      
      #Diversity
      diversity_ = coverage + cosine_sim + P_clusters_in_permutation

      if diversity_ > diversity:
        diversity = diversity_

        dic = {}
        for d in resultDocs_:
          dic[d] = docScore_dic[d]
        
        dic = {k: v for k, v in sorted(dic.items(), key=lambda item: item[1], reverse=True)}
        
        final_resultDocs = list(dic.keys())
  else:
    final_resultDocs = resultDocs[:20]
    docs_in_result = data[data["id"].isin(final_resultDocs)]

    #Coverage
    clusters_in_result = docs_in_result["Label"]
    coverage = len(set(clusters_in_result))
    # Cos Sim
    cosine_sim = sum([docScore_dic[id] for id in final_resultDocs])*0.1
    # Invser cluster prob
    lambda_ = 0.01
    P_cluster_in_result = sum([P_cluster[c] for c in clusters_in_result])*lambda_
    # Diversity 
    diversity = coverage + cosine_sim + P_cluster_in_result

  print(f"Coverage: {coverage}")
  print(f"Diversity: {diversity}")
  return final_resultDocs

In [248]:
def search_tf_idf(query, index, idf, tf, topn):
  """
Preprocess query and find docs with words in query
Arguments:
  query -- query - str.
  index -- inverted index - dict
  idf -- inverse document frequency - dict
  tf -- term frequency - dict
  topn -- N top ranked docs to be returned - int
Returns
  ranked_docs -- list of topn docs ranked by cosine-sim - list
  """
  # Preprocess query
  query = getTerms(query)
 
  # Init set of docs with terms in query
  docs = set()

  for term in query:
    try:
      # Get IDs of docs with term
      termDocs = [posting[0] for posting in index[term]]
      
      # Add new docsID
      docs = docs.union(termDocs)
    
    except:
      pass
    
  docs = list(docs)

  # Rank docs with rankDocuments
  ranked_docs = rankDocuments(query, docs, index, idf, tf)
  ranked_docs = ranked_docs[:topn]

  return ranked_docs

In [252]:
def parser_tweet_results(doc):
  """
Given a Pandas dataframe row formates the information por display
Arguments:
  docs -- pandas dataframe with unique row with tweet info.
Returns:
  tweet -- text tweet - str
  authors -- user name of tweet - str
  date -- of publication -- str
  retweets -- count of retweets - str
  favorites -- count of favourites - str
  """
  # Tweet
  tweet = str(doc["full_text"].values)
  tweet = tweet.replace("'","")
  tweet = tweet.replace("[","")
  tweet = tweet.replace("]","")

  # Author
  author = str(doc["user.name"].values)
  author = author.replace("[","")
  author = author.replace("]","")

  # Date
  date = str(doc["created_at"].values)
  date = date.replace("[","")
  date = date.replace("]","")
  date = date.replace("'","")

  # Retweets
  retweets = str(doc["retweet_count"].values)
  retweets = retweets.replace("[","")
  retweets = retweets.replace("]","")

  # Favorites
  favorites = str(doc["favorite_count"].values)
  favorites = favorites.replace("[","")
  favorites = favorites.replace("]","")

  # URL
  id = str(doc["id"].values)
  id = id.replace("[","")
  id = id.replace("]","")
  url = f"https://twitter.com/twitter/statuses/{id}"

  #Hashtags
  hashtags = str(doc["entities.hashtags"].values)

  return tweet, date, author, retweets, favorites, url, hashtags

In [283]:
def search(index, idf, tf, topn = 20):
  """
Search for tweets inputing a query and see displayed results.
Arguments:
  index -- inverted index - dict
  idf -- inverse document frequency - dict.
  tf -- term frequency - dict.
  topn -- default: 20 - Top N result to display - int.
  """
  print("######################################################")
  print("Insert query:")
  querys = ["Who won the elections?", "Votes count", "Is Joe Biden the president?", "Pennsylvania votes", "Georgia votes", "Texas votes", "Kamala Harris", "USA elections date", "Final recount day", "How to boil water"]
  print("######################################################\n")
  
  # Get topn docs
  df = pd.DataFrame()
  for query in querys:
    ranked_docs = search_tf_idf(query, index, idf, tf, topn)

    if len(ranked_docs) == 0:
      print("No results found !")
      return -1
    
    print("Results\n")

    for i, id in enumerate(ranked_docs):
      # Get tweet corresponding to id
      doc = data[data['id'] == id]
      tweet, date, author, retweets, favorites, url, hashtags = parser_tweet_results(doc)
      
      dic = {
          "query":query,
          "tweet":tweet,
          "date":date,
          "retweets":retweets,
          "favorites":favorites,
          "url":url,
          "hashtags":hashtags
      }

      df = df.append(dic, ignore_index = True)
  
  df.to_csv(f"{PATH}ResearchQuestions/RQ2_WITHOUT.csv")

In [253]:
def search(index, idf, tf, topn = 20):
  """
Search for tweets inputing a query and see displayed results.
Arguments:
  index -- inverted index - dict
  idf -- inverse document frequency - dict.
  tf -- term frequency - dict.
  topn -- default: 20 - Top N result to display - int.
  """
  print("######################################################")
  print("Insert query:")
  query = input()
  print("######################################################\n")
  
  # Get topn docs
  ranked_docs = search_tf_idf(query, index, idf, tf, topn)

  if len(ranked_docs) == 0:
    print("No results found !")
    return -1
  
  print("Results\n")

  for i, id in enumerate(ranked_docs):
    # Get tweet corresponding to id
    doc = data[data['id'] == id]
    tweet, date, author, retweets, favorites, url, hashtags = parser_tweet_results(doc)
    
    print("______________________________________________________")
    print(f"Tweet {i}")
    print(f"\t·Author: {author}")
    print(f"\t·Date: {date}")
    print(f"\t·Tweet: {tweet}")
    print(f"\t·Retweets: {retweets}")
    print(f"\t·Favorites: {favorites}")
    print(f"\t·Hashtags: {hashtags}")
    print(f"\t·URL: {url}")
    print("______________________________________________________\n")

In [284]:
diversification = False

search(index, idf, tf, topn=20)

######################################################
Insert query:
######################################################

Coverage: 7
Diversity: 34.82464988728815
Results

Coverage: 8
Diversity: 66.78907984179534
Results

Coverage: 6
Diversity: 54.01390667735626
Results

Coverage: 6
Diversity: 49.58758875155985
Results

Coverage: 7
Diversity: 71.98763742592809
Results

Coverage: 5
Diversity: 39.32855082276729
Results

Coverage: 7
Diversity: 46.27009432401513
Results

Coverage: 2
Diversity: 42.5010623032415
Results

Coverage: 9
Diversity: 55.86148093788917
Results

Coverage: 5
Diversity: 28.095314125300057
Results

