![dsl logo](https://github.com/BrockDSL/ARCH_Data_Explore/blob/main/dsl_logo.png?raw=true)

# TF-IDF similarity comparison of URLs
Compares two documents (crawled URLs) using TF-IDF and displays their cosine similarity. Words of significance are color-coded according to their TF-IDF weight within each document.

In [None]:
# Install pinned versions of the libraries this notebook relies on, then fetch
# the medium English spaCy pipeline that is loaded later via spacy.load('en_core_web_md').
# NOTE(review): the colour map cell uses `mpl.colormaps`, which presumably needs
# matplotlib >= 3.5 -- confirm before relaxing the matplotlib pin.
!pip install spacy==3.2.3
!pip install matplotlib==3.5.1
!python -m spacy download en_core_web_md

Collecting spacy==3.2.3
  Downloading spacy-3.2.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 3.9 MB/s 
[?25hCollecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 13.3 MB/s 
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 25.3 MB/s 
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (653 kB)
[K     |████████████████████████████████| 653 kB 35.5 MB/s 
[?25hCollecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.1-py3-none-any.whl (27 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting pathy

Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
[K     |████████████████████████████████| 45.7 MB 2.2 MB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import scipy.sparse
import requests
import numpy as np
import pandas as pd
import json
import matplotlib as mpl
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout, Box, VBox, Output

In [None]:
import spacy
# Load the spaCy pipeline once at module level; custom_analyzer and
# SimilarityDisplay.refresh_output both reuse this shared `nlp` object.
nlp = spacy.load('en_core_web_md')

# Report what was loaded so version mismatches are visible in the cell output.
print("imported spacy version: {}".format(spacy.__version__))
print("pipeline components: {}".format(nlp.pipe_names))

def lemma_lower_no_stop_punct_space_digit(doc):
    """Return the lower-cased lemmas of the meaningful tokens in `doc`.

    Tokens flagged as stop words, punctuation, whitespace or digits are
    skipped; every remaining token contributes `token.lemma_.lower()`.

    Args:
        doc: An iterable of token-like objects exposing `lemma_`, `is_stop`,
            `is_punct`, `is_space` and `is_digit`.

    Returns:
        A list of strings, in the order the tokens appear in `doc`.
    """
    lemmas = []
    for tok in doc:
        # Same checks, same order as a chained "not ... and not ..." filter.
        if tok.is_stop or tok.is_punct or tok.is_space or tok.is_digit:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas

def custom_analyzer(str_doc):
    """Analyzer callable for the TfidfVectorizer object.

    Supplying a custom analyzer skips scikit-learn's default text
    preprocessing; the raw document is parsed with the module-level spaCy
    pipeline `nlp` instead and reduced to its filtered, lower-cased lemmas.

    Args:
        str_doc: The raw document text.

    Returns:
        The list of tokens produced by lemma_lower_no_stop_punct_space_digit.
    """
    return lemma_lower_no_stop_punct_space_digit(nlp(str_doc))

imported spacy version: 3.2.3
pipeline components: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [None]:
# URLs of interest: a headerless, single-column text file that is read into a
# one-column frame and then flattened into a plain Python list.
url_frame = pd.read_csv(
    "https://raw.githubusercontent.com/BrockDSL/ARCH_Data_Explore/main/urls_of_interest.txt",
    header=None,
)
url_frame.columns = ["base_url"]
url_list = url_frame.values.flatten().tolist()

In [None]:
def download_file(url, filename=None, loud=True):
    """Stream the resource at `url` into a local file.

    Args:
        url: Address of the file to download.
        filename: Destination path. When falsy, the last path segment of
            `url` is used.
        loud: When true, print a single-line, carriage-return-updated
            progress report while downloading.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never silently saved in place of the data.
    """
    if not filename:
        filename = url.split('/')[-1]

    chunk_size = 4096

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()

        # Content-Length is optional (e.g. chunked transfer); 0 means "unknown".
        content_len = int(r.headers.get('Content-Length', 0)) if loud else 0
        total_bytes_dl = 0

        with open(filename, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)
                if not loud:
                    continue
                # Count what was actually received; the final chunk is usually
                # shorter than chunk_size.
                total_bytes_dl += len(chunk)
                if content_len > 0:
                    # Decoded bytes can exceed Content-Length for compressed
                    # transfers, so cap the figure at 100.
                    percent = min(100, int((total_bytes_dl / content_len) * 100.0))
                    print(f'\rDownload progress of {filename} {total_bytes_dl}/{content_len}: {percent}%', end='')
                else:
                    print(f'\rDownload progress of {filename} {total_bytes_dl} bytes', end='')

In [None]:
# Fetch the compressed crawl data set, unpack it in place (-f overwrites any
# earlier copy) and load it into a DataFrame.
download_file("https://brockau.s3.us-east-2.amazonaws.com/municipal_data.csv.gz")
!gunzip -f municipal_data.csv.gz
archive_data = pd.read_csv("municipal_data.csv")
# Drop the columns that the cells shown in this notebook do not read.
archive_data.drop(columns=['Unnamed: 0', 'index', 'length','v_pos','v_neg','v_neu','v_comp', 'mime_type_web_server', 'mime_type_tika', 'language'], inplace=True)
# Keep only rows for the URLs of interest that actually have page content.
archive_data = archive_data[archive_data.url.isin(url_list) & archive_data.content.notna()]
archive_data.drop_duplicates(inplace=True)
# Renumber rows 0..n-1. SimilarityDisplay uses this index to pick rows out of
# the TF-IDF matrix, so the filtering steps above and their order matter.
# NOTE(review): presumably the precomputed matrix was built from exactly this
# frame -- confirm before changing any of the filtering.
archive_data = archive_data.reset_index(drop=True)

Download progress of municipal_data.csv.gz 51339264/51335494: 100%

In [None]:
# This is really costly and takes forever when we add SpaCy
# So instead precompute the tfidf_matrix and load it later
#--- Enable this section if you need to recompute results ---
#vector = TfidfVectorizer(analyzer=custom_analyzer)
#tfidf_matrix = vector.fit_transform(archive_data.content)
# scipy.sparse.save_npz('tf_idf_matrix.npz', tfidf_matrix)

# (json.dump writes to the file object; json.dumps only returns a string.)
# with open('tf_idf_vocab.json', 'w') as fp:
#   json.dump(vector.vocabulary_, fp)

# feature_names = vector.get_feature_names_out()
# with open('tf_idf_feature_vector.npy', 'wb') as fp:
#   np.save(fp, feature_names)

#--- Load saved results here ---
# Each download prints its progress without a trailing newline, hence the
# bare print() after every call.
download_file("https://brockau.s3.us-east-2.amazonaws.com/tf_idf_matrix.npz")
print()
download_file("https://brockau.s3.us-east-2.amazonaws.com/tf_idf_feature_vector.npy")
print()
download_file("https://brockau.s3.us-east-2.amazonaws.com/tf_idf_vocab.json")
print()

# Sparse document-by-term matrix; SimilarityDisplay selects rows from it using
# the archive_data index.
tfidf_matrix = scipy.sparse.load_npz('tf_idf_matrix.npz')

#Load the set of terms and their indices in the tfidf_matrix
with open('tf_idf_vocab.json', 'r') as fp:
  vector_vocabulary = json.load(fp)

#Load the set of terms as an np array. Their position (index) in the array is
#the same as in the tfidf_matrix
# NOTE(review): allow_pickle=True unpickles data from a downloaded file, which
# can execute arbitrary code. Presumably needed because the saved array holds
# Python string objects -- only use with a trusted source.
with open('tf_idf_feature_vector.npy', 'rb') as fp:
  feature_names = np.load(fp, allow_pickle=True)

Download progress of tf_idf_matrix.npz 3547136/3543929: 100%
Download progress of tf_idf_feature_vector.npy 94208/93902: 100%
Download progress of tf_idf_vocab.json 94208/93469: 100%


In [None]:
#-------Compare similarity of crawls using tf_idf-------

class SimilarityDisplay:
    """Interactive side-by-side comparison of two crawled documents.

    The user picks two URLs and a crawl date for each. The widget shows the
    similarity of the two documents' TF-IDF vectors and renders both texts
    with high-weight terms coloured by their TF-IDF value.

    Args:
        url_list: List of URLs offered in both URL drop-downs. `display`
            reads entries 1 and 2, so at least three entries are required.
        archive_data: DataFrame with `url`, `crawl_date` and `content`
            columns. Its index labels are used as row numbers into
            `tf_idf_matrix`.
        tf_idf_matrix: Sparse matrix of TF-IDF values, one row per row of
            `archive_data`.
        vector_vocabulary: Mapping from a lower-cased lemma to its column
            index in `tf_idf_matrix`.
        feature_names: Sequence of terms ordered by column index; stored as
            `self.feature_vector`.
        cutoff_factor: Terms are coloured only when their TF-IDF value
            exceeds `cutoff_factor` times the mean non-zero TF-IDF value.
    """

    def __init__(self, url_list, archive_data, tf_idf_matrix, vector_vocabulary, feature_names, cutoff_factor=1.0):
        self.url_list = url_list
        self.archive_data = archive_data
        self.tf_idf_matrix = tf_idf_matrix
        self.vector_vocabulary = vector_vocabulary
        self.feature_vector = feature_names
        self.createColorMap()
        self.cutoff_factor = cutoff_factor

        # Make widgets
        self.createSimilarityLabel()
        self.createURLWidgets()
        self.createContentWidgets()
        self.createCutOffWidgets()

    def createCutOffWidgets(self):
        """Build the cutoff slider and the button that applies it."""
        self.cutoff_slider = widgets.FloatSlider(
            value=self.cutoff_factor,
            min=1,
            max=10.0,
            step=0.1,
            description='Cutoff slider:',
            disabled=False,
            continuous_update=False,
            orientation='horizontal',
            readout=True,
            readout_format='.1f',
        )

        self.cutoff_button = widgets.Button(
            description='Apply cutoff',
            disabled=False,
            button_style='',  # 'success', 'info', 'warning', 'danger' or ''
            tooltip='Click me',
            icon='check'  # (FontAwesome names without the `fa-` prefix)
        )

        def onclick(change):
            # The slider alone changes nothing; the new factor is taken over
            # and the output re-rendered only when the button is pressed.
            self.cutoff_factor = self.cutoff_slider.value
            self.do_refresh_steps()
        self.cutoff_button.on_click(onclick)

        box_layout = Layout(display='flex', flex_flow='row', align_items='stretch', width='100%')
        # Layout widgets horizontally
        self.cutoff_control_box = Box(children=[self.cutoff_slider, self.cutoff_button], layout=box_layout)

    def createColorMap(self):
        """Create the value-to-colour mapper and record the mean TF-IDF value.

        Uses matplotlib's 'plasma' colour map, scaled between the smallest
        and largest non-zero value of the TF-IDF matrix.
        See https://matplotlib.org/3.5.1/api/cm_api.html#module-matplotlib.cm
        """
        # The tfidf_matrix is sparse, so in order to calculate the min and max
        # values (for our colormap) only the non zero values are retrieved.
        non_zero = self.tf_idf_matrix.nonzero()
        tf = self.tf_idf_matrix[non_zero]
        self.tf_idf_mean = tf.mean()

        # Now create the mapper that takes a float and returns a RGBA value
        cmap = mpl.colormaps['plasma']
        self.scalar_map = mpl.cm.ScalarMappable(cmap=cmap)
        self.scalar_map.set_clim(vmin=tf.min(), vmax=tf.max())

    def createContentWidgets(self):
        """Build the two HTML panes that show the annotated documents."""
        # Create layouts
        box_layout = Layout(display='flex', flex_flow='row', align_items='stretch', width='100%')
        text_area_layout = Layout(**{'border': '1px solid rgb(255, 204, 102)', 'padding':'10px', 'width':'50%'})

        # Create html content widgets
        self.doc_output1 = widgets.HTML(
            value='',
            layout=text_area_layout,
        )
        self.doc_output2 = widgets.HTML(
            value='',
            layout=text_area_layout,
        )

        # Layout widgets horizontally
        self.html_content_box = Box(children=[self.doc_output1, self.doc_output2], layout=box_layout)

    def createSimilarityLabel(self):
        """Build the (initially empty) label that shows the similarity."""
        self.similarity_label = widgets.HTML(value="")

    def createURLWidgets(self):
        """Build the URL and crawl-date drop-downs and attach their listeners."""
        #------Url 1 widgets---------
        self.url_1 = widgets.Dropdown(
            options=self.url_list,
            description='URL 1:',
            ensure_option=True,
            disabled=False,
            layout=Layout(width='99%')
        )
        self.url_1_date = widgets.Dropdown(
            options=[],
            description='Date:',
            ensure_option=True,
            disabled=False
        )

        #------Url 2 widgets---------
        self.url_2 = widgets.Dropdown(
            options=self.url_list,
            description='URL 2:',
            ensure_option=True,
            disabled=False,
            layout=Layout(width='99%')
        )
        self.url_2_date = widgets.Dropdown(
            options=[],
            description='Date:',
            ensure_option=True,
            disabled=False
        )

        # Set up listeners. They receive every trait change and filter on the
        # 'value' trait themselves.
        self.url_1.observe(self.on_value_change_url_1)
        self.url_1_date.observe(self.on_value_change_date)
        self.url_2.observe(self.on_value_change_url_2)
        self.url_2_date.observe(self.on_value_change_date)

    def display(self):
        """Show the widgets; call once after construction.

        Requires `url_list` to hold at least three entries.
        """
        # Force a change (which calls the listener) to populate the date
        # drop-downs.
        self.url_1.value = self.url_list[1]
        self.url_2.value = self.url_list[2]

        intro_label = widgets.Label(value="Select URLs and crawl dates to perform similarity comparisons")
        selection_widgets = VBox([intro_label,
                                  self.url_1, self.url_1_date,
                                  self.url_2, self.url_2_date,
                                  self.cutoff_control_box,
                                  self.similarity_label],)
        display(selection_widgets)
        display(self.html_content_box)

    def filter_url(self, url, date=None):
        """Return the rows of `archive_data` for `url`.

        When `date` is truthy the result is further restricted to rows whose
        `crawl_date` equals it.
        """
        if date:
            filtered_url = self.archive_data[(self.archive_data.url == url) & (self.archive_data.crawl_date == date)]
        else:
            filtered_url = self.archive_data[(self.archive_data.url == url)]
        return filtered_url

    def calculate_similarity(self):
        """Compute the similarity of the two selected vectors and show it."""
        self.cosine_similarity = linear_kernel(self.first_url_tfidf_vector, self.second_url_tfidf_vector).flatten()[0]
        self.similarity_label.value = "Similarity is: <b>{:.6f}</b>".format(self.cosine_similarity)

    def on_value_change_url_1(self, change):
        """Listener for the first URL drop-down."""
        self.on_value_change_url(change, self.url_1_date)

    def on_value_change_url_2(self, change):
        """Listener for the second URL drop-down."""
        self.on_value_change_url(change, self.url_2_date)

    def on_value_change_url(self, change, url_date_widget):
        """Repopulate `url_date_widget` with the crawl dates of the new URL.

        Args:
            change: Trait-change dict; only changes of the 'value' trait are
                acted upon.
            url_date_widget: The date drop-down that belongs to the URL
                drop-down which changed.
        """
        if change['name'] != 'value':
            return

        filtered_url = self.filter_url(change['new'])
        dates = filtered_url.crawl_date.values.flatten().tolist()

        # Normally the output is changed only when the date selected is
        # changed. If the user selects a new url, but the date widget's value
        # stays the same, the date listener does not fire and the output
        # would not refresh. A URL without any crawl also needs a refresh so
        # the stale comparison is replaced.
        refresh = (not dates) or url_date_widget.value == dates[0]

        url_date_widget.options = dates

        if refresh:
            self.do_refresh_steps()

    def on_value_change_date(self, change):
        """Listener for both date drop-downs; refreshes on a value change."""
        if change['name'] == 'value':
            self.do_refresh_steps()

    def set_url_data(self):
        """Look up the rows for the current URL/date selections.

        Returns:
            True when both selections match at least one row. False
            otherwise, in which case both row indices are set to None.
        """
        self.first_url = self.filter_url(self.url_1.value, self.url_1_date.value)
        self.second_url = self.filter_url(self.url_2.value, self.url_2_date.value)

        if self.first_url.empty or self.second_url.empty:
            self.first_url_index = None
            self.second_url_index = None
            return False

        self.first_url_index = self.first_url.index[0]
        self.second_url_index = self.second_url.index[0]
        return True

    def set_tf_idf_vectors(self):
        """Select the TF-IDF rows of the two selected documents."""
        self.first_url_tfidf_vector = self.tf_idf_matrix[self.first_url_index]
        self.second_url_tfidf_vector = self.tf_idf_matrix[self.second_url_index]

    def do_refresh_steps(self):
        """Recompute the similarity and re-render both documents."""
        if not self.set_url_data():
            # Nothing to compare: say so instead of failing inside a widget
            # callback and leaving the previous comparison on screen.
            self.similarity_label.value = "No crawl data available for the current selection"
            self.doc_output1.value = ''
            self.doc_output2.value = ''
            return
        self.set_tf_idf_vectors()
        self.calculate_similarity()
        self.refresh_output()

    def get_top_n(self, tf_idf_vector, n):
        """Return the `n` highest-weighted terms of a document.

        Args:
            tf_idf_vector: Single-row sparse TF-IDF vector of the document.
            n: Number of terms to return.

        Returns:
            A list of `(term, tf_idf_value)` tuples, highest value first.
        """
        tfv = tf_idf_vector.toarray()[0]
        tfv_sorted_idx = tfv.argsort()
        # argsort is ascending, so walk the last n entries backwards.
        return [(self.feature_vector[i], tfv[i]) for i in tfv_sorted_idx[:-n-1:-1]]

    def annotate_terms(self, sentence, tf_idf_vector):
        """Render `sentence` as HTML, colouring its significant tokens.

        Args:
            sentence: Iterable of tokens exposing `lemma_` and
                `text_with_ws`.
            tf_idf_vector: Single-row sparse TF-IDF vector of the document
                the sentence belongs to.

        Returns:
            The sentence text as an HTML fragment. Tokens whose TF-IDF value
            exceeds `cutoff_factor * tf_idf_mean` are wrapped in a coloured
            `<span>`; all other tokens are emitted as plain text.
        """
        # Local import keeps this class self-contained.
        from html import escape

        threshold = self.cutoff_factor * self.tf_idf_mean
        annotated_terms = []
        for token in sentence:
            # The text comes from crawled pages, so escape it before it is
            # placed inside an HTML widget.
            text = escape(token.text_with_ws, quote=False)

            # Look up token (term) tf_idf value. A token that is not in the
            # vocabulary has no significance and is left uncoloured.
            token_index = self.vector_vocabulary.get(token.lemma_.lower())
            if token_index is None:
                annotated_terms.append(text)
                continue

            token_val = tf_idf_vector[0, token_index]
            # If the token value does not exceed the mean (of tf-idf values)
            # times the cutoff factor, skip colouring it.
            if token_val > threshold:
                token_color = self.scalar_map.to_rgba(token_val, bytes=True)
                # Format the channels explicitly: the repr of a tuple of
                # NumPy scalars is not guaranteed to be plain integers.
                channels = ', '.join(str(int(c)) for c in token_color)
                annotated_terms.append(f'<span style="color:rgba({channels})">{text}</span>')
            else:
                annotated_terms.append(text)

        return ''.join(annotated_terms)

    def refresh_output(self):
        """Re-parse both selected documents and update the HTML panes."""
        doc1 = nlp(self.first_url['content'].values[0])
        doc2 = nlp(self.second_url['content'].values[0])

        def get_sents(doc, tf_idf_vector):
            # One paragraph per sentence.
            return '\n'.join([f"<p>{self.annotate_terms(s, tf_idf_vector)}</p>" for s in doc.sents])

        self.doc_output1.value = get_sents(doc1, self.first_url_tfidf_vector)
        self.doc_output2.value = get_sents(doc2, self.second_url_tfidf_vector)

# Build the comparison UI from the loaded artefacts and render it in the cell.
a = SimilarityDisplay(
    url_list=url_list,
    archive_data=archive_data,
    tf_idf_matrix=tfidf_matrix,
    vector_vocabulary=vector_vocabulary,
    feature_names=feature_names,
)
a.display()


VBox(children=(Label(value='Select URLs and crawl dates to perform similarity comparisons'), Dropdown(descript…

Box(children=(HTML(value='<p>How to Protect Yourself from <span style="color:rgba(43, 5, 148, 255)">COVID-19 <…