![dsl logo](https://github.com/BrockDSL/ARCH_Data_Explore/blob/main/dsl_logo.png?raw=true)

# Similarity comparison of municipal URLs using spaCy

In [1]:
!pip install pandas matplotlib ipywidgets
!pip install spacy
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 7.9 MB/s 
Installing collected packages: jedi
Successfully installed jedi-0.18.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.5.3.tar.gz (14 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gdown
  Building wheel for gdown (PEP 517) ... [?25l[?25hdone
  Created wheel for gdown: filename=gdown-4.5.3-py3-none-any.whl size=14840 sha256=fead2abb93f7cb7ff85c01ecfc72475c7491a6bfa2bb1590f4842b251e9adb6a
  Stored in directory:

In [2]:
import pandas as pd
import os
import gdown

import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout, Box, VBox, Output

In [3]:
# Download the `en_core_web_md` model through the spaCy command-line interface
# (shell escape), then import spaCy and load that model.
!python -m spacy download en_core_web_md
import spacy
# `nlp` is the shared pipeline object called by the similarity helpers further down.
nlp = spacy.load('en_core_web_md')
def without_stopwords(doc):
    """Return a freshly parsed document built from ``doc`` minus its stop words.

    Every token whose ``is_stop`` flag is set is skipped; the remaining tokens
    are converted to strings, joined with single spaces, and passed to ``nlp``.
    """
    kept_tokens = []
    for token in doc:
        if token.is_stop:
            continue
        kept_tokens.append(str(token))
    return nlp(' '.join(kept_tokens))

2022-11-11 03:25:31.680746: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[K     |████████████████████████████████| 42.8 MB 1.3 MB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [4]:
gdown.download("https://drive.google.com/u/0/uc?id=1oKNphdZkuNfeh-beuTkcIBo_EFLWO9zX&export=download","municipal_data.csv.gz",quiet=False)
!gunzip -f municipal_data.csv.gz
archive_data = pd.read_csv("municipal_data.csv")
archive_data.drop(columns=['Unnamed: 0', 'index', 'length','v_pos','v_neg','v_neu','v_comp', 'mime_type_web_server', 'mime_type_tika', 'language'], inplace=True)

Downloading...
From: https://drive.google.com/u/0/uc?id=1oKNphdZkuNfeh-beuTkcIBo_EFLWO9zX&export=download
To: /content/municipal_data.csv.gz
100%|██████████| 51.3M/51.3M [00:00<00:00, 71.6MB/s]


In [5]:
#URLs of interest
# The remote text file has no header row; its single column is named
# "base_url" and then unpacked into a plain Python list.
url_list = pd.read_csv(
    "https://raw.githubusercontent.com/BrockDSL/ARCH_Data_Explore/main/urls_of_interest.txt",
    header=None,
)
url_list.columns = ["base_url"]
url_list = url_list["base_url"].values.tolist()

In [6]:
#-------Compare similarity of crawls using SpaCy-------

# Label that shows the most recently computed similarity score.
similarity_label = widgets.HTML(value="")

#Document content output widgets
# Two half-width HTML panes placed side by side in a flex row, one per document.
box_layout = Layout(display='flex',
                    flex_flow='row',
                    align_items='stretch',
                    width='100%')
text_area_layout = Layout(border='1px solid rgb(255, 204, 102)', padding='10px', width='50%')
doc_output1 = widgets.HTML(
    value='',
    layout=text_area_layout,
)
doc_output2 = widgets.HTML(
    value='',
    layout=text_area_layout,
)

output_box = Box(children=[doc_output1, doc_output2], layout=box_layout)

#------Url 1 widgets---------
# NOTE: `ensure_option` is a Combobox setting; Dropdown does not define that
# trait, so it is no longer passed as an (unrecognised) constructor argument.
url_1 = widgets.Dropdown(
    options=url_list,
    description='URL 1:',
    disabled=False,
    layout=Layout(width='99%')
)
# Starts empty; its options are filled in when a URL 1 value is selected.
url_1_date = widgets.Dropdown(
    options=[],
    description='Date:',
    disabled=False
)

def filter_url(url, date=None):
    """Return the de-duplicated rows of ``archive_data`` for ``url`` that have content.

    Args:
        url: value matched against the ``url`` column.
        date: optional value matched against ``crawl_date``; when falsy, rows
            for every crawl date of the URL are kept.

    Returns:
        The matching rows with a non-null ``content`` column, after
        ``drop_duplicates()``.
    """
    keep = archive_data.url == url
    if date:
        keep = keep & (archive_data.crawl_date == date)
    keep = keep & archive_data.content.notna()
    return archive_data[keep].drop_duplicates()

def calculate_similarity():
    """Compare the two documents currently selected in the URL/date dropdowns.

    Reads the selections from ``url_1``/``url_1_date`` and ``url_2``/``url_2_date``,
    parses the content of the first matching row for each, computes the
    similarity of the stop-word-free documents and writes it to
    ``similarity_label``.

    Returns:
        tuple: ``(doc1, doc2, sim)`` - the two parsed documents (stop words
        included) and the similarity score. If either selection has no row
        with content, the missing document is parsed from an empty string,
        ``sim`` is NaN and the label reports that no score is available.
    """
    def first_content(url, date):
        # An empty selection has no row 0; return None instead of letting
        # `.values[0]` raise IndexError inside the widget callback.
        rows = filter_url(url, date)
        return rows['content'].values[0] if len(rows) else None

    text1 = first_content(url_1.value, url_1_date.value)
    text2 = first_content(url_2.value, url_2_date.value)

    if text1 is None or text2 is None:
        similarity_label.value = "Similarity is: <b>n/a</b> (no content for one of the selections)"
        empty = ''
        return (nlp(empty if text1 is None else text1),
                nlp(empty if text2 is None else text2),
                float('nan'))

    doc1 = nlp(text1)
    doc1_no_stop = without_stopwords(doc1)
    doc2 = nlp(text2)
    doc2_no_stop = without_stopwords(doc2)
    sim = doc1_no_stop.similarity(doc2_no_stop)
    similarity_label.value = "Similarity is: <b>{:.6f}</b>".format(sim)
    return doc1,doc2,sim

def on_value_change_url_1(change):
    """Refill the URL 1 date dropdown with the crawl dates of the newly chosen URL."""
    # Ignore notifications for anything other than the selected value.
    if change['name'] != 'value':
        return
    crawls = filter_url(change['new'])
    url_1_date.options = crawls.crawl_date.values.flatten().tolist()

def on_value_change_date(change):
    """Recompute the similarity and redraw both document panes when a date changes."""
    # Ignore notifications for anything other than the selected value.
    if change['name'] != 'value':
        return
    results = calculate_similarity()
    refresh_output(*results)

def refresh_output(doc1, doc2, sim):
    """Render the sentences of both documents into the side-by-side output panes.

    Args:
        doc1: parsed document shown in ``doc_output1``.
        doc2: parsed document shown in ``doc_output2``.
        sim: similarity score; accepted so the result tuple of
            ``calculate_similarity`` can be unpacked into this call, but it is
            not displayed here.
    """
    from html import escape  # local import keeps the block self-contained

    def get_sents(doc):
        # The text originates from crawled web pages, so it is escaped before
        # being wrapped in markup; otherwise '<' or '&' in the content would be
        # interpreted as HTML by the widget.
        return '\n'.join(f"<p>{escape(str(s))}</p>" for s in doc.sents)

    doc_output1.value = get_sents(doc1)
    doc_output2.value = get_sents(doc2)
    
    
# Subscribe to changes of the selected value only, rather than to every trait
# of the dropdowns (the handlers act on 'value' notifications exclusively).
url_1.observe(on_value_change_url_1, names='value')
url_1_date.observe(on_value_change_date, names='value')

#------Url 2 widgets---------
# NOTE: `ensure_option` is a Combobox setting; Dropdown does not define that
# trait, so it is no longer passed as an (unrecognised) constructor argument.
url_2 = widgets.Dropdown(
    options=url_list,
    description='URL 2:',
    disabled=False,
    layout=Layout(width='99%')
)
# Starts empty; its options are filled in when a URL 2 value is selected.
url_2_date = widgets.Dropdown(
    options=[],
    description='Date:',
    disabled=False
)

def on_value_change_url_2(change):
    """Refill the URL 2 date dropdown with the crawl dates of the newly chosen URL."""
    # Ignore notifications for anything other than the selected value.
    if change['name'] != 'value':
        return
    crawls = filter_url(change['new'])
    url_2_date.options = crawls.crawl_date.values.flatten().tolist()
        
# Subscribe to changes of the selected value only, rather than to every trait
# of the dropdowns (the handlers act on 'value' notifications exclusively).
url_2.observe(on_value_change_url_2, names='value')
url_2_date.observe(on_value_change_date, names='value')

#-----Display------
url_1.value = url_list[1] #force a change to populate the date drop down
url_2.value = url_list[2]

# Stack the instructions, the four dropdowns and the score label vertically,
# then show the two document panes underneath.
selection_widgets = VBox([widgets.Label(value="Select URLs and crawl dates to perform similarity comparisons"),
                          url_1, url_1_date, url_2, url_2_date, similarity_label],)
display(selection_widgets)
display(output_box)

VBox(children=(Label(value='Select URLs and crawl dates to perform similarity comparisons'), Dropdown(descript…

Box(children=(HTML(value="<p>How to Protect Yourself from COVID-19 - Niagara Region, Ontario State of emergenc…

In [7]:
#Section for experimenting with similarity comparisons of custom text
#------Button and label widgets---
# Button that triggers the comparison of the two free-text fields.
sim_button = widgets.Button(
    description='Calculate Similarity',
    tooltip='Click me',
    button_style='',  # one of 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)
def calc_similarity(*args):
    """Compare the text of the two text areas and show the score in ``sim_label``.

    Each text is parsed with ``nlp``; when the "Remove stopwords" checkbox is
    ticked, the parsed document is additionally passed through
    ``without_stopwords``. ``*args`` absorbs whatever the click callback
    passes in and is not used.
    """
    strip_stopwords = stopword_checkbox.value
    parsed = []
    for area in (text_area1, text_area2):
        doc = nlp(area.value)
        parsed.append(without_stopwords(doc) if strip_stopwords else doc)
    first, second = parsed
    sim = first.similarity(second)
    sim_label.value = f"Similarity: <b>{sim}</b>"
    
# Run the comparison each time the button is clicked.
sim_button.on_click(calc_similarity)

# Toggle controlling whether stop words are stripped before comparing.
stopword_checkbox = widgets.Checkbox(
    description='Remove stopwords',
    value=True,
    indent=True,
    disabled=False,
)

# Header row: button, checkbox and the label that receives the score.
sim_label = widgets.HTML(value="Similarity:")
header = widgets.HBox(children=[sim_button, stopword_checkbox, sim_label])


#------Text area widgets----------
# Flex row holding two half-width text areas side by side.
box_layout = Layout(display='flex', flex_flow='row', align_items='stretch', width='100%')
text_layout = Layout(border='1px solid rgb(255, 204, 102)', padding='10px', width='50%')

text_area1 = widgets.Textarea(value='', layout=text_layout, disabled=False)
text_area2 = widgets.Textarea(value='', layout=text_layout, disabled=False)
output_text_box = Box(children=[text_area1, text_area2], layout=box_layout)

# Render the header row, the instructions, and then the two input fields.
display(header)
display(widgets.Label("Enter text below in the two text fields to compare similarity."))
display(output_text_box)

HBox(children=(Button(description='Calculate Similarity', style=ButtonStyle(), tooltip='Click me'), Checkbox(v…

Label(value='Enter text below in the two text fields to compare similarity.')

Box(children=(Textarea(value='', layout=Layout(border='1px solid rgb(255, 204, 102)', padding='10px', width='5…