<a href="https://colab.research.google.com/github/BrockDSL/ARCH_Data_Explore/blob/main/sim_comparison_march_14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the packages this notebook uses into the current runtime.
!pip install pandas matplotlib ipywidgets
!pip install spacy
# gdown is upgraded (bypassing the pip cache); it is used further down to download the dataset.
!pip install --upgrade --no-cache-dir gdown
# Fetch the medium English spaCy model that is later loaded via spacy.load('en_core_web_md').
!python -m spacy download en_core_web_md

print("Loaded and ready")

Collecting gdown
  Downloading gdown-4.4.0.tar.gz (14 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gdown
  Building wheel for gdown (PEP 517) ... [?25l[?25hdone
  Created wheel for gdown: filename=gdown-4.4.0-py3-none-any.whl size=14774 sha256=91c15bfee7ab666b2528d186c35adbe9f197252a0c586ff2e515bf30659613a5
  Stored in directory: /tmp/pip-ephem-wheel-cache-55fhbk_u/wheels/fb/c3/0e/c4d8ff8bfcb0461afff199471449f642179b74968c15b7a69c
Successfully built gdown
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.2.2
    Uninstalling gdown-4.2.2:
      Successfully uninstalled gdown-4.2.2
Successfully installed gdown-4.4.0
Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (

In [None]:
# Restart the runtime automatically by sending signal 9 to this kernel's own process.
# NOTE(review): presumably required so the packages/model installed above are picked up — confirm.
import os
os.kill(os.getpid(), 9)

In [1]:
import pandas as pd
import os
import gdown

import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout, Box, VBox, Output

In [None]:

import spacy
# Load the spaCy pipeline once; the `nlp` object is reused by the similarity helpers below.
nlp = spacy.load('en_core_web_md')
def without_stopwords(doc):
    """Return a re-parsed copy of ``doc`` with its stop-word tokens removed.

    Every token whose ``is_stop`` flag is false is converted to a string, the
    strings are joined with single spaces, and the result is passed through
    ``nlp`` again.
    """
    kept_tokens = [str(token) for token in doc if not token.is_stop]
    remaining_text = ' '.join(kept_tokens)
    return nlp(remaining_text)

In [None]:
gdown.download("https://drive.google.com/u/0/uc?id=1oKNphdZkuNfeh-beuTkcIBo_EFLWO9zX&export=download","municipal_data.csv.gz",quiet=False)
!gunzip -f municipal_data.csv.gz
archive_data = pd.read_csv("municipal_data.csv")
archive_data.drop(columns=['Unnamed: 0', 'index', 'length','v_pos','v_neg','v_neu','v_comp', 'mime_type_web_server', 'mime_type_tika', 'language'], inplace=True)

In [None]:
# URLs of interest: a headerless, single-column text file turned into a plain Python list.
URLS_OF_INTEREST_SOURCE = "https://raw.githubusercontent.com/BrockDSL/ARCH_Data_Explore/main/urls_of_interest.txt"
url_frame = pd.read_csv(URLS_OF_INTEREST_SOURCE, header=None)
url_frame.columns = ["base_url"]
url_list = url_frame.values.flatten().tolist()

In [None]:
#-------Compare similarity of crawls using SpaCy-------

# Label that shows the most recently computed similarity score.
similarity_label = widgets.HTML(value="")

# Document content output widgets: two panes laid out side by side in one row.
box_layout = Layout(
    display='flex',
    flex_flow='row',
    align_items='stretch',
    width='100%',
)
text_area_layout = Layout(
    border='1px solid rgb(255, 204, 102)',
    padding='10px',
    width='50%',
)
doc_output1 = widgets.HTML(value='', layout=text_area_layout)
doc_output2 = widgets.HTML(value='', layout=text_area_layout)

output_box = Box(children=[doc_output1, doc_output2], layout=box_layout)

#------Url 1 widgets---------
url_1 = widgets.Dropdown(
    description='URL 1:',
    options=url_list,
    layout=Layout(width='99%'),
    disabled=False,
    ensure_option=True,
)
url_1_date = widgets.Dropdown(
    description='Date:',
    options=[],  # starts empty; the URL 1 change handler assigns the dates
    disabled=False,
    ensure_option=True,
)

def filter_url(url, date=None):
    """Select the rows of ``archive_data`` recorded for one URL.

    Args:
        url: Value matched against the ``url`` column.
        date: Optional value matched against the ``crawl_date`` column.
            A falsy value (the default ``None``) disables the date filter.

    Returns:
        The matching rows that have non-null ``content``, with fully
        duplicated rows removed.
    """
    mask = archive_data.url == url
    if date:
        mask = mask & (archive_data.crawl_date == date)

    matches = archive_data[mask]
    with_content = matches[matches.content.notna()]
    return with_content.drop_duplicates()

def calculate_similarity():
    """Compare the crawls currently selected in the URL/date drop-downs.

    Reads the selections from ``url_1``/``url_1_date`` and ``url_2``/``url_2_date``,
    takes the content of the first matching row for each, and computes the
    similarity of the two documents after stop-word removal. The result is
    written to ``similarity_label``.

    If either selection matches no row with content, no similarity is
    computed: the label reports "n/a", the missing side is represented by the
    parse of an empty string, and the returned score is NaN. Previously this
    situation raised ``IndexError`` from inside the widget callback.

    Returns:
        tuple: ``(doc1, doc2, sim)`` — the two parsed documents (with stop
        words still present) and the similarity score as a float.
    """
    first_url = filter_url(url_1.value, url_1_date.value)
    second_url = filter_url(url_2.value, url_2_date.value)

    has_first = len(first_url) > 0
    has_second = len(second_url) > 0

    # Indexing .values[0] on an empty selection raises IndexError, so fall back
    # to empty text for a side that has nothing to show.
    doc1 = nlp(first_url['content'].values[0] if has_first else '')
    doc2 = nlp(second_url['content'].values[0] if has_second else '')

    if not (has_first and has_second):
        similarity_label.value = (
            "Similarity is: <b>n/a</b> (no crawl content found for the current selection)"
        )
        return doc1, doc2, float('nan')

    doc1_no_stop = without_stopwords(doc1)
    doc2_no_stop = without_stopwords(doc2)
    sim = doc1_no_stop.similarity(doc2_no_stop)
    similarity_label.value = "Similarity is: <b>{:.6f}</b>".format(sim)
    return doc1, doc2, sim

def on_value_change_url_1(change):
    """Observer for ``url_1``: reload the date choices for the newly selected URL.

    Notifications whose ``name`` is not ``'value'`` are ignored. Otherwise the
    ``crawl_date`` values of the rows returned by ``filter_url`` for the new
    URL become the options of ``url_1_date``.
    """
    if change['name'] != 'value':
        return
    matching_rows = filter_url(change['new'])
    url_1_date.options = matching_rows.crawl_date.values.flatten().tolist()

def on_value_change_date(change):
    """Observer for both date drop-downs: recompute and redraw the comparison.

    Only notifications whose ``name`` is ``'value'`` trigger work; the tuple
    returned by ``calculate_similarity`` is forwarded to ``refresh_output``.
    """
    if change['name'] != 'value':
        return
    comparison = calculate_similarity()
    refresh_output(*comparison)

def refresh_output(doc1, doc2, sim):
    """Render the sentences of both documents into the two HTML output panes.

    Args:
        doc1: Document shown in ``doc_output1``; only ``doc1.sents`` is read.
        doc2: Document shown in ``doc_output2``; only ``doc2.sents`` is read.
        sim: Similarity score. Accepted so the tuple returned by
            ``calculate_similarity`` can be unpacked straight into this call;
            it is not used here.
    """
    import html  # local import: only this function needs it

    def get_sents(doc):
        # The sentence text is placed inside an HTML widget, so escape it:
        # characters such as '<' and '&' must be shown literally rather than
        # being interpreted as markup.
        return '\n'.join(f"<p>{html.escape(str(s))}</p>" for s in doc.sents)

    doc_output1.value = get_sents(doc1)
    doc_output2.value = get_sents(doc2)
    
    
# Register the change handlers for the URL 1 widgets; each handler itself
# ignores notifications whose name is not 'value'.
url_1.observe(on_value_change_url_1)
url_1_date.observe(on_value_change_date)

#------Url 2 widgets---------
url_2 = widgets.Dropdown(
    description='URL 2:',
    options=url_list,
    layout=Layout(width='99%'),
    disabled=False,
    ensure_option=True,
)
url_2_date = widgets.Dropdown(
    description='Date:',
    options=[],  # starts empty; the URL 2 change handler assigns the dates
    disabled=False,
    ensure_option=True,
)

def on_value_change_url_2(change):
    """Observer for ``url_2``: reload the date choices for the newly selected URL.

    Notifications whose ``name`` is not ``'value'`` are ignored. Otherwise the
    ``crawl_date`` values of the rows returned by ``filter_url`` for the new
    URL become the options of ``url_2_date``.
    """
    if change['name'] != 'value':
        return
    matching_rows = filter_url(change['new'])
    url_2_date.options = matching_rows.crawl_date.values.flatten().tolist()
        
# Register the change handlers for the URL 2 widgets; the date drop-down shares
# on_value_change_date with the URL 1 date drop-down.
url_2.observe(on_value_change_url_2)
url_2_date.observe(on_value_change_date)

#-----Display------
# Assigning a value here forces a change notification, which populates the date drop-downs.
url_1.value = url_list[1]
url_2.value = url_list[2]

selection_heading = widgets.Label(value="Select URLs and crawl dates to perform similarity comparisons")
selection_children = [selection_heading, url_1, url_1_date, url_2, url_2_date, similarity_label]
selection_widgets = VBox(selection_children)
display(selection_widgets)
display(output_box)

In [None]:
# Section for experimenting with similarity comparisons on custom text
#------Button and label widgets---
sim_button = widgets.Button(
    description='Calculate Similarity',
    tooltip='Click me',
    button_style='',  # one of 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)
def calc_similarity(*args):
    """Compare the text of the two free-text areas and show the score in ``sim_label``.

    Any positional arguments are accepted and ignored, which lets the function
    be registered directly as a button click callback. When
    ``stopword_checkbox`` is ticked, each parsed text is passed through
    ``without_stopwords`` before the comparison.
    """
    strip_stopwords = stopword_checkbox.value

    def parse(text):
        # Parse one text area, optionally dropping its stop words.
        parsed = nlp(text)
        return without_stopwords(parsed) if strip_stopwords else parsed

    left = parse(text_area1.value)
    right = parse(text_area2.value)
    score = left.similarity(right)
    sim_label.value = f"Similarity: <b>{score}</b>"
    
# Run the comparison every time the button is clicked.
sim_button.on_click(calc_similarity)

# Ticked by default, so stop words are removed unless the user opts out.
stopword_checkbox = widgets.Checkbox(
    description='Remove stopwords',
    value=True,
    indent=True,
    disabled=False,
)

sim_label = widgets.HTML(value="Similarity:")
header_children = [sim_button, stopword_checkbox, sim_label]
header = widgets.HBox(header_children)


#------Text area widgets----------
# Two editable text areas placed side by side in a single row.
box_layout = Layout(
    display='flex',
    flex_flow='row',
    align_items='stretch',
    width='100%',
)
text_layout = Layout(
    border='1px solid rgb(255, 204, 102)',
    padding='10px',
    width='50%',
)
text_area1 = widgets.Textarea(value='', layout=text_layout, disabled=False)
text_area2 = widgets.Textarea(value='', layout=text_layout, disabled=False)
output_text_box = Box(children=[text_area1, text_area2], layout=box_layout)

display(header)
instructions = widgets.Label("Enter text below in the two text fields to compare similarity.")
display(instructions)
display(output_text_box)