<a href="https://colab.research.google.com/github/BrockDSL/ARCH_Data_Explore/blob/main/tfidf_muni_sim_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas matplotlib ipywidgets sklearn scipy
!pip install spacy==3.2.3
!pip install --upgrade --no-cache-dir gdown
!python -m spacy download en_core_web_md

print("Loaded and ready")

In [None]:
# Restart the run-time automatically: send signal 9 to the kernel's own
# process. Presumably done so the packages installed above are picked up by a
# fresh kernel -- nothing after this line runs in the current process.
import os
current_pid = os.getpid()
os.kill(current_pid, 9)

In [1]:
import pandas as pd
import os
import gdown

import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout, Box, VBox, Output

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import scipy.sparse

In [2]:

import spacy
# Load the medium English pipeline once; custom_analyzer() and
# calculate_similarity() both reuse this module-level `nlp` object.
nlp = spacy.load('en_core_web_md')

# Report the spaCy version and the components of the loaded pipeline.
print(f"imported spacy version: {spacy.__version__}")
print(f"pipeline components: {nlp.pipe_names}")

def lemma_lower_no_stop_punct_space_digit(doc):
    """Return the lower-cased lemmas of the tokens in `doc` that are kept.

    A token is skipped when any of its `is_stop`, `is_punct`, `is_space` or
    `is_digit` flags is set; every remaining token contributes
    `lemma_.lower()`, in document order.
    """
    kept_lemmas = []
    for token in doc:
        if token.is_stop or token.is_punct or token.is_space or token.is_digit:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return kept_lemmas

#This is the "analyzer" handed to the TfidfVectorizer object.
#With a custom analyzer scikit's default text processing is skipped and SpaCy is used instead
def custom_analyzer(str_doc):
    """Run `str_doc` through the SpaCy pipeline and return its filtered lemmas."""
    return lemma_lower_no_stop_punct_space_digit(nlp(str_doc))

imported spacy version: 3.2.3
pipeline components: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [4]:
#URLs of interest
# The file is read without a header row (header=None); its single column is
# named "base_url" and then pulled out as a plain Python list.
url_list = pd.read_csv("https://raw.githubusercontent.com/BrockDSL/ARCH_Data_Explore/main/urls_of_interest.txt",header=None)
url_list.columns = ["base_url"]
url_list = url_list["base_url"].tolist()

In [5]:
gdown.download("https://drive.google.com/u/0/uc?id=1oKNphdZkuNfeh-beuTkcIBo_EFLWO9zX&export=download","municipal_data.csv.gz",quiet=False)
!gunzip -f municipal_data.csv.gz
archive_data = pd.read_csv("municipal_data.csv")
archive_data.drop(columns=['Unnamed: 0', 'index', 'length','v_pos','v_neg','v_neu','v_comp', 'mime_type_web_server', 'mime_type_tika', 'language'], inplace=True)
archive_data = archive_data[archive_data.url.isin(url_list) & archive_data.content.notna()]
archive_data.drop_duplicates(inplace=True)
archive_data = archive_data.reset_index(drop=True)

Downloading...
From: https://drive.google.com/u/0/uc?id=1oKNphdZkuNfeh-beuTkcIBo_EFLWO9zX&export=download
To: /content/municipal_data.csv.gz
100%|██████████| 51.3M/51.3M [00:00<00:00, 102MB/s] 


In [13]:
# This is really costly and takes forever when we add SpaCy
# So instead precompute the tfidf_matrix and load it later
# vector = TfidfVectorizer(analyzer=custom_analyzer)
# tfidf_matrix = vector.fit_transform(archive_data.content)
# scipy.sparse.save_npz('tdidf_matrix.npz', tfidf_matrix)
gdown.download("https://drive.google.com/u/0/uc?id=1A64-0_xKVrLs2UGhP4wtHshOHJPJp3Gv&export=download", 'tdidf_matrix.npz', quiet=False)
tfidf_matrix = scipy.sparse.load_npz('tdidf_matrix.npz')

# calculate_similarity() indexes tfidf_matrix with archive_data's row labels,
# so the precomputed matrix is only usable if it has exactly one row per row of
# archive_data. Fail loudly here instead of silently comparing the wrong pages
# when the downloaded matrix and the freshly filtered frame have drifted apart.
assert tfidf_matrix.shape[0] == len(archive_data), (
    "Precomputed TF-IDF matrix has {} rows but archive_data has {}; "
    "recompute the matrix with the commented-out code above.".format(
        tfidf_matrix.shape[0], len(archive_data))
)

Downloading...
From: https://drive.google.com/u/0/uc?id=1A64-0_xKVrLs2UGhP4wtHshOHJPJp3Gv&export=download
To: /content/tdidf_matrix.npz
100%|██████████| 3.54M/3.54M [00:00<00:00, 66.4MB/s]


In [14]:
#-------Compare similarity of crawls using SpaCy-------

# Label that calculate_similarity() fills with the latest score.
similarity_label = widgets.HTML(value="")

#Document content output widgets
# Two side-by-side HTML panes (50% width each) inside one flex row.
box_layout = Layout(display='flex',
                    flex_flow='row',
                    align_items='stretch',
                    width='100%')
text_area_layout = Layout(border='1px solid rgb(255, 204, 102)',
                          padding='10px',
                          width='50%')
doc_output1 = widgets.HTML(
    value='',
    layout=text_area_layout,
)
doc_output2 = widgets.HTML(
    value='',
    layout=text_area_layout,
)

output_box = Box(children=[doc_output1, doc_output2], layout=box_layout)

#------Url 1 widgets---------
# `ensure_option` is a Combobox trait; Dropdown does not define it, so passing
# it was an unrecognised constructor argument. A Dropdown can only ever hold
# one of its options, so nothing is lost by omitting it.
url_1 = widgets.Dropdown(
    options=url_list,
    description='URL 1:',
    disabled=False,
    layout=Layout(width='99%')
)
# Starts empty; the URL 1 observer fills in the crawl dates of the chosen URL.
url_1_date = widgets.Dropdown(
    options=[],
    description='Date:',
    disabled=False
)

def filter_url(url, date=None):
    """Return the rows of `archive_data` recorded for `url`.

    When `date` is truthy the rows are further restricted to those whose
    `crawl_date` equals it; a falsy `date` (None, empty string) means
    "every crawl of this URL". The original row labels are kept.
    """
    row_mask = archive_data.url == url
    if date:
        row_mask = row_mask & (archive_data.crawl_date == date)
    return archive_data[row_mask]

def calculate_similarity():
    """Compare the two crawls currently selected in the URL/date dropdowns.

    Reads the selections from url_1/url_1_date and url_2/url_2_date, takes the
    first matching row of each, and scores the corresponding rows of
    `tfidf_matrix` with `linear_kernel`. The score is written to
    `similarity_label`.

    If either selection matches no crawl row (a URL without crawls, or a date
    that does not belong to the chosen URL) the label shows "n/a", the score
    is NaN and the missing side is an empty document, instead of raising
    IndexError.

    Returns:
        (doc1, doc2, cosine_similarity): the SpaCy documents built from the two
        crawls' content and the similarity score.
    """
    first_url = filter_url(url_1.value, url_1_date.value)
    second_url = filter_url(url_2.value, url_2_date.value)

    if first_url.empty or second_url.empty:
        cosine_similarity = float("nan")
        similarity_label.value = "Similarity is: <b>n/a</b> (no crawl found for the current selection)"
    else:
        # Row labels of archive_data double as row positions in tfidf_matrix.
        first_url_tfidf_vector = tfidf_matrix[first_url.index[0]]
        second_url_tfidf_vector = tfidf_matrix[second_url.index[0]]

        # linear_kernel is a plain dot product; it is a cosine similarity only
        # for L2-normalised rows -- presumably true of the precomputed matrix
        # (TfidfVectorizer's default), TODO confirm.
        cosine_similarity = linear_kernel(first_url_tfidf_vector, second_url_tfidf_vector).flatten()[0]
        similarity_label.value = "Similarity is: <b>{:.6f}</b>".format(cosine_similarity)

    first_text = "" if first_url.empty else first_url['content'].values[0]
    second_text = "" if second_url.empty else second_url['content'].values[0]
    doc1 = nlp(first_text)
    doc2 = nlp(second_text)
    return doc1,doc2,cosine_similarity

def on_value_change_url_1(change):
    """Repopulate the URL 1 date dropdown after a new URL has been selected.

    Only changes of the dropdown's `value` trait are acted on. The crawl dates
    of the new URL become the options of `url_1_date`; a URL without any crawl
    rows simply empties the date dropdown.
    """
    if change['name'] != 'value':
        return

    dates = filter_url(change['new']).crawl_date.values.flatten().tolist()

    #Normally the output is changed only when the date selected is changed
    #If the user selects a new url, but the date widget's value is the same
    #then the output will not refresh. This fixes that.
    #`dates` is empty for a URL with no crawl rows, so check it before
    #reading dates[0].
    refresh = bool(dates) and url_1_date.value == dates[0]

    url_1_date.options = dates

    if refresh:
        refresh_output(*calculate_similarity())

def on_value_change_date(change):
    """Recompute and redraw the comparison when a date dropdown's value changes."""
    # Notifications about any other trait are ignored.
    if change['name'] != 'value':
        return
    refresh_output(*calculate_similarity())

def refresh_output(doc1, doc2, sim):
    """Show the sentences of both documents in the two output panes.

    Args:
        doc1: document whose `.sents` are rendered into `doc_output1`.
        doc2: document whose `.sents` are rendered into `doc_output2`.
        sim: similarity score; accepted so the tuple returned by
            calculate_similarity() can be splatted in, but not used here.
    """
    import html  # local import: keeps this block self-contained

    def get_sents(doc):
        # The text comes from crawled web pages, so it is escaped before being
        # placed inside HTML; otherwise "<", ">" and "&" in the page text would
        # be interpreted as markup by the HTML widget.
        return '\n'.join(
            f"<p>{html.escape(str(sentence), quote=False)}</p>" for sentence in doc.sents
        )

    doc_output1.value = get_sents(doc1)
    doc_output2.value = get_sents(doc2)
    
    
# Both handlers only react to the `value` trait, so register them for that
# trait alone instead of being called for every trait change.
url_1.observe(on_value_change_url_1, names='value')
url_1_date.observe(on_value_change_date, names='value')

#------Url 2 widgets---------
# `ensure_option` is a Combobox trait; Dropdown does not define it, so passing
# it was an unrecognised constructor argument. A Dropdown can only ever hold
# one of its options, so nothing is lost by omitting it.
url_2 = widgets.Dropdown(
    options=url_list,
    description='URL 2:',
    disabled=False,
    layout=Layout(width='99%')
)
# Starts empty; the URL 2 observer fills in the crawl dates of the chosen URL.
url_2_date = widgets.Dropdown(
    options=[],
    description='Date:',
    disabled=False
)

def on_value_change_url_2(change):
    """Repopulate the URL 2 date dropdown after a new URL has been selected.

    Only changes of the dropdown's `value` trait are acted on. The crawl dates
    of the new URL become the options of `url_2_date`; a URL without any crawl
    rows simply empties the date dropdown.
    """
    if change['name'] != 'value':
        return

    dates = filter_url(change['new']).crawl_date.values.flatten().tolist()

    # Assigning new options only triggers a redraw when the selected date
    # actually changes. If the current date equals the first date of the new
    # URL, the value stays the same, so the output is refreshed explicitly.
    # `dates` is empty for a URL with no crawl rows, so check it before
    # reading dates[0].
    refresh = bool(dates) and url_2_date.value == dates[0]

    url_2_date.options = dates

    if refresh:
        refresh_output(*calculate_similarity())
        
# Both handlers only react to the `value` trait, so register them for that
# trait alone instead of being called for every trait change.
url_2.observe(on_value_change_url_2, names='value')
url_2_date.observe(on_value_change_date, names='value')

#-----Display------
# Moving each URL dropdown off its initial option forces a change, which is
# what populates the matching date drop down.
url_1.value = url_list[1]
url_2.value = url_list[2]

selection_widgets = VBox(
    children=[
        widgets.Label(value="Select URLs and crawl dates to perform similarity comparisons"),
        url_1,
        url_1_date,
        url_2,
        url_2_date,
        similarity_label,
    ]
)
# Selection controls first, then the two side-by-side document panes.
display(selection_widgets, output_box)

VBox(children=(Label(value='Select URLs and crawl dates to perform similarity comparisons'), Dropdown(descript…

Box(children=(HTML(value="<p>How to Protect Yourself from COVID-19 - Niagara Region, Ontario State of emergenc…