In [11]:
import fasttext

In [12]:
# english wiki first 1 billion bytes
# https://fasttext.cc/docs/en/unsupervised-tutorial.html
model_path = './wiki_1b.bin'
# model = fasttext.train_unsupervised('data/fil9')
# model.save_model('wiki_1b.bin')

In [13]:
model = fasttext.load_model(model_path)



In [14]:
import os
 
file_size = os.path.getsize('./wiki_1b.bin')
print(f'{round(file_size / 1024 / 1024, 2)} MB')

933.15 MB


In [15]:
model.quantize()  # unsupervised word embedding cannot be quantized

# This means that we have to allow loading of the model on-demand instead bundling like the supervised model for release notes classification

ValueError: For now we only support quantization of supervised models

In [17]:
# https://masongallo.github.io/machine/learning,/python/2016/07/29/cosine-similarity.html
import numpy as np

def cos_sim(a, b):
    """Takes 2 vectors a, b and returns the cosine similarity according 
    to the definition of the dot product
    """
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [18]:
def sim(model, a, b):
    if type(model) is str:
        model = fasttext.load_model(model)

    return cos_sim(*map(model.get_sentence_vector, (a, b)))

In [8]:
sim(
    './wiki_1b.bin',
    'fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',
    'fix issue on loading beacons for VWI job creation',
)



0.9222608

In [9]:
# prefix stripped
sim(
    model,
    'fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',
    'fix issue on loading beacons for VWI job creation',
)

0.9222608

In [10]:
# prefix intact
sim(
    model,
    'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',
    'builder - fix issue on loading beacons for VWI job creation',
)

0.9264801

In [11]:
sim(
    model,
    'modal - explore popstate events & add default onbeforeunload alert if there are unsaved changes',
    'builder - fix issue on loading beacons for VWI job creation',
)

0.79175115

In [12]:
sim(
    model,
    'react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master',
    'Map/GeoCohortMap - add setViewportBBox , setApiBBox'
)

0.86247617

In [2]:
# clear stopwords using gensim preprocessing collection
from gensim.parsing.preprocessing import STOPWORDS



In [4]:
def normalize(s):
    s = s.lower().split()
    s = [w for w in s if w not in STOPWORDS]
    return ' '.join(s)

In [5]:
normalize('react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master')

'react-maps - merge [g2m] map/geocohortmap - add setviewportbbox , setapibbox #76 master'

In [38]:
def sim_norm(model, a, b, normalizations = (normalize, model.get_sentence_vector)):
    if type(model) is str:
        model = fasttext.load_model(model)

    for fn in normalizations:
        a, b = map(fn, (a, b))

    return cos_sim(a, b)

In [39]:
sim_norm(
    model,
    'react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master',
    'Map/GeoCohortMap - add setViewportBBox , setApiBBox'
)

0.91396576

In [40]:
sim_norm(
    model,
    'modal - explore popstate events & add default onbeforeunload alert if there are unsaved changes',
    'builder - fix issue on loading beacons for VWI job creation',
)

0.79462

In [41]:
sim_norm(
    model,
    'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',
    'builder - fix issue on loading beacons for VWI job creation',
)

0.93234414

In [45]:
prs = [
    pr.strip() for pr in '''
        builder - fix issue on loading beacons for VWI job creation
        Hub - change connection type to direct when doing direct upload
        QL / New Taxonomies + Missing views + Layer views
        Popular Times - add ConscientAI for access
        Hub - add direct upload as connection type
        DevOps/Fix - Update cluster names
        New Airflow cluster + task priority + sensor to reschedule
        Camp/Insights/GeoCohort - Add bounded box filtering
        Athena/BeaconConv - Fix - SQL dict to json
        Adserver/Beacons - Add support for custom content as url params
        Report/GeoCohort - use bbox coords to pull postal code level insights
        package - update package to v0.6.1
        Geocohort mvt
        Intelligence map
        Map/GeoCohortMap - add setViewportBBox , setApiBBox
        13 - Data processing
        Work with generated/fake data
        Modal/refactor
        Add map widget
        Lumen tree/tailwindcss
        Commit watch integration and mention in git conventions
        journal - setup Notion dev-journal automation workflow
        losses visualization
    '''.strip().split('\n')
]

dids = [
    did.strip() for did in '''
        react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master
        react-maps- upgrade package and make new release 0.6.1
        overlord/geocohort - fix passing the clean list of GeoCohortFSA aggregated data to map #1833
        overlord/geocohort - upgrade @eqworks/react-maps package in #1833 and merge with master
        overlord/geocohort - review [G2M] table/cluster chart enhancement #1829
        overlord/geocohort - add extra tab for GeoCohort overview #1834
        overseer - test last master branch with bbox added to insights/geocohort
        overlord/geocohort - adjust tooltipFormatX and axisBottomLabelDisplayFn for dates in geocohort aggregated data #1834
        overlord/geocohort - add all geocohorts to the time line chart #1834
        react-maps/GeoCohortMap - open for review [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76
        overlord/GeoCohortMap - clean up and open for review [G2M] Report/GeoCohort - use bbox coords to pull postal code level insights
        react-maps - reviewed https://github.com/EQWorks/react-maps/pull/75
        overlord/geocohort - use bbox coords to pull postal code level insights #1833
        overlord/
        react-maps/GeoCohortMap/Map - design both maps to send out bbox coords of the current viewport #76
        atom - test Quebec cities with geocoder and api calls and collaborate with Ianec to fix Quebec cities geom pull from DB
        QL - refactor modal setup
        modal - explore popstate events & add default onbeforeunload alert if there are unsaved changes
        ql-connect - ql design prototype overview + ui/backend updates + discussion on Tailwindcss exploration experience
        modal/refactor - isolate query save-success modal configs
        modal/refactor - isolate query delete modal configs
        modal/refactor - isolate execution cancel modal configs + file cleanups
        modal/refactor - add reset modal configs
        modal/refactor - cleanups + remove unused modal config states
        design - add new buttons for rest/query/cancel-executions
        design/tree-selector - organize classes with clsx
        modal/refactor - re-style Card component
        modal/refactor - isolate Modal component
        modal/refactor - separate common vs ql components
        modal/refactor - re-style Textfield component
        modal/refactor - isolate query save modal configs
        design/tree-selector - replace Tree selector with tailwindcss
        design/tree-selector - create & style List component
        design/tree-selector - create & style Dialog Component
        design/tree-selector - handle search & support ListMenu for tree selector
        design/tree-selector - support TreeMenu for tree selector
        design/explore - explore tailwindcss
        design/explore - explore headlessui
        design/tree-selector - project setup + init tailwindcss
        design/tree-selector - create & style Textfield component
        notion/journal - fix bug with template literals
        notion/journal - add name-transform for displaying number of prev-day incomplete tasks
        notion/journal - setup db retrieving process
        notion/journal - setup journal routine automation workflows
        product - review https://github.com/EQWorks/ws-problems/issues/165 (client-side crashes on server hibernation)
        common - revise https://github.com/EQWorks/common/pull/27
        data - interview https://eqworks.workable.com/backend/jobs/547053/browser/interview/candidate/129031036
        data - interview https://eqworks.workable.com/backend/jobs/547053/browser/interview/candidate/126188189
        snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)
        python-curriculum/12 - elaborate on the use of Categorical columns, hash device ids
        tech-evan - review medium post "Bridging Web UI into Notebooks"
        data - review https://github.com/EQWorks/ws-problems/issues/127
        data - review https://github.com/EQWorks/ws-problems/issues/145
        data - review https://github.com/EQWorks/ws-problems/issues/150
        data - review https://github.com/EQWorks/ws-problems/issues/151
        python-curriculum/12 - finalize/grammar cleanup, diversify interactive example (filter by both region/name)
        data - review candidate/120151018 and candidate/121056418
        python-curriculum/12 - written material
        python-curriculum/12 - widgets (nested to select columns and values)
        connector-gcs - create connection_hub_gcs_dev test bucket (under EQ Hyperlocal, no org google cloud proj)
        Airflow - Migrate dev stage workflow to new cluster
        Review the data select process design
        locus-ql - make a user flow map for the views selection process
        design/locus-ql - refractor design so that selected columns are differentiable by view categories
        design/locus-ql - polish the user flow for selecting views and columns, check on possible interactions at each step
        design/locus-ql - incorporate team's feedback into the next iteration of views shopping cart prototype
        locus-ql - wireframe different user flows for views shopping cart feature
        design/locus-ql - built new views popup sequence, selection panel, individual column cards
    '''.strip().split('\n')
]

In [105]:
import pandas as pd

data = []
for did in dids:
    for pr in prs:
        data.append({
            'did': did,
            'pr': pr,
            'sim': sim(model, pr, did),
            'sim_norm': sim_norm(model, pr, did),
        })

df = pd.DataFrame(data)
df

Unnamed: 0,did,pr,sim,sim_norm
0,react-maps - merge [G2M] Map/GeoCohortMap - ad...,builder - fix issue on loading beacons for VWI...,0.726238,0.761086
1,react-maps - merge [G2M] Map/GeoCohortMap - ad...,Hub - change connection type to direct when do...,0.753736,0.635717
2,react-maps - merge [G2M] Map/GeoCohortMap - ad...,QL / New Taxonomies + Missing views + Layer views,0.481809,0.613164
3,react-maps - merge [G2M] Map/GeoCohortMap - ad...,Popular Times - add ConscientAI for access,0.747702,0.744181
4,react-maps - merge [G2M] Map/GeoCohortMap - ad...,Hub - add direct upload as connection type,0.756644,0.691825
...,...,...,...,...
1559,design/locus-ql - built new views popup sequen...,Add map widget,0.527497,0.614477
1560,design/locus-ql - built new views popup sequen...,Lumen tree/tailwindcss,0.601705,0.580893
1561,design/locus-ql - built new views popup sequen...,Commit watch integration and mention in git co...,0.661903,0.598540
1562,design/locus-ql - built new views popup sequen...,journal - setup Notion dev-journal automation ...,0.749851,0.745144


In [106]:
# filter by threshold of sim
dff = df.query('sim >= 0.8 or sim_norm >= 0.8')
dff

Unnamed: 0,did,pr,sim,sim_norm
14,react-maps - merge [G2M] Map/GeoCohortMap - ad...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.862476,0.913966
34,react-maps- upgrade package and make new relea...,package - update package to v0.6.1,0.801621,0.769318
46,overlord/geocohort - fix passing the clean lis...,builder - fix issue on loading beacons for VWI...,0.835479,0.798545
47,overlord/geocohort - fix passing the clean lis...,Hub - change connection type to direct when do...,0.804585,0.710856
53,overlord/geocohort - fix passing the clean lis...,Camp/Insights/GeoCohort - Add bounded box filt...,0.769736,0.824506
...,...,...,...,...
1518,locus-ql - wireframe different user flows for ...,builder - fix issue on loading beacons for VWI...,0.813455,0.779025
1525,locus-ql - wireframe different user flows for ...,Camp/Insights/GeoCohort - Add bounded box filt...,0.780383,0.819209
1528,locus-ql - wireframe different user flows for ...,Report/GeoCohort - use bbox coords to pull pos...,0.835963,0.819813
1532,locus-ql - wireframe different user flows for ...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.725369,0.804823


In [107]:
def printer(df):
    for row in df.itertuples():
        for k, v in row._asdict().items():
            if k != 'Index':
                print(f'{k}: {v}')

        print('-' * 10, '\n')

In [108]:
# get the pair of pr, did with the highest sim score
idx = dff.groupby('did').sim.idxmax()
sim_pairs = df.iloc[idx]
sim_pairs

Unnamed: 0,did,pr,sim,sim_norm
1390,Airflow - Migrate dev stage workflow to new cl...,Report/GeoCohort - use bbox coords to pull pos...,0.803692,0.740882
385,QL - refactor modal setup,Modal/refactor,0.701437,0.837877
1413,Review the data select process design,Report/GeoCohort - use bbox coords to pull pos...,0.845225,0.807889
355,atom - test Quebec cities with geocoder and ap...,Report/GeoCohort - use bbox coords to pull pos...,0.860691,0.853623
1367,connector-gcs - create connection_hub_gcs_dev ...,Report/GeoCohort - use bbox coords to pull pos...,0.83413,0.852461
1194,data - review https://github.com/EQWorks/ws-pr...,journal - setup Notion dev-journal automation ...,0.844592,0.863798
1217,data - review https://github.com/EQWorks/ws-pr...,journal - setup Notion dev-journal automation ...,0.849167,0.865895
1240,data - review https://github.com/EQWorks/ws-pr...,journal - setup Notion dev-journal automation ...,0.847276,0.865414
1263,data - review https://github.com/EQWorks/ws-pr...,journal - setup Notion dev-journal automation ...,0.845011,0.863669
552,design - add new buttons for rest/query/cancel...,builder - fix issue on loading beacons for VWI...,0.845306,0.797208


In [109]:
# get the pair of pr, did with the highest sim_norm score
idx = dff.groupby('did').sim_norm.idxmax()
norm_pairs = df.iloc[idx]
norm_pairs

Unnamed: 0,did,pr,sim,sim_norm
1390,Airflow - Migrate dev stage workflow to new cl...,Report/GeoCohort - use bbox coords to pull pos...,0.803692,0.740882
385,QL - refactor modal setup,Modal/refactor,0.701437,0.837877
1424,Review the data select process design,journal - setup Notion dev-journal automation ...,0.794472,0.833235
355,atom - test Quebec cities with geocoder and ap...,Report/GeoCohort - use bbox coords to pull pos...,0.860691,0.853623
1367,connector-gcs - create connection_hub_gcs_dev ...,Report/GeoCohort - use bbox coords to pull pos...,0.83413,0.852461
1194,data - review https://github.com/EQWorks/ws-pr...,journal - setup Notion dev-journal automation ...,0.844592,0.863798
1217,data - review https://github.com/EQWorks/ws-pr...,journal - setup Notion dev-journal automation ...,0.849167,0.865895
1240,data - review https://github.com/EQWorks/ws-pr...,journal - setup Notion dev-journal automation ...,0.847276,0.865414
1263,data - review https://github.com/EQWorks/ws-pr...,journal - setup Notion dev-journal automation ...,0.845011,0.863669
566,design - add new buttons for rest/query/cancel...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.7261,0.811494


In [110]:
dedupe = norm_pairs.query('sim_norm >= 0.85')
printer(dedupe)

did: atom - test Quebec cities with geocoder and api calls and collaborate with Ianec to fix Quebec cities geom pull from DB
pr: Report/GeoCohort - use bbox coords to pull postal code level insights
sim: 0.8606913685798645
sim_norm: 0.8536229133605957
---------- 

did: connector-gcs - create connection_hub_gcs_dev test bucket (under EQ Hyperlocal, no org google cloud proj)
pr: Report/GeoCohort - use bbox coords to pull postal code level insights
sim: 0.8341295719146729
sim_norm: 0.8524613380432129
---------- 

did: data - review https://github.com/EQWorks/ws-problems/issues/127
pr: journal - setup Notion dev-journal automation workflow
sim: 0.8445922136306763
sim_norm: 0.8637979626655579
---------- 

did: data - review https://github.com/EQWorks/ws-problems/issues/145
pr: journal - setup Notion dev-journal automation workflow
sim: 0.8491669297218323
sim_norm: 0.8658953905105591
---------- 

did: data - review https://github.com/EQWorks/ws-problems/issues/150
pr: journal - setup Notion 

In [111]:
len(did), len(dedupe)

(90, 16)