In [86]:
import fasttext

In [87]:
# english wiki first 1 billion bytes
# https://fasttext.cc/docs/en/unsupervised-tutorial.html
model_path = './wiki_1b.bin'
# model = fasttext.train_unsupervised('data/fil9')
# model.save_model('wiki_1b.bin')

In [88]:
model = fasttext.load_model(model_path)



In [89]:
import os
 
file_size = os.path.getsize('./wiki_1b.bin')
print(f'{round(file_size / 1024 / 1024, 2)} MB')

933.15 MB


In [90]:
# https://masongallo.github.io/machine/learning,/python/2016/07/29/cosine-similarity.html
import numpy as np

def cos_sim(a, b):
    """Takes 2 vectors a, b and returns the cosine similarity according 
    to the definition of the dot product
    """
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [123]:
from gensim.utils import simple_preprocess, tokenize
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string

In [125]:
def join_list(l, d = ' '):
    return d.join(l)

In [127]:
pr = 'builder - fix issue on loading beacons for VWI job creation'
did = 'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)'

for s in (pr, did):
    print(s)
    for fn in (simple_preprocess, tokenize):
        print('fn:', fn.__name__)
        print(join_list(fn(s)))
    print('-----')

builder - fix issue on loading beacons for VWI job creation
fn: simple_preprocess
builder fix issue on loading beacons for vwi job creation
fn: tokenize
builder fix issue on loading beacons for VWI job creation
-----
snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)
fn: simple_preprocess
snoke fix beacons list not loading in vwi job creation raised in https app asana com
fn: tokenize
snoke fix beacons list not loading in VWI job creation raised in https app asana com
-----


In [128]:
def sim(model, a, b, normalizations = (simple_preprocess, join_list)):
    if type(model) is str:
        model = fasttext.load_model(model)

    for fn in normalizations:
        a, b = map(fn, (a, b))

    return cos_sim(*map(model.get_sentence_vector, (a, b)))

In [129]:
sim(
    model,
    'builder - fix issue on loading beacons for VWI job creation',
    'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',
)

0.91939306

In [130]:
sim(
    model,
    'builder - fix issue on loading beacons for VWI job creation',
    'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',
    normalizations=[tokenize, join_list],
)

0.9049676

In [131]:
sim(
    model,
    'builder - fix issue on loading beacons for VWI job creation',
    'snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)',
    normalizations = [],
)

0.9264801

In [132]:
prs = [
    pr.strip() for pr in '''
        builder - fix issue on loading beacons for VWI job creation
        Hub - change connection type to direct when doing direct upload
        QL / New Taxonomies + Missing views + Layer views
        Popular Times - add ConscientAI for access
        Hub - add direct upload as connection type
        DevOps/Fix - Update cluster names
        New Airflow cluster + task priority + sensor to reschedule
        Camp/Insights/GeoCohort - Add bounded box filtering
        Athena/BeaconConv - Fix - SQL dict to json
        Adserver/Beacons - Add support for custom content as url params
        Report/GeoCohort - use bbox coords to pull postal code level insights
        package - update package to v0.6.1
        Geocohort mvt
        Intelligence map
        Map/GeoCohortMap - add setViewportBBox , setApiBBox
        13 - Data processing
        Work with generated/fake data
        Modal/refactor
        Add map widget
        Lumen tree/tailwindcss
        Commit watch integration and mention in git conventions
        journal - setup Notion dev-journal automation workflow
        losses visualization
    '''.strip().split('\n')
]

dids = [
    did.strip() for did in '''
        react-maps - merge [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76 with master
        react-maps- upgrade package and make new release 0.6.1
        overlord/geocohort - fix passing the clean list of GeoCohortFSA aggregated data to map #1833
        overlord/geocohort - upgrade @eqworks/react-maps package in #1833 and merge with master
        overlord/geocohort - review [G2M] table/cluster chart enhancement #1829
        overlord/geocohort - add extra tab for GeoCohort overview #1834
        overseer - test last master branch with bbox added to insights/geocohort
        overlord/geocohort - adjust tooltipFormatX and axisBottomLabelDisplayFn for dates in geocohort aggregated data #1834
        overlord/geocohort - add all geocohorts to the time line chart #1834
        react-maps/GeoCohortMap - open for review [G2M] Map/GeoCohortMap - add setViewportBBox , setApiBBox #76
        overlord/GeoCohortMap - clean up and open for review [G2M] Report/GeoCohort - use bbox coords to pull postal code level insights
        react-maps - reviewed https://github.com/EQWorks/react-maps/pull/75
        overlord/geocohort - use bbox coords to pull postal code level insights #1833
        overlord/
        react-maps/GeoCohortMap/Map - design both maps to send out bbox coords of the current viewport #76
        atom - test Quebec cities with geocoder and api calls and collaborate with Ianec to fix Quebec cities geom pull from DB
        QL - refactor modal setup
        modal - explore popstate events & add default onbeforeunload alert if there are unsaved changes
        ql-connect - ql design prototype overview + ui/backend updates + discussion on Tailwindcss exploration experience
        modal/refactor - isolate query save-success modal configs
        modal/refactor - isolate query delete modal configs
        modal/refactor - isolate execution cancel modal configs + file cleanups
        modal/refactor - add reset modal configs
        modal/refactor - cleanups + remove unused modal config states
        design - add new buttons for rest/query/cancel-executions
        design/tree-selector - organize classes with clsx
        modal/refactor - re-style Card component
        modal/refactor - isolate Modal component
        modal/refactor - separate common vs ql components
        modal/refactor - re-style Textfield component
        modal/refactor - isolate query save modal configs
        design/tree-selector - replace Tree selector with tailwindcss
        design/tree-selector - create & style List component
        design/tree-selector - create & style Dialog Component
        design/tree-selector - handle search & support ListMenu for tree selector
        design/tree-selector - support TreeMenu for tree selector
        design/explore - explore tailwindcss
        design/explore - explore headlessui
        design/tree-selector - project setup + init tailwindcss
        design/tree-selector - create & style Textfield component
        notion/journal - fix bug with template literals
        notion/journal - add name-transform for displaying number of prev-day incomplete tasks
        notion/journal - setup db retrieving process
        notion/journal - setup journal routine automation workflows
        product - review https://github.com/EQWorks/ws-problems/issues/165 (client-side crashes on server hibernation)
        common - revise https://github.com/EQWorks/common/pull/27
        data - interview https://eqworks.workable.com/backend/jobs/547053/browser/interview/candidate/129031036
        data - interview https://eqworks.workable.com/backend/jobs/547053/browser/interview/candidate/126188189
        snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)
        python-curriculum/12 - elaborate on the use of Categorical columns, hash device ids
        tech-evan - review medium post "Bridging Web UI into Notebooks"
        data - review https://github.com/EQWorks/ws-problems/issues/127
        data - review https://github.com/EQWorks/ws-problems/issues/145
        data - review https://github.com/EQWorks/ws-problems/issues/150
        data - review https://github.com/EQWorks/ws-problems/issues/151
        python-curriculum/12 - finalize/grammar cleanup, diversify interactive example (filter by both region/name)
        data - review candidate/120151018 and candidate/121056418
        python-curriculum/12 - written material
        python-curriculum/12 - widgets (nested to select columns and values)
        connector-gcs - create connection_hub_gcs_dev test bucket (under EQ Hyperlocal, no org google cloud proj)
        Airflow - Migrate dev stage workflow to new cluster
        Review the data select process design
        locus-ql - make a user flow map for the views selection process
        design/locus-ql - refractor design so that selected columns are differentiable by view categories
        design/locus-ql - polish the user flow for selecting views and columns, check on possible interactions at each step
        design/locus-ql - incorporate team's feedback into the next iteration of views shopping cart prototype
        locus-ql - wireframe different user flows for views shopping cart feature
        design/locus-ql - built new views popup sequence, selection panel, individual column cards
    '''.strip().split('\n')
]

In [133]:
import pandas as pd

In [134]:
data = []
for did in dids:
    for pr in prs:
        data.append({
            'did': did,
            'pr': pr,
            'sim': sim(model, pr, did, normalizations=[]),
            'sim_norm': sim(model, pr, did),
            'sim_tok': sim(model, pr, did, normalizations=[tokenize, join_list]),
        })

df = pd.DataFrame(data)
df

Unnamed: 0,did,pr,sim,sim_norm,sim_tok
0,react-maps - merge [G2M] Map/GeoCohortMap - ad...,builder - fix issue on loading beacons for VWI...,0.726238,0.775629,0.764320
1,react-maps - merge [G2M] Map/GeoCohortMap - ad...,Hub - change connection type to direct when do...,0.753736,0.758098,0.787245
2,react-maps - merge [G2M] Map/GeoCohortMap - ad...,QL / New Taxonomies + Missing views + Layer views,0.481809,0.741818,0.597494
3,react-maps - merge [G2M] Map/GeoCohortMap - ad...,Popular Times - add ConscientAI for access,0.747702,0.728598,0.708794
4,react-maps - merge [G2M] Map/GeoCohortMap - ad...,Hub - add direct upload as connection type,0.756644,0.776477,0.786975
...,...,...,...,...,...
1559,design/locus-ql - built new views popup sequen...,Add map widget,0.527497,0.659233,0.601145
1560,design/locus-ql - built new views popup sequen...,Lumen tree/tailwindcss,0.601705,0.678209,0.662537
1561,design/locus-ql - built new views popup sequen...,Commit watch integration and mention in git co...,0.661903,0.670235,0.677619
1562,design/locus-ql - built new views popup sequen...,journal - setup Notion dev-journal automation ...,0.749851,0.743611,0.749648


In [135]:
# filter by threshold of sim
dff = df.query('sim >= 0.9 or sim_norm >= 0.9 or sim_tok >= 0.9')
dff

Unnamed: 0,did,pr,sim,sim_norm,sim_tok
14,react-maps - merge [G2M] Map/GeoCohortMap - ad...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.862476,0.94175,0.902895
221,react-maps/GeoCohortMap - open for review [G2M...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.885163,0.956354,0.917622
240,overlord/GeoCohortMap - clean up and open for ...,Report/GeoCohort - use bbox coords to pull pos...,0.934373,0.958219,0.94246
286,overlord/geocohort - use bbox coords to pull p...,Report/GeoCohort - use bbox coords to pull pos...,0.976856,0.981589,0.977886
638,modal/refactor - isolate Modal component,Modal/refactor,0.787151,0.923675,0.900247
1010,notion/journal - setup journal routine automat...,journal - setup Notion dev-journal automation ...,0.965812,0.960012,0.952789
1104,snoke - fix beacons list not loading in VWI jo...,builder - fix issue on loading beacons for VWI...,0.92648,0.919393,0.904968


In [136]:
def printer(df):
    for row in df.itertuples():
        for k, v in row._asdict().items():
            if k != 'Index':
                print(f'{k}: {v}')

        print('-' * 10, '\n')

In [137]:
# get the pair of pr, did with the highest sim score
idx = dff.groupby('did').sim.idxmax()
sim_pairs = df.iloc[idx]
sim_pairs

Unnamed: 0,did,pr,sim,sim_norm,sim_tok
638,modal/refactor - isolate Modal component,Modal/refactor,0.787151,0.923675,0.900247
1010,notion/journal - setup journal routine automat...,journal - setup Notion dev-journal automation ...,0.965812,0.960012,0.952789
240,overlord/GeoCohortMap - clean up and open for ...,Report/GeoCohort - use bbox coords to pull pos...,0.934373,0.958219,0.94246
286,overlord/geocohort - use bbox coords to pull p...,Report/GeoCohort - use bbox coords to pull pos...,0.976856,0.981589,0.977886
14,react-maps - merge [G2M] Map/GeoCohortMap - ad...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.862476,0.94175,0.902895
221,react-maps/GeoCohortMap - open for review [G2M...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.885163,0.956354,0.917622
1104,snoke - fix beacons list not loading in VWI jo...,builder - fix issue on loading beacons for VWI...,0.92648,0.919393,0.904968


In [138]:
# get the pair of pr, did with the highest sim_norm score
idx = dff.groupby('did').sim_norm.idxmax()
norm_pairs = df.iloc[idx]
norm_pairs

Unnamed: 0,did,pr,sim,sim_norm,sim_tok
638,modal/refactor - isolate Modal component,Modal/refactor,0.787151,0.923675,0.900247
1010,notion/journal - setup journal routine automat...,journal - setup Notion dev-journal automation ...,0.965812,0.960012,0.952789
240,overlord/GeoCohortMap - clean up and open for ...,Report/GeoCohort - use bbox coords to pull pos...,0.934373,0.958219,0.94246
286,overlord/geocohort - use bbox coords to pull p...,Report/GeoCohort - use bbox coords to pull pos...,0.976856,0.981589,0.977886
14,react-maps - merge [G2M] Map/GeoCohortMap - ad...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.862476,0.94175,0.902895
221,react-maps/GeoCohortMap - open for review [G2M...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.885163,0.956354,0.917622
1104,snoke - fix beacons list not loading in VWI jo...,builder - fix issue on loading beacons for VWI...,0.92648,0.919393,0.904968


In [139]:
# get the pair of pr, did with the highest sim_tok score
idx = dff.groupby('did').sim_tok.idxmax()
tok_pairs = df.iloc[idx]
tok_pairs

Unnamed: 0,did,pr,sim,sim_norm,sim_tok
638,modal/refactor - isolate Modal component,Modal/refactor,0.787151,0.923675,0.900247
1010,notion/journal - setup journal routine automat...,journal - setup Notion dev-journal automation ...,0.965812,0.960012,0.952789
240,overlord/GeoCohortMap - clean up and open for ...,Report/GeoCohort - use bbox coords to pull pos...,0.934373,0.958219,0.94246
286,overlord/geocohort - use bbox coords to pull p...,Report/GeoCohort - use bbox coords to pull pos...,0.976856,0.981589,0.977886
14,react-maps - merge [G2M] Map/GeoCohortMap - ad...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.862476,0.94175,0.902895
221,react-maps/GeoCohortMap - open for review [G2M...,"Map/GeoCohortMap - add setViewportBBox , setAp...",0.885163,0.956354,0.917622
1104,snoke - fix beacons list not loading in VWI jo...,builder - fix issue on loading beacons for VWI...,0.92648,0.919393,0.904968


In [147]:
def check(df, query):
    dedupe = df.query(query)
    print(len(did), len(dedupe), f'{round(len(dedupe) / len(did) * 100, 3)}%')
    printer(dedupe)

In [148]:
check(norm_pairs, 'sim_norm >= 0.9')

90 7 7.778%
did: modal/refactor - isolate Modal component
pr: Modal/refactor
sim: 0.7871510982513428
sim_norm: 0.9236753582954407
sim_tok: 0.9002470374107361
---------- 

did: notion/journal - setup journal routine automation workflows
pr: journal - setup Notion dev-journal automation workflow
sim: 0.9658119082450867
sim_norm: 0.9600117206573486
sim_tok: 0.9527885317802429
---------- 

did: overlord/GeoCohortMap - clean up and open for review [G2M] Report/GeoCohort - use bbox coords to pull postal code level insights
pr: Report/GeoCohort - use bbox coords to pull postal code level insights
sim: 0.9343725442886353
sim_norm: 0.9582189321517944
sim_tok: 0.9424603581428528
---------- 

did: overlord/geocohort - use bbox coords to pull postal code level insights #1833
pr: Report/GeoCohort - use bbox coords to pull postal code level insights
sim: 0.9768564105033875
sim_norm: 0.981589138507843
sim_tok: 0.977886438369751
---------- 

did: react-maps - merge [G2M] Map/GeoCohortMap - add setView

In [149]:
check(sim_pairs, 'sim >= 0.9')

90 4 4.444%
did: notion/journal - setup journal routine automation workflows
pr: journal - setup Notion dev-journal automation workflow
sim: 0.9658119082450867
sim_norm: 0.9600117206573486
sim_tok: 0.9527885317802429
---------- 

did: overlord/GeoCohortMap - clean up and open for review [G2M] Report/GeoCohort - use bbox coords to pull postal code level insights
pr: Report/GeoCohort - use bbox coords to pull postal code level insights
sim: 0.9343725442886353
sim_norm: 0.9582189321517944
sim_tok: 0.9424603581428528
---------- 

did: overlord/geocohort - use bbox coords to pull postal code level insights #1833
pr: Report/GeoCohort - use bbox coords to pull postal code level insights
sim: 0.9768564105033875
sim_norm: 0.981589138507843
sim_tok: 0.977886438369751
---------- 

did: snoke - fix beacons list not loading in VWI job creation (raised in https://app.asana.com/0/1149050960825993/1200436825375992)
pr: builder - fix issue on loading beacons for VWI job creation
sim: 0.9264801144599915

In [150]:
check(tok_pairs, 'sim_tok >= 0.9')

90 7 7.778%
did: modal/refactor - isolate Modal component
pr: Modal/refactor
sim: 0.7871510982513428
sim_norm: 0.9236753582954407
sim_tok: 0.9002470374107361
---------- 

did: notion/journal - setup journal routine automation workflows
pr: journal - setup Notion dev-journal automation workflow
sim: 0.9658119082450867
sim_norm: 0.9600117206573486
sim_tok: 0.9527885317802429
---------- 

did: overlord/GeoCohortMap - clean up and open for review [G2M] Report/GeoCohort - use bbox coords to pull postal code level insights
pr: Report/GeoCohort - use bbox coords to pull postal code level insights
sim: 0.9343725442886353
sim_norm: 0.9582189321517944
sim_tok: 0.9424603581428528
---------- 

did: overlord/geocohort - use bbox coords to pull postal code level insights #1833
pr: Report/GeoCohort - use bbox coords to pull postal code level insights
sim: 0.9768564105033875
sim_norm: 0.981589138507843
sim_tok: 0.977886438369751
---------- 

did: react-maps - merge [G2M] Map/GeoCohortMap - add setView