In [1]:
import numpy as np
import pandas as pd

import gensim
import nltk
import stanza

## Downloads

In [2]:
import gensim.downloader as gensim_api
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
from nltk.corpus import wordnet

# `nltk.corpus.wordnet` reads the 'wordnet' corpus; 'omw-1.4' only supplies the Open
# Multilingual Wordnet data that goes with it. Fetch both so a fresh environment can run
# the antonym lookup further down.
nltk.download('wordnet')
nltk.download('omw-1.4')

# One-time, expensive model loads that the rest of the notebook uses as globals.
# Stanza English pipeline: tokenisation through dependency parsing and NER.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse,ner')
# Pretrained word vectors fetched through the gensim downloader.
wv = gensim_api.load('fasttext-wiki-news-subwords-300')
# Sentence-level embedding model.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
# Hugging Face sentiment-analysis pipeline backed by the named SST-2 DistilBERT checkpoint.
sentiment_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/brianwilliams/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-09-18 21:31:21 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| ner       | ontonotes |

2022-09-18 21:31:21 INFO: Use device: cpu
2022-09-18 21:31:21 INFO: Loading: tokenize
2022-09-18 21:31:21 INFO: Loading: pos
2022-09-18 21:31:21 INFO: Loading: lemma
2022-09-18 21:31:21 INFO: Loading: depparse
2022-09-18 21:31:21 INFO: Loading: ner
2022-09-18 21:31:22 INFO: Done loading processors!


## Load and preprocess data

In [3]:
# Pull the "main" worksheet of the goals spreadsheet as CSV through the gviz export URL built below.
sheet_id = "1Bk426YSho-npLlcpt8zTwuiVosQtGeMMJ_hj3rqZAqA"
sheet_name = "main"
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

goals_df = pd.read_csv(url)
# Work on a copy so the "Goal" column inside goals_df keeps its original text.
goals = goals_df["Goal"].copy()
# Delete every '.' and ',' character from each goal string.
goals = goals.str.translate(str.maketrans('', '', '.,'))
goals

0                                                   Blog
1                                            Lose weight
2                                           Write a book
3                                   Stop procrastinating
4                                           Fall in love
                             ...                        
663                                         Cooking more
664                               Mastering bread making
665    Volunteer as a costumed interpreter at a livin...
666    Do more arts and crafts and nature activities ...
667                  Traveling and visiting friends more
Name: Goal, Length: 668, dtype: object

## Sentence embeddings

In [4]:
# Lower bound on cosine similarity for a pair of goals to be kept when ranking pairs.
SIMILARITY_THRESHOLD = 0.5

# Encode every goal as a tensor, then score the embeddings against themselves to get the
# full pairwise cosine-similarity matrix.
goal_embeddings = sentence_model.encode(goals, convert_to_tensor=True)
cosine_scores = util.cos_sim(goal_embeddings, goal_embeddings)

In [5]:
# Rank all similarities between existing goals
# Pairs scoring at or above this are treated as (near-)exact duplicates and left out.
NEAR_DUPLICATE_THRESHOLD = 0.99

# Pull the score matrix into NumPy once so the pair selection below is vectorized.
pairwise_scores = cosine_scores.cpu().numpy()
# The strict upper triangle (k=1) visits every unordered pair (i, j) with i < j exactly
# once -- including pairs that involve the last goal -- and skips the self-similarity diagonal.
row_idx, col_idx = np.triu_indices(pairwise_scores.shape[0], k=1)
pair_scores = pairwise_scores[row_idx, col_idx]
in_range = (pair_scores > SIMILARITY_THRESHOLD) & (pair_scores < NEAR_DUPLICATE_THRESHOLD)

# Build the frame in one shot instead of concatenating one row at a time.
goal_texts = goals.to_numpy()
goal_similarities = pd.DataFrame({
    "goal_x": goal_texts[row_idx[in_range]],
    "goal_y": goal_texts[col_idx[in_range]],
    "similarity": pair_scores[in_range].astype(float),
})

goal_similarities = goal_similarities.sort_values(by="similarity", ascending=False).reset_index(drop=True)
goal_similarities.head()

Unnamed: 0,goal_x,goal_y,similarity
0,Practice gratitude everyday,Practice gratitude daily,0.989245
1,Learn how to cook,learn to cook,0.976222
2,Learn to cook,Learn how to cook,0.976222
3,get 8 hours of sleep each night,get eight hours of sleep every night,0.96655
4,get better at communicating,be better at communicating,0.962177


## Dependency parsing

In [6]:
def extract_root(phrase):
  """Return the lemma of the first word tagged as dependency 'root' in `phrase`.

  The phrase is parsed with the module-level `nlp` pipeline. Returns None when
  no word in any sentence carries the 'root' relation.
  """
  parsed = nlp(phrase)
  for sentence in parsed.sentences:
    for token in sentence.words:
      if token.deprel == 'root':
        return token.lemma
  return None

def extract_words(phrase):
  """Parse `phrase` with `nlp` and return all of its word objects as one flat list.

  Words keep their original order: sentence by sentence, then position within
  each sentence.
  """
  collected = []
  for sentence in nlp(phrase).sentences:
    collected.extend(sentence.words)
  return collected

In [7]:
def extract_important(phrase, asdict=False):
  """Extract the dependency root, `xcomp` words and numeric quantities from a phrase.

  The phrase is parsed with the module-level `nlp` pipeline.

  Args:
    phrase: Text to parse.
    asdict: When True, return a dict keyed 'root', 'xcomps' and 'quantities';
      otherwise return the same three values as a list in that order.

  Returns:
    root: lemma of the first word whose deprel is 'root', or None if there is none.
    xcomps: lemmas of every word whose deprel is 'xcomp'.
    quantities: one {'item': <head word text>, 'amount': <number lemma>} dict
      for every word whose deprel is 'nummod'.
  """
  sentences = nlp(phrase).sentences
  words = [word for sent in sentences for word in sent.words]

  roots = [word.lemma for word in words if word.deprel == 'root']
  xcomps = [word.lemma for word in words if word.deprel == 'xcomp']
  # `head` is a 1-based position inside the word's own sentence, so it has to be
  # resolved against that sentence's words; indexing the list flattened across
  # sentences picks the wrong word as soon as the phrase has more than one sentence.
  quantities = [{
    'item': sent.words[word.head - 1].text,
    'amount': word.lemma
    } for sent in sentences for word in sent.words if word.deprel == 'nummod']

  root = roots[0] if roots else None

  important = {
    'root': root,
    'xcomps': xcomps,
    'quantities': quantities
  }
  return important if asdict else list(important.values())

# Spot-check on a phrase that contains two numeric modifiers ("12" and "three").
extract_important('Do 12 pushups every three days', asdict=True)

{'root': 'do',
 'xcomps': [],
 'quantities': [{'item': 'pushups', 'amount': '12'},
  {'item': 'days', 'amount': 'three'}]}

In [8]:
# Parse every goal and spread the resulting [root, xcomps, quantities] lists into three new columns.
# NOTE(review): the parsed rows are attached positionally via index=goals_df.index, which
# assumes `goals` and `goals_df` have the same length and row order -- confirm.
goals_df[['root', 'xcomps', 'quantities']] = pd.DataFrame(goals.transform(extract_important).tolist(), index=goals_df.index)
goals_df

Unnamed: 0,Goal,Source,Style,Type,root,Unnamed: 5,Unnamed: 6,xcomps,quantities
0,Blog,,Practice,,blog,,,[],[]
1,Lose weight,,Lifestyle,,lose,,,[],[]
2,Write a book,,Target,,write,,,[],[]
3,Stop procrastinating,,Lifestyle,,stop,,,[],[]
4,Fall in love,,Target,,fall,,,[],[]
...,...,...,...,...,...,...,...,...,...
599,earn more money,,,,earn,,,[],[]
600,study,,,,study,,,[],[]
601,get better at makeup,,,,get,,,[good],[]
602,be pretty,,,,be,,,[],[]


In [9]:
def get_antonyms(word):
    """Collect antonym names for `word` from WordNet.

    Walks every lemma of every synset returned for `word` and, for each lemma
    that has antonyms, keeps the name of its first antonym. Returns a set, so
    duplicates collapse; the set is empty when nothing is found.
    """
    return {
        lemma.antonyms()[0].name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
        if lemma.antonyms()
    }

In [10]:
# Spot-check the antonym lookup on a single word.
get_antonyms('gain')

{'fall_back', 'lose', 'loss', 'reduce'}

## Sentiment (currently unused)

In [11]:
# Run the sentiment classifier on each goal and split its first result into two new columns.
# NOTE(review): list(...values()) relies on each result dict yielding the label before the
# score, so the values land in ('sentiment', 'sentiment_confidence') order -- confirm.
goals_df[['sentiment', 'sentiment_confidence']] = pd.DataFrame(goals_df['Goal'].transform(lambda x: list(sentiment_classifier(x)[0].values())).tolist(), index=goals_df.index)
goals_df

Unnamed: 0,Goal,Source,Style,Type,root,Unnamed: 5,Unnamed: 6,xcomps,quantities,sentiment,sentiment_confidence
0,Blog,,Practice,,blog,,,[],[],POSITIVE,0.788019
1,Lose weight,,Lifestyle,,lose,,,[],[],NEGATIVE,0.999560
2,Write a book,,Target,,write,,,[],[],POSITIVE,0.996259
3,Stop procrastinating,,Lifestyle,,stop,,,[],[],NEGATIVE,0.998419
4,Fall in love,,Target,,fall,,,[],[],POSITIVE,0.998757
...,...,...,...,...,...,...,...,...,...,...,...
599,earn more money,,,,earn,,,[],[],POSITIVE,0.994392
600,study,,,,study,,,[],[],POSITIVE,0.993381
601,get better at makeup,,,,get,,,[good],[],NEGATIVE,0.893811
602,be pretty,,,,be,,,[],[],POSITIVE,0.999698


In [12]:
# Persist the annotated goals, leaving the index column out of the file.
# NOTE(review): assumes a data/ directory already exists relative to the working directory -- confirm.
goals_df.to_csv('data/goals_df.csv', index=False)

## Most similar

In [6]:
def wv_similarity(word1, word2):
  """Return the word-vector similarity of two words, or 0 when it cannot be computed.

  Both words are looked up in the module-level `wv` vectors. Any ordinary
  failure raised by that lookup yields 0, so callers can score a whole column
  of words without special-casing the ones that cannot be compared.
  """
  try:
    return wv.similarity(word1, word2)
  except Exception:
    # Still best-effort, but unlike a bare `except:` this lets KeyboardInterrupt
    # and SystemExit through, so a long-running cell can be interrupted.
    return 0

def most_similar_goal(goal):
  """Find the stored goal closest to `goal` by sentence-embedding cosine similarity.

  Encodes `goal` with `sentence_model` and scores it against the module-level
  `goal_embeddings`.

  Returns:
    A (goal_text, similarity) tuple for the best-scoring stored goal.
  """
  goal_encoded = sentence_model.encode(goal, convert_to_tensor=True)
  similarities = util.cos_sim(goal_encoded, goal_embeddings)[0]
  # Use the scores' own argmax rather than np.argmax / builtin max: it does not require
  # the scores to be convertible to a NumPy array and avoids a Python-level scan.
  best = int(similarities.argmax())
  # Embedding rows are positional, so pick the goal by position as well.
  return (goals.iloc[best], similarities[best].item())


def most_similar_goals(goal):
  """Rank every stored goal against `goal`.

  Combines three signals per stored goal: sentence-embedding cosine similarity
  to `goal`, word-vector similarity between the dependency roots, and whether
  the stored goal's root is a WordNet antonym of `goal`'s root.

  Returns:
    A DataFrame with columns 'goals', 'similarities', 'root_similarities' and
    'has_root_antonym', sorted by 'similarities' in descending order with a
    fresh 0..n-1 index.

  NOTE(review): `goals`, `goals_df['root']` and `goal_embeddings` are combined
  as if they were row-aligned -- confirm they were built from the same data.
  """
  goal_encoded = sentence_model.encode(goal, convert_to_tensor=True)
  # Convert the scores to a host-side NumPy array explicitly before handing them to
  # pandas, instead of relying on pandas to coerce a tensor.
  similarities = util.cos_sim(goal_encoded, goal_embeddings)[0].cpu().numpy()

  root = extract_root(goal)
  root_similarities = pd.Series([wv_similarity(root, r) for r in goals_df['root']])
  # extract_root returns None when the parse has no root; skip the antonym lookup then.
  root_antonyms = get_antonyms(root) if root is not None else set()
  has_root_antonym = goals_df['root'].isin(root_antonyms)

  return pd.DataFrame({
    "goals": goals,
    "similarities": similarities,
    "root_similarities": root_similarities,
    "has_root_antonym": has_root_antonym,
  }).sort_values(by='similarities', ignore_index=True, ascending=False)

In [57]:
# Inspect the shape of the embedding produced for a single phrase.
sentence_model.encode("get strong", convert_to_tensor=True).shape

torch.Size([384])

In [15]:
# Show the top 20 rows of the similarity ranking for a sample query.
most_similar_goals('create a budget').head(20)

Unnamed: 0,goals,similarities,root_similarities,has_root_antonym
0,create a budget,1.0,1.0,False
1,create a budget,1.0,1.0,False
2,start budgeting,0.845461,0.547429,False
3,budget,0.809851,0.359036,False
4,Live Within A Budget,0.789505,0.524588,False
5,stop spending extra money,0.558515,0.499701,False
6,manage my finances better,0.555741,0.677159,False
7,Get my finances in order,0.534355,0.595275,False
8,get my finances in order,0.534355,0.595275,False
9,get better at tracking my finances,0.498261,0.595275,False


## Visualization

In [7]:
# Re-encode the goals as NumPy vectors for the scikit-learn steps below. Passing the whole
# list in one call lets the model batch the work and already returns a single 2-D array
# (one row per goal), so no per-goal call or np.stack is needed.
# NOTE: this rebinds `goal_embeddings`, replacing the tensor created in the
# "Sentence embeddings" section.
goal_embeddings = sentence_model.encode(goals.tolist(), convert_to_numpy=True)
goal_embeddings

array([[-0.02779194, -0.04308044, -0.01668005, ...,  0.02978688,
        -0.00123824,  0.00204039],
       [-0.04537829,  0.12193567,  0.04003958, ..., -0.03852352,
         0.03844897, -0.01332329],
       [ 0.03314275,  0.02405508,  0.01464383, ..., -0.01486317,
        -0.04343279, -0.02216911],
       ...,
       [-0.02241089,  0.07949603, -0.03279972, ...,  0.00669243,
        -0.03600563, -0.08714294],
       [ 0.04372225,  0.02791812,  0.04918401, ..., -0.01652369,
        -0.02264436,  0.02987081],
       [ 0.10038289, -0.04439367,  0.04613184, ...,  0.03618219,
        -0.06687228, -0.04546971]], dtype=float32)

In [41]:
from sklearn.manifold import TSNE

# t-SNE with init='random' is stochastic; pin the seed so the 2-D layout -- and everything
# derived from it later, such as the cluster labels -- is the same on every run.
goal_2d_embeddings = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=30, random_state=42).fit_transform(goal_embeddings)
goal_2d_embeddings = pd.DataFrame(goal_2d_embeddings, columns=["x", "y"])
# Keep the goal text next to its coordinates for labelling.
goal_2d_embeddings["goal"] = goals.copy()
goal_2d_embeddings

Unnamed: 0,x,y,goal
0,-32.421833,-18.524136,Blog
1,36.612808,1.947893,Lose weight
2,9.791948,-8.938395,Write a book
3,9.168240,-15.032077,Stop procrastinating
4,-17.399164,-28.054218,Fall in love
...,...,...,...
663,21.181271,-0.282225,Cooking more
664,17.815353,0.759695,Mastering bread making
665,-17.865107,-6.374341,Volunteer as a costumed interpreter at a livin...
666,-5.035253,4.645041,Do more arts and crafts and nature activities ...


In [16]:
# Emit one JavaScript-style object literal per goal. The three columns are selected
# explicitly so the unpacking keeps working after later cells add more columns
# (e.g. "source", "label") to goal_2d_embeddings.
for x, y, goal in goal_2d_embeddings[["x", "y", "goal"]].itertuples(index=False, name=None):
  print(f"{{goal: \"{goal}\", x: {x}, y: {y}}},")

{goal: "Blog", x: 2.9466679096221924, y: -8.509551048278809},
{goal: "Lose weight", x: 16.204334259033203, y: 34.47350311279297},
{goal: "Write a book", x: 0.6854141354560852, y: -3.2502517700195312},
{goal: "Stop procrastinating", x: 16.353883743286133, y: 2.970745325088501},
{goal: "Fall in love", x: 15.65584945678711, y: -36.910377502441406},
{goal: "Be happy", x: 12.89354419708252, y: -16.611835479736328},
{goal: "Journal", x: 2.1385254859924316, y: -9.497031211853027},
{goal: "Get a tattoo", x: -16.483028411865234, y: -0.2702723443508148},
{goal: "Go on a road trip with no predetermined destination", x: -23.579370498657227, y: 22.347640991210938},
{goal: "Get married", x: 17.320302963256836, y: -34.3245964050293},
{goal: "Travel the world", x: -24.621761322021484, y: 19.57588005065918},
{goal: "Drink more water", x: 18.640762329101562, y: 25.23102378845215},
{goal: "See the Northern Lights", x: -28.77141761779785, y: 27.473140716552734},
{goal: "Learn Spanish", x: -11.099313735961

In [9]:
from sklearn.manifold import TSNE

# t-SNE with init='random' is stochastic; pin the seed so the 3-D layout is the same on every run.
goal_3d_embeddings = TSNE(n_components=3, learning_rate='auto', init='random', perplexity=30, random_state=42).fit_transform(goal_embeddings)
goal_3d_embeddings = pd.DataFrame(goal_3d_embeddings, columns=["x", "y", "z"])
# Keep the goal text next to its coordinates for labelling.
goal_3d_embeddings["goal"] = goals.copy()
goal_3d_embeddings

Unnamed: 0,x,y,z,goal
0,0.623445,-30.428440,-26.715790,Blog
1,12.531246,23.364470,30.647419,Lose weight
2,-16.691208,-24.179703,-24.791403,Write a book
3,0.163784,-8.824674,26.049032,Stop procrastinating
4,35.414791,-37.218540,-5.225401,Fall in love
...,...,...,...,...
663,-12.101492,6.650691,18.665855,Cooking more
664,-24.019661,12.735850,15.024239,Mastering bread making
665,10.278896,-12.864866,3.797512,Volunteer as a costumed interpreter at a livin...
666,28.563454,-4.916693,11.320958,Do more arts and crafts and nature activities ...


In [42]:
# Attach each goal's "Source" value from goals_df, labelling missing ones "unknown".
goal_2d_embeddings["source"] = goals_df["Source"].copy().fillna("unknown")
# goal_2d_embeddings["source"] = goal_2d_embeddings["source"].replace({"safa": "hidden"})
goal_2d_embeddings

Unnamed: 0,x,y,goal,source
0,-32.421833,-18.524136,Blog,web
1,36.612808,1.947893,Lose weight,web
2,9.791948,-8.938395,Write a book,web
3,9.168240,-15.032077,Stop procrastinating,web
4,-17.399164,-28.054218,Fall in love,web
...,...,...,...,...
663,21.181271,-0.282225,Cooking more,reddit
664,17.815353,0.759695,Mastering bread making,reddit
665,-17.865107,-6.374341,Volunteer as a costumed interpreter at a livin...,reddit
666,-5.035253,4.645041,Do more arts and crafts and nature activities ...,reddit


In [61]:
# NOTE(review): KMeans is imported here but not used in the visible cells.
from sklearn.cluster import KMeans, AgglomerativeClustering

# Cluster on the projected (x, y) coordinates, not on the original sentence embeddings.
X = goal_2d_embeddings[["x", "y"]].to_numpy()
# With n_clusters=None the number of clusters is determined by distance_threshold.
clusters = AgglomerativeClustering(n_clusters=None, distance_threshold=10).fit(X)

# Store the labels as strings -- presumably so the plot treats them as categories
# rather than a numeric scale; confirm.
goal_2d_embeddings["label"] = list(clusters.labels_)
goal_2d_embeddings["label"] = goal_2d_embeddings["label"].astype(str)
goal_2d_embeddings

Unnamed: 0,x,y,goal,source,label
0,-32.421833,-18.524136,Blog,web,21
1,36.612808,1.947893,Lose weight,web,43
2,9.791948,-8.938395,Write a book,web,71
3,9.168240,-15.032077,Stop procrastinating,web,73
4,-17.399164,-28.054218,Fall in love,web,34
...,...,...,...,...,...
663,21.181271,-0.282225,Cooking more,reddit,46
664,17.815353,0.759695,Mastering bread making,reddit,46
665,-17.865107,-6.374341,Volunteer as a costumed interpreter at a livin...,reddit,13
666,-5.035253,4.645041,Do more arts and crafts and nature activities ...,reddit,94


In [62]:
import plotly.express as px

# Scatter of the 2-D goal coordinates, coloured by cluster label, with each goal's text drawn at its point.
fig = px.scatter(goal_2d_embeddings, x="x", y="y", color="label", text="goal", width=800, height=800)
# Give every trace's text the same colour as its markers.
fig.for_each_trace(lambda t: t.update(
  textfont_color=t.marker.color
))
# Small font for the figure text.
fig.update_layout(
  font={
    "size": 8
  },
)
# Make the markers mostly transparent. This call is the cell's last expression.
fig.update_traces(
  marker={
    "opacity": 0.2
  }
)

In [65]:
def get_2d_embedding(goal):
  """Encode `goal` and project it with `pca`, returning the result as a flat array.

  The sentence embedding is reshaped to a single row before being passed to
  `pca.transform`, and the transformed output is flattened again.

  NOTE(review): `pca` is not defined anywhere in the visible part of this
  notebook (the cells above fit t-SNE, not PCA) -- confirm it is created
  elsewhere before this function is called.
  """
  sentence_vector = sentence_model.encode(goal, convert_to_numpy=True)
  single_row = sentence_vector.reshape(1, -1)
  projected = pca.transform(single_row)
  return projected.reshape(-1)