In [361]:
from pathlib import Path

import numpy as np
import pandas as pd

import gensim
import nltk
import stanza

In [365]:
from nltk.corpus import wordnet

# `wordnet.synsets` reads the 'wordnet' corpus itself; 'omw-1.4' is the
# Open Multilingual Wordnet add-on. Fetch both so the notebook does not depend
# on data that happens to be present in a local nltk_data directory.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/brianwilliams/nltk_data...


True

## Downloads

In [35]:
import gensim.downloader as gensim_api
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

nltk.download('omw-1.4')

# English Stanza pipeline, bound to `nlp` for the parsing helpers.
STANZA_PROCESSORS = 'tokenize,mwt,pos,lemma,depparse,ner'
nlp = stanza.Pipeline(lang='en', processors=STANZA_PROCESSORS)

# Word vectors fetched through the gensim downloader.
wv = gensim_api.load('fasttext-wiki-news-subwords-300')

# Sentence-level embedding model.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Sentiment-analysis pipeline.
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …



Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/ner/ontonotes.pt:   0%|        …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/forward_charlm/1billion.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/backward_charlm/1billion.pt:   …

2022-08-13 10:41:54 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| ner       | ontonotes |

2022-08-13 10:41:54 INFO: Use device: cpu
2022-08-13 10:41:54 INFO: Loading: tokenize
2022-08-13 10:41:54 INFO: Loading: pos
2022-08-13 10:41:54 INFO: Loading: lemma
2022-08-13 10:41:54 INFO: Loading: depparse
2022-08-13 10:41:55 INFO: Loading: ner
2022-08-13 10:41:55 INFO: Done loading processors!


## Load and preprocess data

In [268]:
# Read the "main" sheet of the goals spreadsheet through its CSV export URL.
sheet_id = "1Bk426YSho-npLlcpt8zTwuiVosQtGeMMJ_hj3rqZAqA"
sheet_name = "main"
url = (
    "https://docs.google.com/spreadsheets/d/"
    f"{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
)

goals_df = pd.read_csv(url)

# Drop periods and commas from the goal text; `str.translate` returns a new
# Series, so the "Goal" column of `goals_df` keeps its original punctuation.
punctuation_table = str.maketrans('', '', '.,')
goals = goals_df["Goal"].str.translate(punctuation_table)
goals

0                      Blog
1               Lose weight
2              Write a book
3      Stop procrastinating
4              Fall in love
               ...         
504        be a good friend
505                 hug mom
506               bake more
507     be a better dog mom
508         reduce body fat
Name: Goal, Length: 509, dtype: object

## Sentence embeddings

In [362]:
# Lower bound on cosine similarity for a pair of goals to be reported.
SIMILARITY_THRESHOLD = 0.5

# Pass a plain list of strings: handing over the Series directly only works
# while its index happens to be the default 0..n-1 integer range.
goal_embeddings = sentence_model.encode(goals.tolist(), convert_to_tensor=True)

# Pairwise cosine similarity of every goal against every goal.
cosine_scores = util.cos_sim(goal_embeddings, goal_embeddings)

In [363]:
# Rank all similarities between existing goals.

# Pairs scoring at or above this value are left out (presumably because they
# are near-verbatim duplicates -- confirm the intent).
NEAR_DUPLICATE_THRESHOLD = 0.99

# Work on a NumPy copy so scores are read in bulk rather than one tensor
# element at a time.
score_matrix = cosine_scores.detach().cpu().numpy()
goal_texts = goals.to_numpy()

# The strict upper triangle (k=1) visits every unordered pair (i, j) with
# i < j exactly once, in the same order as a nested i/j loop.
idx_x, idx_y = np.triu_indices(len(goal_texts), k=1)
pair_scores = score_matrix[idx_x, idx_y]
keep = (pair_scores > SIMILARITY_THRESHOLD) & (pair_scores < NEAR_DUPLICATE_THRESHOLD)

# Build the frame in one go instead of concatenating one row per match.
goal_similarities = pd.DataFrame({
    "goal_x": goal_texts[idx_x[keep]],
    "goal_y": goal_texts[idx_y[keep]],
    "similarity": pair_scores[keep].astype(float),
})

goal_similarities = goal_similarities.sort_values(by="similarity", ascending=False).reset_index(drop=True)
goal_similarities.head()

Unnamed: 0,goal_x,goal_y,similarity
0,Practice gratitude everyday,Practice gratitude daily,0.989245
1,Learn how to cook,learn to cook,0.976222
2,Learn to cook,Learn how to cook,0.976222
3,get 8 hours of sleep each night,get eight hours of sleep every night,0.96655
4,Buy my first house,Buy my first home,0.958472


## Dependency parsing

In [409]:
def extract_root(phrase):
  """Return the lemma of the first word tagged as the dependency root.

  Runs the module-level `nlp` pipeline over `phrase` and scans the words of
  every sentence in order.

  Args:
    phrase: Text to parse.

  Returns:
    The lemma of the first word whose `deprel` is 'root', or None when no word
    carries that relation.
  """
  for sentence in nlp(phrase).sentences:
    for token in sentence.words:
      if token.deprel == 'root':
        return token.lemma
  return None

def extract_words(phrase):
  """Return every parsed word of `phrase` as one flat list.

  Args:
    phrase: Text to run through the module-level `nlp` pipeline.

  Returns:
    A list with the word objects of all sentences, in document order.
  """
  collected = []
  for sentence in nlp(phrase).sentences:
    collected.extend(sentence.words)
  return collected

In [438]:
def extract_important(phrase, asdict=False):
  """Pull the root, open clausal complements and quantities out of a phrase.

  The phrase is parsed with the module-level `nlp` pipeline and every word is
  classified by its dependency relation (`deprel`):

  * 'root'   -> candidate for `root` (only the first one is kept)
  * 'xcomp'  -> lemma appended to `xcomps`
  * 'nummod' -> a quantity: the numeric word's lemma plus the text of the word
                it modifies

  Args:
    phrase: Text to parse.
    asdict: When True return a dict keyed by 'root', 'xcomps' and
      'quantities'; otherwise return the same three values as a list in that
      order.

  Returns:
    `{'root': str | None, 'xcomps': list, 'quantities': list[dict]}` or the
    list of its values, depending on `asdict`.
  """
  roots = []
  xcomps = []
  quantities = []

  for sent in nlp(phrase).sentences:
    for word in sent.words:
      if word.deprel == 'root':
        roots.append(word.lemma)
      elif word.deprel == 'xcomp':
        xcomps.append(word.lemma)
      elif word.deprel == 'nummod':
        # `head` is a 1-based position inside *this* sentence, so it must be
        # resolved against `sent.words`, not against a list flattened across
        # all sentences (which picks the wrong word after the first sentence).
        quantities.append({
          'item': sent.words[word.head - 1].text,
          'amount': word.lemma
        })

  root = roots[0] if roots else None

  important = {
    'root': root,
    'xcomps': xcomps,
    'quantities': quantities
  }
  return important if asdict else list(important.values())

# Example: a phrase with two numeric modifiers, returned in dict form.
extract_important('Do 12 pushups every three days', asdict=True)

{'root': 'do',
 'xcomps': [],
 'quantities': [{'item': 'pushups', 'amount': '12'},
  {'item': 'days', 'amount': 'three'}]}

In [439]:
# Parse every goal and attach the extracted fields as three new columns.
important_columns = ['root', 'xcomps', 'quantities']
important_records = [extract_important(goal, asdict=True) for goal in goals]
goals_df[important_columns] = pd.DataFrame(
    important_records,
    index=goals_df.index,
    columns=important_columns,
)
goals_df

Unnamed: 0,Goal,Source,Style,Type,root,sentiment,sentiment_confidence,xcomps,quantities
0,Blog,,Practice,,blog,POSITIVE,0.788019,[],[]
1,Lose weight,,Lifestyle,,lose,NEGATIVE,0.999560,[],[]
2,Write a book,,Target,,write,POSITIVE,0.996259,[],[]
3,Stop procrastinating,,Lifestyle,,stop,NEGATIVE,0.998419,[],[]
4,Fall in love,,Target,,fall,POSITIVE,0.998757,[],[]
...,...,...,...,...,...,...,...,...,...
504,be a good friend,,,,friend,POSITIVE,0.999849,[],[]
505,hug mom,,,,hug,POSITIVE,0.999575,[],[]
506,bake more,,,,bake,NEGATIVE,0.985117,[],[]
507,be a better dog mom,,,,mom,NEGATIVE,0.980848,[],[]


In [400]:
def get_antonyms(word):
    """Collect the names of WordNet antonyms for `word`.

    Walks every lemma of every synset returned by `wordnet.synsets(word)` and
    gathers the names of all antonym lemmas attached to them.

    Args:
        word: The word to look up. None or an empty string yields an empty set
            without querying WordNet.

    Returns:
        A set of antonym lemma names (possibly empty).
    """
    # A missing word (e.g. a phrase with no detectable root) has no antonyms.
    if not word:
        return set()

    antonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            # Call antonyms() once per lemma and keep every entry rather than
            # only the first one in the list.
            antonyms.update(antonym.name() for antonym in lemma.antonyms())

    return antonyms

In [402]:
# Example lookup: antonyms found for 'gain'.
get_antonyms('gain')

{'fall_back', 'lose', 'loss', 'reduce'}

## Sentiment (currently unused)

In [347]:
# Classify each original goal string and store the values of the first
# prediction as two new columns.
sentiment_rows = [
    list(sentiment_classifier(goal_text)[0].values())
    for goal_text in goals_df['Goal']
]
goals_df[['sentiment', 'sentiment_confidence']] = pd.DataFrame(sentiment_rows, index=goals_df.index)
goals_df

Unnamed: 0,Goal,Source,Style,Type,root,sentiment,sentiment_confidence
0,Blog,,Practice,,blog,POSITIVE,0.788019
1,Lose weight,,Lifestyle,,lose,NEGATIVE,0.999560
2,Write a book,,Target,,write,POSITIVE,0.996259
3,Stop procrastinating,,Lifestyle,,stop,NEGATIVE,0.998419
4,Fall in love,,Target,,fall,POSITIVE,0.998757
...,...,...,...,...,...,...,...
504,be a good friend,,,,friend,POSITIVE,0.999849
505,hug mom,,,,hug,POSITIVE,0.999575
506,bake more,,,,bake,NEGATIVE,0.985117
507,be a better dog mom,,,,mom,NEGATIVE,0.980848


In [352]:
# `to_csv` does not create missing directories, so make sure the target
# folder exists before writing.
output_path = Path('data/goals_df.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
goals_df.to_csv(output_path, index=False)

## Most similar

In [462]:
def wv_similarity(word1, word2):
  """Return the word-vector similarity of two words, or 0.0 if unavailable.

  Args:
    word1: First word (may be None when no root was found for a goal).
    word2: Second word.

  Returns:
    Whatever `wv.similarity(word1, word2)` returns, or 0.0 when either key
    cannot be looked up.
  """
  try:
    return wv.similarity(word1, word2)
  except (KeyError, TypeError):
    # Only lookup failures are treated as "no similarity": a key missing from
    # the vocabulary (KeyError) or an unusable key type (TypeError). Anything
    # else -- including KeyboardInterrupt -- is allowed to propagate.
    return 0.0

def most_similar_goal(goal):
  """Find the stored goal whose embedding is closest to `goal`.

  Args:
    goal: Text of the goal to compare against the stored goals.

  Returns:
    A `(goal_text, similarity)` tuple for the best match, where `similarity`
    is the cosine similarity as a Python float.
  """
  goal_encoded = sentence_model.encode(goal, convert_to_tensor=True)
  similarities = util.cos_sim(goal_encoded, goal_embeddings)[0]

  # Reduce with the tensor's own methods instead of NumPy `argmax` and the
  # Python `max` builtin, so the tensor is not iterated element by element
  # and does not need converting to NumPy.
  best_position = similarities.argmax().item()

  # `best_position` is a row position in `goal_embeddings`, so look the goal
  # up by position rather than by index label.
  return (goals.iloc[best_position], similarities[best_position].item())


def most_similar_goals(goal):
  """Rank every stored goal by sentence-embedding similarity to `goal`.

  Args:
    goal: Text of the goal to compare against the stored goals.

  Returns:
    A DataFrame sorted by descending `similarities` with the columns:
      * goals             -- the stored goal text
      * similarities      -- cosine similarity of the sentence embeddings
      * root_similarities -- `wv_similarity` between the root of `goal` and
                             each stored goal's root
      * has_root_antonym  -- True where the stored root is one of the
                             antonyms returned for the root of `goal`
  """
  goal_encoded = sentence_model.encode(goal, convert_to_tensor=True)
  similarities = util.cos_sim(goal_encoded, goal_embeddings)[0]

  root = extract_root(goal)
  # Carry goals_df's index so the columns line up even when it is not 0..n-1.
  root_similarities = pd.Series(
    [wv_similarity(root, r) for r in goals_df['root']],
    index=goals_df.index,
  )
  # No root (e.g. an empty phrase) means there is nothing to look antonyms up for.
  root_antonyms = get_antonyms(root) if root else set()
  has_root_antonym = goals_df['root'].isin(root_antonyms)

  return pd.DataFrame({
    "goals": goals,
    # Hand pandas a NumPy array rather than a raw tensor.
    "similarities": similarities.detach().cpu().numpy(),
    "root_similarities": root_similarities,
    "has_root_antonym": has_root_antonym,
  }).sort_values(by='similarities', ignore_index=True, ascending=False)

In [465]:
# Example query: the 20 stored goals with the highest sentence similarity.
most_similar_goals('start going to the gym').head(20)

Unnamed: 0,goals,similarities,root_similarities,has_root_antonym
0,hit the gym,0.894069,0.512382,False
1,Go to the gym every day,0.893398,0.656043,False
2,Go to the gym 3 times a week,0.851059,0.656043,False
3,join a gym,0.695223,0.547596,False
4,Exercise more,0.611583,0.195502,False
5,exercise more,0.611583,0.459847,False
6,Exercise regularly,0.59497,0.195502,False
7,Exercise regularly,0.59497,0.195502,False
8,Get Regular Exercise,0.576466,0.603532,False
9,Get in shape,0.574967,0.603532,False
