In [1]:
!pip install umap --quiet
!pip install hdbscan --quiet
!pip install transformers --quiet
!pip install sentence-transformers --quiet

In [2]:
import numpy as np
import pandas as pd
import umap
import hdbscan
import ipywidgets as widgets
import pprint

from sentence_transformers import SentenceTransformer, util
import torch

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('punkt')
from nltk import tokenize

import datetime as datetime
import os

import transformers


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import transformers

***Pre-Trained Models List***

In [4]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

***Semantic Textual Similarity***

In [5]:
group_1 = pd.read_csv('/content/300_TRANSCRIPT.csv')
group_2 = pd.read_csv('/content/301_TRANSCRIPT.csv')

In [6]:
group_1.head()

Unnamed: 0,start_time\tstop_time\tspeaker\tvalue
0,36.588\t39.668\tEllie\thi i'm ellie thanks for...
1,39.888\t43.378\tEllie\ti was created to talk t...
2,43.728\t48.498\tEllie\tthink of me as a friend...
3,49.188\t52.388\tEllie\ti'm here to learn about...
4,52.658\t58.958\tEllie\ti'll ask a few question...


In [7]:
group_2.head()

Unnamed: 0,start_time\tstop_time\tspeaker\tvalue
0,29.428\t35.888\tEllie\thi i'm ellie thanks for...
1,32.738\t33.068\tParticipant\tthank you
2,36.598\t40.948\tEllie\tthink of me as a friend...
3,42.088\t42.518\tParticipant\tmmm k
4,42.358\t51.738\tEllie\ti'm here to learn about...


In [8]:
list1 = group_1.values.tolist()
list2 = group_2.values.tolist()

In [9]:
print(list1)

[["36.588\t39.668\tEllie\thi i'm ellie thanks for coming in today"], ['39.888\t43.378\tEllie\ti was created to talk to people in a safe and secure environment'], ["43.728\t48.498\tEllie\tthink of me as a friend i don't judge i can't i'm a computer"], ["49.188\t52.388\tEllie\ti'm here to learn about people and would love to learn about you"], ["52.658\t58.958\tEllie\ti'll ask a few questions to get us started and please feel free to tell me anything your answers are totally confidential"], ['60.028\t61.378\tEllie\thow are you doing today'], ['62.328\t63.178\tParticipant\tgood'], ["63.798\t64.738\tEllie\tthat's good"], ['65.858\t67.528\tEllie\twhere are you from originally'], ['68.978\t70.288\tParticipant\tatlanta georgia'], ['70.978\t71.868\tEllie\treally'], ["72.788\t74.198\tEllie\twhy'd you move to l_a"], ['75.028\t78.128\tParticipant\tum my parents are from here um'], ['82.218\t83.578\tEllie\thow do you like l_a'], ['83.808\t84.588\tParticipant\ti love it'], ['85.558\t87.898\tEllie\t

In [10]:
print(list2)

[["29.428\t35.888\tEllie\thi i'm ellie thanks for coming in today i was created to talk to people in a safe and secure environment"], ['32.738\t33.068\tParticipant\tthank you'], ["36.598\t40.948\tEllie\tthink of me as a friend i don't judge i can't i'm a computer"], ['42.088\t42.518\tParticipant\tmmm k'], ["42.358\t51.738\tEllie\ti'm here to learn about people and would love to learn about you i'll ask a few questions to get us started and please feel free to tell me anything your answers are totally confidential"], ['52.928\t53.868\tEllie\thow are you doing today'], ["54.328\t55.758\tParticipant\ti'm doing good thank you"], ["55.938\t56.378\tEllie\tthat's good"], ['58.308\t59.518\tEllie\twhere are you from originally'], ["59.858\t60.948\tParticipant\ti'm from los angeles"], ['61.718\t63.048\tEllie\treally me too'], ['63.538\t64.108\tParticipant\toh great'], ['66.308\t67.028\tEllie\twhere do you live'], ['67.388\t69.858\tParticipant\ti live in west los angeles the west side'], ['70.828

In [11]:
list1 = str(list1)
list2 = str(list2)

In [12]:
list1 = [i.replace("\\", "") for i in list1 ]
list2 = [i.replace("\\", "") for i in list2 ]

In [13]:
list1 = "".join(list1)
list2 = "".join(list2)

In [14]:
print(list1)
print(list2)

[["36.588t39.668tElliethi i'm ellie thanks for coming in today"], ['39.888t43.378tEllieti was created to talk to people in a safe and secure environment'], ["43.728t48.498tEllietthink of me as a friend i don't judge i can't i'm a computer"], ["49.188t52.388tEllieti'm here to learn about people and would love to learn about you"], ["52.658t58.958tEllieti'll ask a few questions to get us started and please feel free to tell me anything your answers are totally confidential"], ['60.028t61.378tElliethow are you doing today'], ['62.328t63.178tParticipanttgood'], ["63.798t64.738tEllietthat's good"], ['65.858t67.528tEllietwhere are you from originally'], ['68.978t70.288tParticipanttatlanta georgia'], ['70.978t71.868tEllietreally'], ["72.788t74.198tEllietwhy'd you move to l_a"], ['75.028t78.128tParticipanttum my parents are from here um'], ['82.218t83.578tElliethow do you like l_a'], ['83.808t84.588tParticipantti love it'], ['85.558t87.898tEllietwhat are some things you really like about l_a']

In [15]:
import re

In [16]:
list1 = re.sub(r'[0-9]+', '', list1)
list2 = re.sub(r'[0-9]+', '', list2)

In [17]:
list1 = list1.split('.t.')
list2 = list2.split('.t.')

In [18]:
list1 = [item.replace("tParticipantt", " ") for item in list1]
list2 = [item.replace("tParticipantt", " ") for item in list2]

In [19]:
list1 = [item.replace("tElliet", " ") for item in list1]
list2 = [item.replace("tElliet", " ") for item in list2]

In [20]:
print(list1)
print(list2)

['[["', ' hi i\'m ellie thanks for coming in today"], [\'', ' i was created to talk to people in a safe and secure environment\'], ["', ' think of me as a friend i don\'t judge i can\'t i\'m a computer"], ["', ' i\'m here to learn about people and would love to learn about you"], ["', ' i\'ll ask a few questions to get us started and please feel free to tell me anything your answers are totally confidential"], [\'', " how are you doing today'], ['", ' good\'], ["', ' that\'s good"], [\'', " where are you from originally'], ['", " atlanta georgia'], ['", ' really\'], ["', ' why\'d you move to l_a"], [\'', " um my parents are from here um'], ['", " how do you like l_a'], ['", " i love it'], ['", " what are some things you really like about l_a'], ['", " i like the weather'], ['", " i like the opportunities'], ['", " um'], ['", " yes'], ['", " how easy was it for you to get used to living in l_a'], ['", " um'], ['", " it took a minute'], ['", ' somewhat easy\'], ["', ' what are some thing

In [21]:
pp = pprint.PrettyPrinter(indent=4)

In [22]:
embeddings1 = model.encode(list1, convert_to_tensor=True)
embeddings2 = model.encode(list2, convert_to_tensor=True)

cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

for i in range(len(list1)):
  pp.pprint("{}  {}  Score: {:.4f}".format(list1[i], list2[i], cosine_scores[i][i])) 


'[["  [["  Score: 1.0000'
(' hi i\'m ellie thanks for coming in today"], [\'   hi i\'m ellie thanks for '
 'coming in today i was created to talk to people in a safe and secure '
 'environment"], [\'  Score: 0.7891')
(' i was created to talk to people in a safe and secure environment\'], ["   '
 'thank you\'], ["  Score: 0.4386')
(' think of me as a friend i don\'t judge i can\'t i\'m a computer"], ["   '
 'think of me as a friend i don\'t judge i can\'t i\'m a computer"], [\'  '
 'Score: 0.9873')
(' i\'m here to learn about people and would love to learn about you"], ["   '
 'mmm k\'], ["  Score: 0.5092')
(" i'll ask a few questions to get us started and please feel free to tell me "
 'anything your answers are totally confidential"], [\'   i\'m here to learn '
 "about people and would love to learn about you i'll ask a few questions to "
 'get us started and please feel free to tell me anything your answers are '
 'totally confidential"], [\'  Score: 0.7153')
(' how are you doing tod

***Using Unbiased Embeddings***

In [None]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
text1 = list1
text2 = list2
encoded_input1 = tokenizer(text1, return_tensors='pt', padding=True, truncation=True)
encoded_input2 = tokenizer(text2, return_tensors='pt', padding=True, truncation=True)
output1 = model(**encoded_input1)
#output2 = model(**encoded_input2)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
pp.pprint(output1)

In [None]:
#pp.pprint(output2)