In [1]:
import pandas as pd
from gcsfs import GCSFileSystem

# Store the current dataset version
GCP_PROJECT_ID = "sem-nav-eva-005"
DATASET_VERSION = "2023-05-02"

# Create a GCS File system with anon creds
FS = GCSFileSystem(GCP_PROJECT_ID, token="anon")

# Read the dataset from remote
DATASET = pd.read_parquet(
    f"gs://{GCP_PROJECT_ID}/{DATASET_VERSION}/dataset.parquet",
    storage_options={"token": "anon"},
)
DATASET.columns

Index(['chunk_id', 'event_id', 'start_time', 'session_id', 'session_index',
       'session_datetime', 'chunk_storage_path', 'embedding'],
      dtype='object')

In [2]:
from sentence_transformers import SentenceTransformer
from semantic_navigator.constants import EMBEDDING_MODEL_NAME

# Load the embedding model
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Embed the query
query_embedding = model.encode("defund the police")
query_embedding.shape

  from .autonotebook import tqdm as notebook_tqdm


(384,)

In [3]:
from sentence_transformers.util import cos_sim

# Get similarity of query against all docs
DATASET["similarity"] = DATASET.embedding.apply(lambda e: cos_sim(e, query_embedding).item())

# Get top 10 most similar text chunks
TOP_10_START = DATASET.sort_values(by="similarity", ascending=False)[:10]
TOP_10_START[["chunk_id", "similarity"]]

Unnamed: 0,chunk_id,similarity
4002,c52cabcf-389b-4222-ab25-b42bfae70c72,0.734023
6482,28798b41-7a0d-42ce-8dae-8439125a676f,0.688331
6483,dd2e8dbf-898a-4bf7-be05-a6d085b637e9,0.611672
3652,f672510a-a0ff-47cb-ac95-1180e6086198,0.595868
3913,9fc0fb64-ff33-4410-9ae5-32d9cde71744,0.593337
6481,93aa6037-a12a-4db4-99bb-420eee091fbf,0.589277
7644,103d0965-e88a-4f61-b475-bfdf3cf3c2ca,0.578662
7575,472d644b-7e45-427e-a70e-121da305f645,0.556719
5697,97911f1f-c84e-4d18-9865-df44c89a9687,0.537277
2072,8a0bd951-0a92-4905-bb1f-5af003711477,0.535628


In [4]:
# Read the text for these examples
for _, row in TOP_10_START.iterrows():
    # Read the text
    with FS.open(row.chunk_storage_path, "r") as open_f:
        text = open_f.read()
    
    print(f"Chunk Id '{row.chunk_id}':")
    print(f"\t'{text}'")
    print()
    print("-" * 40)
    print()

Chunk Id 'c52cabcf-389b-4222-ab25-b42bfae70c72':
	'So as we discuss what defunding the police department really means, because let's be honest, we haven't asked ourselves these questions yet, we should ask what do the police do all day? And if what they do is respond to people dealing with mental illness, to people experiencing homelessness, to young people who are hanging out, then we have a problem. The police shouldn't be responding to those calls. It's not their job. They aren't trained social workers. And these things are not crimes. So we really need to build community safety in a way that doesn't center police. And when we say defund the police, you know, what we mean is shrink their responsibilities and shrink their funding. In organizing spaces that really focus on disruption, and I will say many of those organizations come from my district, so, you know, there is an important guiding principle to that work, which is that we spend about 20% of our time dismantling the current 

In [5]:
# My positive and negative annotations
# a list of positives and a list of negatives
# the items in each list are the indices of the items

# For the sake of this example, I am selected texts as positive if I think they were public commments
# rather than comments / discussion from councilmembers themselves
positives = [
    "dd2e8dbf-898a-4bf7-be05-a6d085b637e9",
    "f672510a-a0ff-47cb-ac95-1180e6086198",
    "9fc0fb64-ff33-4410-9ae5-32d9cde71744",
    "472d644b-7e45-427e-a70e-121da305f645",
    "8a0bd951-0a92-4905-bb1f-5af003711477",
]

negatives = [
    "c52cabcf-389b-4222-ab25-b42bfae70c72",
    "28798b41-7a0d-42ce-8dae-8439125a676f",
    "93aa6037-a12a-4db4-99bb-420eee091fbf",
    "103d0965-e88a-4f61-b475-bfdf3cf3c2ca",
    "97911f1f-c84e-4d18-9865-df44c89a9687",
]

In [6]:
import numpy as np
from sklearn.linear_model import LogisticRegression

# set random seed for reproducibility
# and set hyper parameters for random negative examples
np.random.seed(0)
n_negative_examples = 400

# using the stored chunk ids, pull the positive embeddings
positive_embeddings = np.stack(DATASET.loc[DATASET.chunk_id.isin(positives)].embedding)

# using the stored chunk ids, pull the negative embeddings
negative_embeddings = np.stack(DATASET.loc[DATASET.chunk_id.isin(negatives)].embedding)

# randomly draw embeddings to be additional negative examples
random_embeddings_for_negative = np.stack(DATASET.sample(n_negative_examples).embedding)

# technically the user could have not given any negative examples
# so safety check, "should these embeddings be combined or not"
if len(negative_embeddings) > 0:
    complete_negative_embeddings = np.concatenate(
        [negative_embeddings, random_embeddings_for_negative],
        axis=0,
    )
else:
    complete_negative_embeddings = random_embeddings_for_negative

# Construct training data
train_embeddings = np.concatenate((positive_embeddings, complete_negative_embeddings), axis=0)
train_labels = np.concatenate((
    # positives are 1
    np.ones(len(positive_embeddings)),
    # negatives are 0
    np.zeros(len(complete_negative_embeddings)),
))

# Create classifier
clf = LogisticRegression(class_weight="balanced", random_state=1, max_iter=100000)

# Fit the model
clf.fit(train_embeddings, train_labels)

In [7]:
# Get a new dataset with the prior examples (already annotated) removed
THE_REST = DATASET.sort_values(by="similarity", ascending=False)[10:]

# Generate probabilities to use for next annotation cycle
predictions = clf.predict_proba(np.stack(THE_REST.embedding))
predictions.shape

(8272, 2)

In [8]:
# It gives a probability for both negative and positive -- in that order
# We really only care about probability of positive
proba_positive = predictions[:,1]

# Attach these back to the dataframe for sorting and selection again
THE_REST["proba"] = proba_positive
NEW_TOP_10_SELECTION = THE_REST.sort_values(by="proba", ascending=False)[:10]
NEW_TOP_10_SELECTION[["chunk_id", "proba"]]

Unnamed: 0,chunk_id,proba
1927,37de3ce5-be66-4e9d-ae3d-cf834bbfb3c7,0.74997
3964,83509246-6762-4d28-a521-2adba4403732,0.744081
2068,522fde6c-33b9-41fb-8e47-b4d7c0cda3bf,0.666073
4746,2279307e-bb84-432a-88d5-355c89523af3,0.648679
1940,79e6d909-8dd8-4571-9da7-83f6ca511c8c,0.633587
1922,66708a98-03b6-4620-b661-ec188d5cc2ad,0.629224
6381,a0eb4fae-6f18-4d76-9e81-6155f7a9e5d4,0.61364
5703,30699f70-852a-4f03-936e-bf85ef822708,0.601795
5065,b940d5ea-6a54-4418-af7e-6a2e5eedee3a,0.591204
3931,3360311e-4e79-48d7-ab9a-7a87109cdcd2,0.58874


In [9]:
# Read the text for these examples
for _, row in NEW_TOP_10_SELECTION.iterrows():
    # Read the text
    with FS.open(row.chunk_storage_path, "r") as open_f:
        text = open_f.read()
    
    print(f"Chunk Id '{row.chunk_id}':")
    print(f"\t'{text}'")
    print()
    print("-" * 40)
    print()

Chunk Id '37de3ce5-be66-4e9d-ae3d-cf834bbfb3c7':
	'But now there's talk of leaving loopholes in the ban, putting us right back where we started. If the council is committed to taking responsibility for the damage done to all the people who stood up for the serious and real injustice they saw within their community, they need to uphold the ban with no loopholes, no pepper spray, no rubber bullets, no tear gas. Leaving the use of these less lethal weapons up to police officers' discretion is clearly not working, which is why we need real police accountability, a real, like, community oversight board over the police. Thank you for your time. Thank you. Our next speaker is Sarah Gonzer, followed by Margo Stewart. Sarah. Hi. Yes. My name is Sarah, and I'm a renter in District 3. I'm also calling about the chemical weapons and so-called crowd control bill, and to talk about why it's so incredibly exciting, actually, that Seattle was the first in the country to put a ban on these violent so-c

In [10]:
# Storing for later
# # Construct sample weights to attach during training
# sample_weight = np.ones(len(positive_embeddings) + len(negative_embeddings))
# # True positives + true negatives are given the same weight
# sample_weight[:len(positive_embeddings) + len(negative_embeddings)] = 10
# # Random sample negatives are given a smaller weight
# # this is randomly selected
# sample_weight[-len(complete_negative_embeddings):] = 6