In [2]:
import abc
from itertools import accumulate
import json
import gc
import time
from typing import NamedTuple

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn.functional as F
from transformers import (
    AutoModel,
    AutoTokenizer,
    BitsAndBytesConfig,
)

In [4]:
# Runtime configuration for the notebook.
# Prefer Apple's Metal backend (the originally hard-coded device), falling back
# to CUDA and then the CPU so the notebook also runs on machines without MPS.
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
BATCH_SIZE = 2
# Model identifier handed to `from_pretrained` by the loader functions.
MODEL_ID = "jinaai/jina-embeddings-v2-base-en"

In [5]:
def batched(xs: list, batch_size: int) -> list[list]:
    """Split `xs` into consecutive slices of at most `batch_size` items.

    The final slice is shorter when `len(xs)` is not a multiple of
    `batch_size`; an empty `xs` yields an empty list.
    """
    chunks = []
    for start in range(0, len(xs), batch_size):
        stop = start + batch_size
        chunks.append(xs[start:stop])
    return chunks

In [6]:
def get_source_doc_lines(
        path: str = "./zoom_earnings_call_2024_q3.txt",
) -> list[str]:
    """Read a text file and return its non-blank lines, whitespace-stripped.

    Args:
        path: File to read. Defaults to the transcript this notebook analyses.

    Returns:
        One string per line of the file, with leading/trailing whitespace
        removed and lines that are empty after stripping dropped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        contents = f.read()
    # Strip each line once, then filter on the stripped value.
    stripped_lines = (line.strip() for line in contents.split("\n"))
    return [line for line in stripped_lines if line]

In [7]:
def load_model():
    """Load the `MODEL_ID` checkpoint through `AutoModel.from_pretrained`.

    Passes `trust_remote_code=True` and `device_map=DEVICE` to the loader.
    """
    load_kwargs = {
        "trust_remote_code": True,
        "device_map": DEVICE,
    }
    return AutoModel.from_pretrained(MODEL_ID, **load_kwargs)

def load_tokenizer():
    """Load the tokenizer for `MODEL_ID` through `AutoTokenizer.from_pretrained`.

    Passes `trust_remote_code=True` to the loader.
    """
    load_kwargs = {"trust_remote_code": True}
    return AutoTokenizer.from_pretrained(MODEL_ID, **load_kwargs)

In [8]:
# Load the embedding model once; later cells call this global directly and
# through its `encode` method.
model = load_model()

In [9]:
# Load the matching tokenizer once; the embedding cells below use this global.
tokenizer = load_tokenizer()

In [10]:
# Sliding-window configuration, measured in source-document lines.
WINDOW_LINES = 10  # maximum number of lines per window
WINDOW_OVERLAP = 1  # lines a window shares with the one before it
WINDOW_STEP_SIZE = WINDOW_LINES - WINDOW_OVERLAP  # distance between window starts

In [12]:
# Non-blank, stripped lines of the source document; a line's position in this
# list is the "line #" reported in the search results.
lines = get_source_doc_lines()

# Pooled: embed multi-line windows, then mean-pool token embeddings per line

In [15]:
def mean_pool(
        *,
        all_embeddings: torch.Tensor, # 2-d
        line_lengths: list[int],
) -> list[torch.Tensor]:
    """Average a chunk's token embeddings into one vector per line.

    The first and last rows of `all_embeddings` are treated as boundary tokens.
    The rows between them are split into consecutive runs whose sizes are given
    by `line_lengths`. The first row is then folded into the first run and the
    last row into the last run, and every run is averaged over its rows.

    Args:
        all_embeddings: 2-d tensor with one row per token, including the two
            boundary rows.
        line_lengths: Number of non-boundary tokens belonging to each line, in
            order. Must sum to `all_embeddings.shape[0] - 2`.

    Returns:
        One 1-d tensor per entry of `line_lengths`; an empty list when
        `line_lengths` is empty.

    Raises:
        AssertionError: If `all_embeddings` is not 2-d, or the token counts do
            not add up.
    """
    assert len(all_embeddings.shape) == 2, "Expected a 2-d tensor"
    content_token_count = all_embeddings.shape[0] - 2
    assert content_token_count == sum(line_lengths), (
        f"Expected {content_token_count} content tokens "
        f"(rows minus the two boundary tokens), got {sum(line_lengths)}"
    )
    if not line_lengths:
        # Nothing to pool; indexing pool_groups[0] below would fail.
        return []
    line_offsets = [0, *accumulate(line_lengths)]
    meaningful_embeddings = all_embeddings[1:-1]
    pool_groups = [
        meaningful_embeddings[lo:hi]
        for lo, hi in zip(line_offsets, line_offsets[1:])
    ]
    # Fold the leading boundary row into the first line and the trailing
    # boundary row into the last line so both contribute to a mean.
    pool_groups[0] = torch.cat((pool_groups[0], all_embeddings[:1]))
    pool_groups[-1] = torch.cat((pool_groups[-1], all_embeddings[-1:]))
    return [
        group.mean(dim=0)
        for group in pool_groups
    ]

In [16]:
# Slice the document into windows of up to WINDOW_LINES lines, advancing by
# WINDOW_STEP_SIZE, so each window after the first begins with the final
# WINDOW_OVERLAP line(s) of the window before it.
line_groups = [
    lines[i:i+WINDOW_LINES]
    for i in range(0, len(lines), WINDOW_STEP_SIZE)
]
# Bare expression: shows the number of windows as the cell output.
len(line_groups)

31

In [46]:
# One embedding per source line, built by embedding each multi-line window as a
# single sequence and mean-pooling its token embeddings per line.
pooled_line_embeddings: list[torch.Tensor] = []

# Inference only: no gradients are needed for any window.
with torch.no_grad():
    for i, line_group in enumerate(line_groups):
        complete_chunk_tokenized = tokenizer(
            "\n".join(line_group),
            return_tensors="pt"
        ).to(DEVICE)
        # Token count of each line when tokenized on its own, used to split
        # the window's token embeddings back into lines. mean_pool asserts
        # that these counts add up to the window's token count.
        line_lengths = [
            len(tokenizer(line)["input_ids"]) - 2 # account for '[CLS]' and '[SEP]'
            for line in line_group
        ]
        all_token_embeddings = model(
            **complete_chunk_tokenized
        )["last_hidden_state"].squeeze(0) # drop only the batch dimension
        line_group_line_embeddings = mean_pool(
            all_embeddings=all_token_embeddings,
            line_lengths=line_lengths,
        )
        # The first window contributes every line; later windows start with
        # WINDOW_OVERLAP line(s) the previous window already contributed.
        first_new_line = 0 if i == 0 else WINDOW_OVERLAP
        pooled_line_embeddings.extend(line_group_line_embeddings[first_new_line:])

# Unpooled: embed each line on its own

In [30]:
# One embedding per source line, each line embedded on its own with no
# surrounding context.
unpooled_line_embeddings: list[torch.Tensor] = []

# Inference only: no gradients are needed for any line.
with torch.no_grad():
    for i, line in enumerate(lines):
        tokenized_line = tokenizer(
            line,
            return_tensors="pt",
        ).to(DEVICE)
        # NOTE(review): this uses the model's "pooler_output", while the
        # pooled section averages "last_hidden_state" rows and queries are
        # embedded with model.encode. Confirm all three live in the same
        # embedding space, otherwise the comparison is skewed.
        line_embedding = model(
            **tokenized_line
        )["pooler_output"].squeeze(0) # drop only the batch dimension
        unpooled_line_embeddings.append(line_embedding)

In [39]:
class EmbeddedLine(NamedTuple):
    """A source-document line together with its embedding.

    Fields:
        line_number: Index of the line within the source document's lines.
        contents: Text of the line.
        embedding: Embedding vector for the line.
    """
    line_number: int
    contents: str
    # EmbeddingSearcher stores the torch tensors it is given here, not lists.
    embedding: torch.Tensor


class EmbeddingSearcher:
    def __init__(
            self,
            embedded_lines: list[torch.Tensor],
    ):
        self.line_embeddings = [
            np.array(e.to("cpu"))
            for e in embedded_lines
        ]
        self.lines = {
            i: EmbeddedLine(
                line_number=i,
                embedding=e,
                contents=lines[i],
            )
            for i, e in enumerate(embedded_lines)
        }
        return

    def _find_nearest_line_indexes(self, query: str, k: int) -> list[int]:
        # could do this once per query...
        query_embed = model.encode([query])
        similarities = cosine_similarity(
            query_embed,
            self.line_embeddings,
        ).squeeze()
        top = similarities.argsort()[-k:]
        return list(reversed(top.tolist()))

    def search(self, query: str, k: int = 4) -> list[EmbeddedLine]:
        indexes = self._find_nearest_line_indexes(query, k=k)
        return [
            self.lines[i]
            for i in indexes
        ]

    pass

In [40]:
# Build one searcher per embedding strategy so the same query can be run
# against both and the results compared.
pooled_searcher = EmbeddingSearcher(pooled_line_embeddings)
unpooled_searcher = EmbeddingSearcher(unpooled_line_embeddings)

  np.array(e.to("cpu"))


In [41]:
def _print_search_results(results) -> None:
    """Print the line number and contents of each search result."""
    for result in results:
        print("line #", result.line_number)
        print(result.contents)


def compare_search_results(query: str, k: int = 3):
    """Print the top-`k` matches for `query` from both searchers.

    Runs the query against the notebook-level `unpooled_searcher` and
    `pooled_searcher`, then prints the unpooled results followed by the pooled
    results.

    Args:
        query: Natural-language question to search for.
        k: Number of results to show per searcher.
    """
    unpooled_results = unpooled_searcher.search(query, k=k)
    pooled_results = pooled_searcher.search(query, k=k)

    print("UNPOOLED RESULTS:")
    _print_search_results(unpooled_results)

    print("\n")

    print("POOLED RESULTS:")
    _print_search_results(pooled_results)


In [42]:
# Compare both searchers on a question about who attended the call.
compare_search_results("Who were the attendees on the call?")

Encoding:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding:   0%|          | 0/1 [00:00<?, ?it/s]

UNPOOLED RESULTS:
line # 256
So, I'm just curious how many customers are showing that interest? What kind of scenarios they can design and, therefore, maybe how to think through the monetization potential at that price point?
line # 57
And our first question will come from Meta Marshall with Morgan Stanley.
line # 113
But as Eric and I began our conversations over the interview process, I got more and more excited about where I saw Zoom going to an AI-first platform company and could see a lot of the seeds, if you will, of growth being planted and starting to come to fruition. So, got very excited about that. And maybe, you know, my learning sense has been delightful, honestly, to see the customer love and the pace of innovation. I think you'd heard about it before, but to be among it, I think, has been a delight.


POOLED RESULTS:
line # 18
Kelcey McKinley -- Event Consultant
line # 16
Mark Murphy -- Analyst
line # 11
Siti Panigrahi -- Analyst


In [43]:
# Compare both searchers on a question about the challenges facing Zoom.
compare_search_results("What are the major challenges facing Zoom?")

Encoding:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding:   0%|          | 0/1 [00:00<?, ?it/s]

UNPOOLED RESULTS:
line # 50
Operating cash flow and free cash flow margins in the quarter were 41% and 38.9%, respectively. We ended the quarter with approximately $7.7 billion in cash, cash equivalents, and marketable securities, excluding restricted cash. Under the pre-existing $1.5 billion share buy-back plan, in Q3, we purchased 4.4 million shares for $302 million, increasing our repurchases quarter over quarter by $14 million. And at the end of Q3, we repurchased 11.6 million shares for $739 million.
line # 221
OK.
line # 142
OK.


POOLED RESULTS:
line # 207
We want to build a long-term trust. Given some time, the customer realized Zoom not only very stable ease of use, and also, we introduced more and more services, they would like to consolidate it into the Zoom platform. From that perspective, I think more opportunities for us to monetize as a platform player and in order to mention AI as well. So, that's our strategy.
line # 201
So, you really create an iconic brand with Zoom 

In [44]:
# Compare both searchers on a question about Zoom's growth opportunities.
compare_search_results("What are the major opportunities for growth available to Zoom?")

Encoding:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding:   0%|          | 0/1 [00:00<?, ?it/s]

UNPOOLED RESULTS:
line # 256
So, I'm just curious how many customers are showing that interest? What kind of scenarios they can design and, therefore, maybe how to think through the monetization potential at that price point?
line # 217
I know Meta announced that it would be sunsetting the Meta Workplace product, and it would be in stages over '25 and '26 in the -- to you customers toward Workvivo. Can you guys help us think about how you're thinking about that ramp? You've had good momentum there. Customers grew 72% year over year. Can you quantify maybe how much of the growth is coming from Meta and how we should think about that momentum going forward?
line # 74
I'm curious, where are the budgets for AI coming from? Is it from a separate pool from your customers? Or are they taking the budgets out of budgets -- or AI money out of budgets that were designed for Zoom? And also, a follow-up question on the macro, a tone of customer conversations post the elections, do you sense that th