This notebook computes semantic similarity between TensorFlow and PyTorch API documentation using BERT embeddings.

**Workflow:**
1. Load API documentation from specified paths
2. Preprocess text data
3. Generate BERT embeddings
4. Compute cross-framework similarity scores
5. Identify and display most similar API pairs

## 1. Install and Import Dependencies

In [None]:
%pip install transformers torch 'numpy<2' pandas

import numpy as np
import torch

# Set seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

## 2. Data Loading and Preprocessing

In [2]:
import os
import ast
import textwrap


class APIVisitor(ast.NodeVisitor):
    def __init__(self):
        self.apis = []
        self.current_class = None

    def visit_FunctionDef(self, node):
        if node.name.startswith("_"):
            return

        func_doc = ast.get_docstring(node)

        if func_doc is None:
            return

        prefix = self.current_class + "." if self.current_class else ""

        self.apis.append(
            {
                "name": prefix + node.name,
                "doc": func_doc.strip(),
                "type": "method" if self.current_class else "function",
            }
        )

        self.generic_visit(node)


def extract_api_info(source_path):
    with open(source_path, "r", encoding="utf-8") as f:
        try:
            tree = ast.parse(f.read())
        except SyntaxError:
            return []

    visitor = APIVisitor()
    visitor.visit(tree)

    return visitor.apis


def scan_project(project_root, project_name):
    all_apis = []

    for root, _, files in os.walk(project_root):
        for file in files:
            if file.endswith(".py") and not file.startswith("_"):
                file_path = os.path.join(root, file)
                rel_path = os.path.relpath(file_path, project_root)

                module_path = rel_path.replace("/", ".").replace("\\", ".")[:-3]

                for api in extract_api_info(file_path):
                    full_name = (
                        f"{module_path}.{api['name']}"
                        if module_path != "."
                        else api["name"]
                    )
                    all_apis.append(
                        {
                            "project": project_name,
                            "name": full_name,
                            "content": textwrap.dedent(api["doc"]).strip(),
                            "type": api["type"],
                        }
                    )

    return all_apis

In [3]:
# Load documentation
tf_docs = scan_project("data/tensorflow-2.17.0", "tensorflow")
torch_docs = scan_project("data/pytorch-2.4.0", "pytorch")

print(f"Loaded {len(tf_docs)} TensorFlow APIs")
print(f"Loaded {len(torch_docs)} PyTorch APIs")


Loaded 7636 TensorFlow APIs
Loaded 3685 PyTorch APIs


## 3. Text Preprocessing

In [4]:
def preprocess_text(text):
    # Remove code blocks
    text = text.replace("```", "")
    # Collapse whitespace
    text = " ".join(text.split())
    # Truncate to first 2000 characters to maintain context
    return text[:2000]


# Preprocess all documents
for doc in tf_docs + torch_docs:
    doc["processed_text"] = preprocess_text(doc["content"])

## 4. BERT Embedding Generation

In [5]:
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 64 if device.type == "cuda" else 16

# Initialize BERT model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model.to(device)

# Disable parallelism to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def get_bert_embeddings(texts):
    embeddings = []

    with tqdm(total=len(texts), desc="Generating BERT embeddings") as pbar:
        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            inputs.to(device)

            with torch.no_grad():
                outputs = model(**inputs)

            # Use mean pooling of last hidden states
            hidden_states = outputs.last_hidden_state.mean(dim=1)
            batch_embeddings = hidden_states.to("cpu").numpy()
            embeddings.append(batch_embeddings)
            pbar.update(len(batch))

    return np.vstack(embeddings)


# Generate embeddings for both frameworks
tf_embeddings = get_bert_embeddings([doc["processed_text"] for doc in tf_docs])
torch_embeddings = get_bert_embeddings([doc["processed_text"] for doc in torch_docs])

print(f"TensorFlow embedding matrix shape: {tf_embeddings.shape}")
print(f"PyTorch embedding matrix shape: {torch_embeddings.shape}")

Generating BERT embeddings: 100%|██████████| 7636/7636 [00:28<00:00, 270.97it/s]
Generating BERT embeddings: 100%|██████████| 3685/3685 [00:12<00:00, 303.87it/s]

TensorFlow embedding matrix shape: (7636, 768)
PyTorch embedding matrix shape: (3685, 768)





## 5. Cross-Framework Similarity Analysis

In [6]:
def compute_cross_similarity(matrix_a, matrix_b):
    """Compute pairwise cosine similarity between two embedding matrices

    Args:
        matrix_a (np.ndarray): N x D embedding matrix
        matrix_b (np.ndarray): M x D embedding matrix

    Returns:
        np.ndarray: N x M similarity matrix
    """
    # Normalize embeddings
    matrix_a_norm = matrix_a / np.linalg.norm(matrix_a, axis=1, keepdims=True)
    matrix_b_norm = matrix_b / np.linalg.norm(matrix_b, axis=1, keepdims=True)

    return np.dot(matrix_a_norm, matrix_b_norm.T)


# Compute similarity matrix
similarity_matrix = compute_cross_similarity(tf_embeddings, torch_embeddings)
print(f"Similarity matrix shape: {similarity_matrix.shape}")

Similarity matrix shape: (7636, 3685)


## 6. Identify Top Similar API Pairs

In [None]:
import pandas as pd


def get_top_matches(sim_matrix, tf_docs, torch_docs):
    matches = []
    rows, cols = sim_matrix.shape

    for i in range(rows):
        for j in range(cols):
            matches.append(
                {
                    "tf_api": tf_docs[i]["name"],
                    "tf_doc": tf_docs[i]["processed_text"],
                    "pytorch_api": torch_docs[j]["name"],
                    "pytorch_doc": torch_docs[j]["processed_text"],
                    "similarity": sim_matrix[i, j],
                }
            )

    df = pd.DataFrame(matches)
    return (
        df.sort_values("similarity", ascending=False)
        .drop_duplicates(subset=["tf_api"], keep="first")
        .drop_duplicates(subset=["pytorch_api"], keep="first")
        .head(500)
    )


# Get and display top matches
top_matches = get_top_matches(similarity_matrix, tf_docs, torch_docs)

with open("api_documentation_db.csv", "w") as f:
    top_matches.to_csv(f, index=False)

top_matches[["tf_api", "pytorch_api", "similarity"]]


Unnamed: 0,tf_api,pytorch_api,similarity
5761656,tensorflow.python.summary.writer.event_file_wr...,torch.utils.tensorboard.writer.flush,1.000000
27319916,tensorflow.python.data.ops.dataset_ops.is_subt...,torch.distributed.elastic.rendezvous.dynamic_r...,1.000000
6772369,tensorflow.python.distribute.tpu_strategy.run,torch.distributed.elastic.rendezvous.dynamic_r...,1.000000
27323652,tensorflow.python.data.ops.dataset_ops.most_sp...,torch.distributed.elastic.rendezvous.c10d_rend...,1.000000
18954975,tensorflow.python.training.monitored_session.run,torch.distributed.elastic.rendezvous.dynamic_r...,1.000000
...,...,...,...
1826536,tensorflow.python.tpu.feature_column.shared_em...,torch.nn.functional.multi_head_attention_forward,0.951313
13932624,tensorflow.python.ops.numpy_ops.tests.np_test....,torch._dynamo.bytecode_transformation.encode_v...,0.951276
4017822,tensorflow.python.framework.extension_type_tes...,torch._inductor.codegen.cpp.masked,0.951269
25281738,tensorflow.python.keras.engine.base_layer.add_...,torch.nn.modules.module.state_dict,0.951255
