# Imports

In [1]:
!pip install torch-geometric
!pip install transformers



In [2]:
# Standard libraries
import re
import requests
import urllib.request
from typing import List, Dict, Set, Tuple, Any

# PyTorch and Torch Geometric
import torch
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

# Transformers
from transformers import BertTokenizer, BertModel

# tqdm for progress bars
from tqdm import tqdm

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# NetworkX and Plotly for graph visualization
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Web scraping with BeautifulSoup
from bs4 import BeautifulSoup

# Utils

In [3]:
def fetch_package_data() -> List[Dict[str, Any]]:
    """
    Fetches package data from PyPI.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing package information.
    """
    package_list = []
    BASE_URL = 'https://pypi.org/search/?c=Programming+Language+%3A%3A+Python+%3A%3A+3&page='
    DEPENDANCY_URL = 'https://pypi.org/pypi/{}/json'

    for i in tqdm(range(0, 502)):  # Adjust range for demonstration
        try:
            fp = urllib.request.urlopen(BASE_URL + str(i))
        except urllib.error.HTTPError as err:
            print(f'\n{BASE_URL + str(i)}, {err}')
            break

        mybytes = fp.read()
        mystr = mybytes.decode("utf8")
        fp.close()
        soup = BeautifulSoup(mystr, 'html.parser')
        items = soup.findAll("a", {"class": "package-snippet"})

        for child_soup in items:
            package_name = child_soup.find("span", {"class": "package-snippet__name"}).get_text()
            description = child_soup.find("p", {"class": "package-snippet__description"}).get_text()
            json = requests.get(DEPENDANCY_URL.format(package_name)).json()

            if 'message' in json and json['message'].lower() == 'not found':
                continue

            dependancies = [re.split('[^a-zA-Z-_]', x)[0] for x in json['info']['requires_dist']] if json['info']['requires_dist'] else []

            package_list.append({
                'name': package_name,
                'description': description,
                'dependancies': dependancies,
                'topic': [x.split('::')[1].strip() for x in json['info']['classifiers'] if x.lower().startswith('topic')],
                'intended audience': [x.split('::')[-1].strip() for x in json['info']['classifiers'] if x.lower().startswith('intended audience')]
            })

    return package_list


In [4]:
def get_bert_embedding(text: str, tokenizer: BertTokenizer, model: BertModel) -> torch.Tensor:
    """
    Generates the BERT embedding for a given text.

    Args:
        text (str): The text to embed.
        tokenizer (BertTokenizer): A BERT tokenizer.
        model (BertModel): A BERT model.

    Returns:
        torch.Tensor: The BERT embedding of the text.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs['last_hidden_state'][:,0,:]


In [5]:
def create_global_graph(package_list: List[Dict[str, Any]]) -> torch.Tensor:
    """
    Creates a global graph from the package list.

    Args:
        package_list (List[Dict[str, Any]]): A list of package data.

    Returns:
        torch.Tensor: An edge index tensor representing the graph.
    """
    package_to_index = {package['name']: idx for idx, package in enumerate(package_list)}
    edge_indices = []

    for idx, package in enumerate(package_list):
        for dep in package['dependancies']:
            if dep in package_to_index:
                edge_indices.append((idx, package_to_index[dep]))

    edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    return edge_index


In [6]:
def split_data(data: Data, test_size: float = 0.2) -> Data:
    """
    Splits the data into training and test sets.

    Args:
        data (Data): The graph data object.
        test_size (float): The proportion of the dataset to include in the test split.

    Returns:
        Data: The data object with train and test masks added.
    """
    num_nodes = data.x.size(0)
    node_indices = torch.arange(num_nodes)

    train_idx, test_idx = train_test_split(node_indices, test_size=test_size)

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    train_mask[train_idx] = True
    test_mask[test_idx] = True

    data.train_mask = train_mask
    data.test_mask = test_mask

    return data


In [7]:
class GNN(torch.nn.Module):
    """
    Graph Neural Network model using GCNConv layers.

    Args:
        hidden_channels (int): Number of hidden channels.
        num_topics (int): Number of topics.
        num_audiences (int): Number of audiences.
    """
    def __init__(self, hidden_channels: int, num_topics: int, num_audiences: int):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(768, hidden_channels)  # Assuming input feature size is 768
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.out_topic = torch.nn.Linear(hidden_channels, num_topics)
        self.out_audience = torch.nn.Linear(hidden_channels, num_audiences)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Forward pass of the GNN.

        Args:
            x (torch.Tensor): Node feature matrix.
            edge_index (torch.Tensor): Graph edge index.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Topic and audience output tensors.
        """
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        return self.out_topic(x), self.out_audience(x)


In [8]:
def prepare_data_for_gnn(package_list: List[Dict[str, List[str]]],
                         tokenizer: BertTokenizer,
                         model: BertModel) -> Tuple[Data, int, int, Dict[str, int], Dict[str, int]]:
    """
    Prepares data for a Graph Neural Network (GNN) from a list of packages.

    Parameters:
    - package_list (List[Dict[str, List[str]]]): List of dictionaries representing packages, each containing 'description', 'topic', and 'intended audience'.
    - tokenizer (YourTokenizerType): The tokenizer used to process package descriptions.
    - model (YourModelType): The BERT model used to extract embeddings from package descriptions.

    Returns:
    Tuple[Data, int, int, Dict[str, int], Dict[str, int]]:
    - Data: Graph data object containing node features, edge indices, and topic and audience labels.
    - int: Number of unique topics.
    - int: Number of unique intended audiences.
    - Dict[str, int]: Mapping of topics to their corresponding indices.
    - Dict[str, int]: Mapping of intended audiences to their corresponding indices.
    """
    all_x = []
    for item in tqdm(package_list, desc="Processing package descriptions"):
        description_embedding = get_bert_embedding(item['description'], tokenizer, model).squeeze(0)
        all_x.append(torch.cat([description_embedding]))

    all_x = torch.stack(all_x)

    # Prepare all_y_topic and all_y_audience
    all_topics: Set[str] = set()
    all_audiences: Set[str] = set()
    for item in package_list:
        all_topics.update(item['topic'])
        all_audiences.update(item['intended audience'])

    num_topics = len(all_topics)
    num_audiences = len(all_audiences)
    topic_to_id = {topic: idx for idx, topic in enumerate(all_topics)}
    audience_to_id = {audience: idx for idx, audience in enumerate(all_audiences)}

    all_y_topic = torch.zeros(len(package_list), num_topics).float()
    all_y_audience = torch.zeros(len(package_list), num_audiences).float()

    for idx, item in enumerate(package_list):
        for t in item['topic']:
            all_y_topic[idx, topic_to_id[t]] = 1.0
        for a in item['intended audience']:
            all_y_audience[idx, audience_to_id[a]] = 1.0

    edge_index = create_global_graph(package_list)
    global_data = Data(x=all_x, edge_index=edge_index, y_topic=all_y_topic, y_audience=all_y_audience)

    return global_data, num_topics, num_audiences, topic_to_id, audience_to_id


In [9]:
def train_gnn(global_data: Data, num_topics: int, num_audiences: int, num_epochs: int = 100) -> GNN:
    """
    Trains a Graph Neural Network (GNN) using the provided global data.

    Parameters:
    - global_data (Data): Graph data object containing node features, edge indices, and topic and audience labels.
    - num_topics (int): Number of unique topics.
    - num_audiences (int): Number of unique intended audiences.
    - num_epochs (int, optional): Number of training epochs. Default is 100.

    Returns:
    GNN: Trained GNN model.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gnn_model = GNN(hidden_channels=128, num_topics=num_topics, num_audiences=num_audiences).to(device)
    optimizer = Adam(gnn_model.parameters(), lr=0.01)
    criterion = BCEWithLogitsLoss()

    global_data = global_data.to(device)

    for epoch in range(num_epochs):
        optimizer.zero_grad()
        out_topic, out_audience = gnn_model(global_data.x, global_data.edge_index)

        loss_topic = criterion(out_topic[global_data.train_mask], global_data.y_topic[global_data.train_mask])
        loss_audience = criterion(out_audience[global_data.train_mask], global_data.y_audience[global_data.train_mask])

        loss = loss_topic + loss_audience
        loss.backward()
        optimizer.step()

    return gnn_model


In [10]:
def evaluate_model(model, data: Data) -> None:
    """
    Evaluate a GNN model on the given data.

    Parameters:
    - model: The trained GNN model.
    - data (Data): Graph data object containing node features, edge indices, and topic and audience labels.

    Returns:
    None
    """
    model.eval()
    with torch.no_grad():
        out_topic, out_audience = model(data.x, data.edge_index)

        # Convert the logits to probabilities
        out_topic_prob = torch.sigmoid(out_topic)
        out_audience_prob = torch.sigmoid(out_audience)

        # Convert probabilities to binary predictions
        out_topic_pred = (out_topic_prob > 0.5).float()
        out_audience_pred = (out_audience_prob > 0.5).float()

        # Compute accuracy and F1-score for 'topic'
        acc_topic = accuracy_score(data.y_topic[data.test_mask].cpu(), out_topic_pred[data.test_mask].cpu())
        f1_topic = f1_score(data.y_topic[data.test_mask].cpu(), out_topic_pred[data.test_mask].cpu(), average='micro')

        # Compute accuracy and F1-score for 'audience'
        acc_audience = accuracy_score(data.y_audience[data.test_mask].cpu(), out_audience_pred[data.test_mask].cpu())
        f1_audience = f1_score(data.y_audience[data.test_mask].cpu(), out_audience_pred[data.test_mask].cpu(), average='micro')

        print(f"Topic - Accuracy: {acc_topic}, F1-score: {f1_topic}")
        print(f"Audience - Accuracy: {acc_audience}, F1-score: {f1_audience}")


In [11]:
def plot_graph(edge_index, package_list: List[Dict[str, str]]) -> None:
    """
    Plot a graph using NetworkX and Plotly with the given edge index and package list.

    Parameters:
    - edge_index: Edge indices of the graph.
    - package_list (List[Dict[str, str]]): List of dictionaries representing packages, each containing 'name'.

    Returns:
    None
    """
    G = nx.Graph()
    edge_list = edge_index.t().tolist()
    G.add_edges_from(edge_list)

    # Create a mapping of node indices to package names
    index_to_package = {idx: package['name'] for idx, package in enumerate(package_list)}

    # Get positions for the nodes in G
    pos = nx.spring_layout(G)

    # Extract x and y coordinates
    x_nodes = [pos[k][0] for k in G.nodes()]
    y_nodes = [pos[k][1] for k in G.nodes()]

    # Create edges
    edge_x = []
    edge_y = []
    for edge in edge_list:
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # Create edge trace
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    # Create node trace
    node_trace = go.Scatter(
        x=x_nodes, y=y_nodes,
        mode='markers+text',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            size=10,
            colorbar=dict(
                thickness=15,
                title='Node Connections',
                xanchor='left',
                titleside='right'
            )
        ),
        text=[index_to_package[i] for i in range(len(pos))],
        textposition="top center"
    )

    # Create the figure
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=0, l=0, r=0, t=0),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )

    fig.show()

In [12]:
def prepare_single_input(description: str, tokenizer, model) -> torch.Tensor:
    """
    Process a single input description using BERT tokenizer and model.

    Parameters:
    - description (str): Input description to be processed.
    - tokenizer: The tokenizer used to process the description.
    - model: The BERT model used to extract embeddings.

    Returns:
    torch.Tensor: Embedding of the input description with an added batch dimension.
    """
    description_embedding = get_bert_embedding(description, tokenizer, model).squeeze(0)
    return description_embedding.unsqueeze(0)  # Add a batch dimension

def predict_topic_and_audience(input_data, model, topic_to_id, audience_to_id):
    """
    Predict topics and audiences based on input data using a trained model.

    Parameters:
    - input_data: Input data, typically the output of `prepare_single_input`.
    - model: Trained model for prediction.
    - topic_to_id (Dict[str, int]): Mapping of topics to their corresponding indices.
    - audience_to_id (Dict[str, int]): Mapping of audiences to their corresponding indices.

    Returns:
    Tuple[List[str], List[str]]: Predicted topics and audiences.
    """
    model.eval()
    with torch.no_grad():
        out_topic, out_audience = model(input_data, torch.empty((2, 0), dtype=torch.long).to(global_data.x.device))

        # Convert logits to probabilities
        out_topic_prob = torch.sigmoid(out_topic)
        out_audience_prob = torch.sigmoid(out_audience)

        # Convert probabilities to binary predictions
        out_topic_pred = (out_topic_prob > 0.5).float()
        out_audience_pred = (out_audience_prob > 0.5).float()

        # Convert binary predictions to topic and audience names
        predicted_topics = [topic for topic, idx in topic_to_id.items() if out_topic_pred[0, idx] == 1]
        predicted_audiences = [audience for audience, idx in audience_to_id.items() if out_audience_pred[0, idx] == 1]

        return predicted_topics, predicted_audiences

In [13]:
def get_connected_nodes(new_package_dependencies, package_to_index):
    """
    Get connected nodes based on package dependencies.

    Parameters:
    - new_package_dependencies (List[str]): List of package dependencies.
    - package_to_index (Dict[str, int]): Mapping of package names to their corresponding indices.

    Returns:
    torch.Tensor: Tensor of connected node indices.
    """
    connected_nodes = []
    for dep in new_package_dependencies:
        if dep in package_to_index:
            connected_nodes.append(package_to_index[dep])
    return torch.tensor(connected_nodes, dtype=torch.long)

def test_single_input(description, dependencies, global_data, tokenizer, bert_model, gnn_model, device, package_to_index, id_to_topic, id_to_audience, threshold):
    """
    Test a single input description with dependencies using a GNN model.

    Parameters:
    - description (str): Input description.
    - dependencies (List[str]): List of package dependencies.
    - global_data (Data): Graph data object containing node features, edge indices, and topic and audience labels.
    - tokenizer: The tokenizer used to process the description.
    - bert_model: The BERT model used to extract embeddings.
    - gnn_model: Trained GNN model.
    - device: Device to perform computation on (e.g., 'cuda' or 'cpu').
    - package_to_index (Dict[str, int]): Mapping of package names to their corresponding indices.
    - id_to_topic (Dict[int, str]): Mapping of topic indices to their names.
    - id_to_audience (Dict[int, str]): Mapping of audience indices to their names.
    - threshold (float): Threshold for binary predictions.

    Returns:
    Dict[str, List[Union[str, float]]]: Dictionary containing predicted topic and audience names and probabilities.
    """
    description_embedding = get_bert_embedding(description, tokenizer, bert_model).squeeze(0).to(device)

    x_input = torch.cat([description_embedding]).unsqueeze(0).to(device)

    new_x = torch.cat([global_data.x.to(device), x_input], dim=0)

    new_node_idx = new_x.size(0) - 1
    connected_nodes = get_connected_nodes(dependencies, package_to_index).to(device)
    new_edges = torch.tensor([[new_node_idx] * len(connected_nodes), connected_nodes], dtype=torch.long).to(device)
    new_edge_index = torch.cat([global_data.edge_index.to(device), new_edges], dim=1)

    with torch.no_grad():
        out_topic, out_audience = gnn_model(new_x, new_edge_index)

    topic_probs = torch.sigmoid(out_topic[-1])
    predicted_topic_idx = (topic_probs > threshold).nonzero(as_tuple=True)[0]
    predicted_topic_names = [id_to_topic[idx.item()] for idx in predicted_topic_idx]
    predicted_topic_probs = [topic_probs[idx].item() for idx in predicted_topic_idx]

    audience_probs = torch.sigmoid(out_audience[-1])
    predicted_audience_idx = (audience_probs > threshold).nonzero(as_tuple=True)[0]
    predicted_audience_names = [id_to_audience[idx.item()] for idx in predicted_audience_idx]
    predicted_audience_probs = [audience_probs[idx].item() for idx in predicted_audience_idx]

    return {
        "predicted_topic_names": predicted_topic_names,
        "predicted_topic_probs": predicted_topic_probs,
        "predicted_audience_names": predicted_audience_names,
        "predicted_audience_probs": predicted_audience_probs
    }

# Train

In [14]:
# Main code
if __name__ == "__main__":
    package_list = fetch_package_data()

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    global_data, num_topics, num_audiences, topic_to_id, audience_to_id = prepare_data_for_gnn(package_list, tokenizer, model)

    global_data = split_data(global_data, test_size=0.2)

    gnn_model = train_gnn(global_data, num_topics, num_audiences)

    evaluate_model(gnn_model, global_data)

    plot_graph(global_data.edge_index.cpu(), package_list)

100%|█████████▉| 501/502 [32:06<00:03,  3.85s/it]



https://pypi.org/search/?c=Programming+Language+%3A%3A+Python+%3A%3A+3&page=501, HTTP Error 404: Not Found


Processing package descriptions: 100%|██████████| 10020/10020 [21:58<00:00,  7.60it/s]


Topic - Accuracy: 0.3912175648702595, F1-score: 0.2974319662706018
Audience - Accuracy: 0.5409181636726547, F1-score: 0.6676543209876543


# Test

In [15]:
test_description = "This is a calculations package for data manipulation."
test_dependencies = ["numpy", "matplotlib"]

# Prepare the input
input_data = prepare_single_input(test_description, tokenizer, model).to(global_data.x.device)

# Make predictions
predicted_topics, predicted_audiences = predict_topic_and_audience(input_data, gnn_model, topic_to_id, audience_to_id)

print("Predicted Topics:", predicted_topics)
print("Predicted Audiences:", predicted_audiences)

Predicted Topics: []
Predicted Audiences: ['Developers']


In [16]:
# Create package_to_index and id_to_topic mappings
package_to_index = {package['name']: idx for idx, package in enumerate(package_list)}
id_to_topic = {idx: topic for topic, idx in topic_to_id.items()}
id_to_audience = {idx: audience for audience, idx in audience_to_id.items()}

# Initialize GNN model and move it to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assume gnn_model is trained by your existing code
# ...

# Test a single package
description = "This is my personal static site generator, it lacks testings and documents at the moment. If you need a static site generator, find one here with good community support."
dependencies = [x.lower() for x in ["PyYAML", "Pygments", "beautifulsoup4", "markdown", "pymdown-extensions", "Jinja2", "watchdog"]]
dependencies = [x for x in package_to_index.keys()][:6]

predicted_topic_names = test_single_input(description, dependencies, global_data, tokenizer, model, gnn_model, device, package_to_index, id_to_topic, id_to_audience, threshold=0.5)

print("Predicted Topic Names:", predicted_topic_names)

Predicted Topic Names: {'predicted_topic_names': [], 'predicted_topic_probs': [], 'predicted_audience_names': ['Developers'], 'predicted_audience_probs': [0.7003011107444763]}
