In [17]:
# Install the dependencies first if they are missing from this kernel's environment
# %pip install openai pymilvus==2.3.4 tiktoken tqdm python-dotenv pandas numpy

from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, list_collections
import openai
import pandas as pd
import numpy as np
from typing import List
from tqdm import tqdm
import os

In [19]:
from pymilvus import connections

# Open the connection registered under the alias "default" to the Milvus
# server on localhost:19530.
connections.connect(alias="default", host="localhost", port="19530")
print("Connected to Milvus.")

Connected to Milvus.


In [21]:
# Re-running this cell starts from scratch: an existing "github_issues"
# collection is dropped before a new one is defined.
if "github_issues" in list_collections():
    Collection("github_issues").drop()

# Fields: a caller-supplied INT64 primary key, the issue text (VARCHAR with
# max_length=1000) and a 1536-dimensional float vector for the embedding.
issue_id_field = FieldSchema(
    name="issue_id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=False,
)
text_field = FieldSchema(
    name="text",
    dtype=DataType.VARCHAR,
    max_length=1000,
)
embedding_field = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=1536,
)

# Schema field order: issue_id, text, embedding.
schema = CollectionSchema(
    fields=[issue_id_field, text_field, embedding_field],
    description="GitHub issue triage schema",
)

# Build the collection object from the schema.
collection = Collection(name="github_issues", schema=schema)
print("Collection 'github_issues' created.")

Collection 'github_issues' created.


In [25]:
from dotenv import load_dotenv

# Read variables from a .env file, then hand the key found in
# OPENAI_API_KEY to the openai module.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Report only whether a key was found; the key itself is never printed.
key_status = (
    "OpenAI API key loaded."
    if openai.api_key
    else "OpenAI API key NOT found. Check your .env file."
)
print(key_status)

OpenAI API key loaded.


In [35]:
from openai import OpenAI

# Client object used for the OpenAI requests made in this notebook.
client = OpenAI()

# Small hand-written set of example issue titles to index and classify.
sample_issues = [
    "Bug: App crashes when clicking the login button",
    "Feature request: Add dark mode",
    "Question: How do I set up the development environment?",
    "Bug: Typo in README file",
    "General: This project is really cool!",
    "Discussion: Should we move to TypeScript?",
    "Bug: API returns 500 error on GET /users",
]

# Primary keys: the 1-based position of each issue in sample_issues.
issue_ids = [position for position, _ in enumerate(sample_issues, start=1)]

# Define reusable embedding function
def get_embedding(text: str) -> List[float]:
    """Return the embedding vector for a single piece of text.

    Sends ``text`` as a one-element batch to the ``text-embedding-3-small``
    model through the module-level ``client``.

    Args:
        text: The string to embed.

    Returns:
        The ``embedding`` attribute of the first item in the response.
    """
    batch = [text]
    result = client.embeddings.create(model="text-embedding-3-small", input=batch)
    first_item = result.data[0]
    return first_item.embedding

# Embed every sample issue, one request per issue, with a progress bar.
embedded_issues = [
    get_embedding(issue_text)
    for issue_text in tqdm(sample_issues, desc="Embedding issues")
]

# Column-wise insert in the schema's field order: issue_id, text, embedding.
columns = [issue_ids, sample_issues, embedded_issues]
collection.insert(columns)
collection.flush()

print("Inserted sample issues into Milvus.")

Embedding issues: 100%|███████████████████████████| 7/7 [00:05<00:00,  1.35it/s]


Inserted sample issues into Milvus.


In [43]:
def search_similar_issues(query: str, top_k: int = 3):
    """Return the ``top_k`` stored issues most similar to ``query``.

    The query is embedded with ``text-embedding-3-small`` and searched against
    the ``embedding`` field of the module-level ``collection`` using the COSINE
    metric. If the collection has no index yet, an IVF_FLAT index is created
    first; the collection is then loaded before searching.

    Args:
        query: Free-text description to look up. Must contain at least one
            non-whitespace character.
        top_k: Maximum number of hits to return. Must be at least 1.

    Returns:
        The hits for the single query vector (``results[0]``), requested with
        the ``issue_id`` and ``text`` output fields.

    Raises:
        ValueError: If ``query`` is empty or whitespace-only, or ``top_k`` is
            smaller than 1.
    """
    # Reject unusable arguments up front so no embedding request is spent on
    # a search that cannot return anything meaningful.
    if not query or not query.strip():
        raise ValueError("query must be a non-empty string")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Ensure index exists (required before loading and searching)
    if not collection.has_index():
        collection.create_index(
            field_name="embedding",
            index_params={
                "index_type": "IVF_FLAT",
                "metric_type": "COSINE",
                "params": {"nlist": 128}
            }
        )

    # Load collection into memory
    collection.load()

    # Embed the query with the same model used for the stored issues
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=[query]
    )
    query_embedding = response.data[0].embedding

    # Search the collection; a single query vector yields a single result list
    results = collection.search(
        data=[query_embedding],
        anns_field="embedding",
        param={"metric_type": "COSINE", "params": {"nprobe": 10}},
        limit=top_k,
        output_fields=["issue_id", "text"]
    )

    return results[0]

In [45]:
# Look up the stored issues closest to a free-text query
query = "App crashes on login"
matches = search_similar_issues(query)

# Print one record per hit, each followed by a "---" separator line
for hit in matches:
    fields = hit.entity
    print(f"Issue ID: {fields.get('issue_id')}")
    print(f"Text: {fields.get('text')}")
    print(f"Similarity Score: {hit.distance:.4f}")
    print("---")

Issue ID: 1
Text: Bug: App crashes when clicking the login button
Similarity Score: 0.7664
---
Issue ID: 7
Text: Bug: API returns 500 error on GET /users
Similarity Score: 0.3433
---
Issue ID: 4
Text: Bug: Typo in README file
Similarity Score: 0.2541
---


In [53]:
# Labels the classification prompt asks the model to choose from.
_ISSUE_LABELS = ("bug", "feature request", "question", "general comment", "discussion")


def _normalize_label(raw) -> str:
    """Reduce a raw model reply to a clean, lower-case label string."""
    import re  # local import keeps this cell independent of the notebook's import cell

    # A missing reply (content is None) is treated as an empty string.
    label = (raw or "").strip().lower()
    label = label.replace("category:", "").replace("this github issue should be classified as", "").strip()
    label = label.strip('"').strip("'").strip(".").strip()
    if label in _ISSUE_LABELS:
        return label

    # Verbose reply: if it mentions exactly one known label as a whole phrase,
    # use that label; otherwise keep the cleaned text rather than guessing.
    mentioned = [known for known in _ISSUE_LABELS if re.search(rf"\b{re.escape(known)}\b", label)]
    return mentioned[0] if len(mentioned) == 1 else label


def classify_issue_text(text: str) -> str:
    """Ask ``gpt-4o`` to classify a GitHub issue and return the cleaned label.

    Args:
        text: The issue text to classify.

    Returns:
        One of ``bug``, ``feature request``, ``question``, ``general comment``
        or ``discussion`` when the reply is, or unambiguously mentions, one of
        them. Otherwise the cleaned, lower-cased reply is returned unchanged,
        which is an empty string when the model returned no content.
    """
    # NOTE(review): the prompt asks for "only one word" although two of the
    # labels are two words long; the wording is left untouched here.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a GitHub issue triage assistant."},
            {"role": "user", "content": f"Classify this GitHub issue with only one word: bug, feature request, question, general comment, or discussion.\n\nIssue: {text}"}
        ],
        temperature=0
    )

    # Normalize and clean output
    return _normalize_label(response.choices[0].message.content)

In [55]:
# Run the classifier over every sample issue and print the issue next to its label
for issue_text in sample_issues:
    predicted = classify_issue_text(issue_text)
    print(f"Issue: {issue_text}\n → Label: {predicted}\n")

Issue: Bug: App crashes when clicking the login button
 → Label: bug

Issue: Feature request: Add dark mode
 → Label: feature request

Issue: Question: How do I set up the development environment?
 → Label: question

Issue: Bug: Typo in README file
 → Label: bug

Issue: General: This project is really cool!
 → Label: general comment

Issue: Discussion: Should we move to TypeScript?
 → Label: discussion

Issue: Bug: API returns 500 error on GET /users
 → Label: bug



In [61]:
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, list_collections
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm
from typing import List
import os

# === Setup ===
# Load .env, build the OpenAI client and connect to the local Milvus server.
load_dotenv()
client = OpenAI()
connections.connect(alias="default", host="localhost", port="19530")

# === Drop existing collection ===
if "github_issues" in list_collections():
    Collection("github_issues").drop()

# === Define schema with label ===
# Same three fields as the unlabelled schema plus a VARCHAR "label" column.
issue_id_field = FieldSchema(
    name="issue_id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=False,
)
text_field = FieldSchema(
    name="text",
    dtype=DataType.VARCHAR,
    max_length=1000,
)
label_field = FieldSchema(
    name="label",
    dtype=DataType.VARCHAR,
    max_length=100,
)
embedding_field = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=1536,
)

# Schema field order: issue_id, text, label, embedding.
schema = CollectionSchema(
    fields=[issue_id_field, text_field, label_field, embedding_field],
    description="GitHub issue triage schema with label",
)

collection = Collection(name="github_issues", schema=schema)

# === Sample issues ===
# Small hand-written set of example issue titles to index and classify.
sample_issues = [
    "Bug: App crashes when clicking the login button",
    "Feature request: Add dark mode",
    "Question: How do I set up the development environment?",
    "Bug: Typo in README file",
    "General: This project is really cool!",
    "Discussion: Should we move to TypeScript?",
    "Bug: API returns 500 error on GET /users",
]

# Primary keys: the 1-based position of each issue in sample_issues.
issue_ids = [position for position, _ in enumerate(sample_issues, start=1)]

# === Helper functions ===
def get_embedding(text: str) -> List[float]:
    """Embed one string with ``text-embedding-3-small``.

    The text is sent as a one-element batch through the module-level
    ``client``.

    Args:
        text: The string to embed.

    Returns:
        The ``embedding`` attribute of the first item in the response.
    """
    batch = [text]
    result = client.embeddings.create(model="text-embedding-3-small", input=batch)
    first_item = result.data[0]
    return first_item.embedding

# Labels the classification prompt asks the model to choose from.
_ISSUE_LABELS = ("bug", "feature request", "question", "general comment", "discussion")


def _normalize_label(raw) -> str:
    """Reduce a raw model reply to a clean, lower-case label string."""
    import re  # local import keeps this helper independent of the cell's import lines

    # A missing reply (content is None) is treated as an empty string.
    label = (raw or "").strip().lower()
    label = label.replace("category:", "").replace("this github issue should be classified as", "").strip()
    label = label.strip('"').strip("'").strip(".").strip()
    if label in _ISSUE_LABELS:
        return label

    # Verbose reply: if it mentions exactly one known label as a whole phrase,
    # use that label; otherwise keep the cleaned text rather than guessing.
    mentioned = [known for known in _ISSUE_LABELS if re.search(rf"\b{re.escape(known)}\b", label)]
    return mentioned[0] if len(mentioned) == 1 else label


def classify_issue_text(text: str) -> str:
    """Ask ``gpt-4o`` to classify a GitHub issue and return the cleaned label.

    The returned string is what this cell stores in the ``label`` column, so
    it is reduced to a known label whenever the reply allows it.

    Args:
        text: The issue text to classify.

    Returns:
        One of ``bug``, ``feature request``, ``question``, ``general comment``
        or ``discussion`` when the reply is, or unambiguously mentions, one of
        them. Otherwise the cleaned, lower-cased reply is returned unchanged,
        which is an empty string when the model returned no content.
    """
    # NOTE(review): the prompt asks for "only one word" although two of the
    # labels are two words long; the wording is left untouched here.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a GitHub issue triage assistant."},
            {"role": "user", "content": f"Classify this GitHub issue with only one word: bug, feature request, question, general comment, or discussion.\n\nIssue: {text}"}
        ],
        temperature=0
    )
    return _normalize_label(response.choices[0].message.content)

# === Generate embeddings and labels ===
# One embedding request and one classification request per sample issue.
embedded_issues = []
for issue_text in tqdm(sample_issues, desc="Embedding"):
    embedded_issues.append(get_embedding(issue_text))
labels = list(map(classify_issue_text, sample_issues))

# === Insert into Milvus ===
# Columns follow the schema's field order: issue_id, text, label, embedding.
rows_by_column = [issue_ids, sample_issues, labels, embedded_issues]
collection.insert(rows_by_column)
collection.flush()

# === Create index ===
ivf_flat_cosine = {
    "index_type": "IVF_FLAT",
    "metric_type": "COSINE",
    "params": {"nlist": 128},
}
collection.create_index(field_name="embedding", index_params=ivf_flat_cosine)

# === Search with optional label filter ===
def search_similar_issues(query: str, top_k: int = 3, label_filter: str = None):
    """Return the ``top_k`` stored issues most similar to ``query``.

    The query is embedded with ``text-embedding-3-small`` and searched against
    the ``embedding`` field of the module-level ``collection`` using the COSINE
    metric. If the collection has no index yet, an IVF_FLAT index is created
    first; the collection is then loaded before searching.

    Args:
        query: Free-text description to look up. Must contain at least one
            non-whitespace character.
        top_k: Maximum number of hits to return. Must be at least 1.
        label_filter: When given (and non-empty), only issues whose ``label``
            field equals this value are searched.

    Returns:
        The hits for the single query vector (``results[0]``), requested with
        the ``issue_id``, ``text`` and ``label`` output fields.

    Raises:
        ValueError: If ``query`` is empty or whitespace-only, or ``top_k`` is
            smaller than 1.
    """
    # Reject unusable arguments up front so no embedding request is spent on
    # a search that cannot return anything meaningful.
    if not query or not query.strip():
        raise ValueError("query must be a non-empty string")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    if not collection.has_index():
        collection.create_index(
            field_name="embedding",
            index_params={
                "index_type": "IVF_FLAT",
                "metric_type": "COSINE",
                "params": {"nlist": 128}
            }
        )
    collection.load()
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=[query]
    )
    query_embedding = response.data[0].embedding

    expr = None
    if label_filter:
        # Escape backslashes first, then double quotes, so a label containing
        # either cannot terminate the string literal or change the expression.
        escaped = label_filter.replace("\\", "\\\\").replace('"', '\\"')
        expr = f'label == "{escaped}"'

    results = collection.search(
        data=[query_embedding],
        anns_field="embedding",
        param={"metric_type": "COSINE", "params": {"nprobe": 10}},
        limit=top_k,
        output_fields=["issue_id", "text", "label"],
        expr=expr
    )
    return results[0]

Embedding: 100%|██████████████████████████████████| 7/7 [00:05<00:00,  1.36it/s]


In [63]:
# Search restricted to issues whose stored label is "bug"
query = "App crashes on login"
matches = search_similar_issues(query, label_filter="bug")

# Print one record per hit, each followed by a "---" separator line
for hit in matches:
    issue_id = hit.entity.get("issue_id")
    label = hit.entity.get("label")
    text = hit.entity.get("text")
    score = hit.distance

    print(f"Issue ID: {issue_id}")
    print(f"Label: {label}")
    print(f"Text: {text}")
    print(f"Score: {score:.4f}")
    print("---")

Issue ID: 1
Label: bug
Text: Bug: App crashes when clicking the login button
Score: 0.7663
---
Issue ID: 7
Label: bug
Text: Bug: API returns 500 error on GET /users
Score: 0.3433
---
Issue ID: 4
Label: bug
Text: Bug: Typo in README file
Score: 0.2541
---
