# FOSSistant difficulty prediction model v0.3.0 data processing

## Environment setup

In [None]:
%pip install -U -q datasets bitsandbytes "huggingface_hub[hf_xet]" "huggingface_hub[hf_transfer]" "distilabel[hf-transformers]" outlines

In [None]:
# GDRIVE_DIR = r"/content/drive/"
# Base directory for every dataset file read or written in this notebook.
# It is used both through string concatenation (ROOT_DIR + "datasets/...")
# and as $ROOT_DIR inside the shell cells, so it must keep its trailing slash.
ROOT_DIR = r"~/"

# Hugging Face model id whose tokenizer is loaded in the post-processing section.
MODEL_PATH = "answerdotai/ModernBERT-large"
# MODEL_PATH = r"answerdotai/ModernBERT-base"

In [None]:
import os
import random
import time

import numpy as np
import torch

# Turn on the hf_transfer download backend for this kernel. The variable has to
# be set on os.environ: a `!export ...` line runs in a throwaway subshell and
# never reaches the Python process, so it was removed.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Seed every RNG that is imported here so shuffles and splits are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

torch.set_float32_matmul_precision("high")

In [None]:
import re
import json
import base64

from tqdm.auto import tqdm

import pandas as pd

from transformers import pipeline, BitsAndBytesConfig
from transformers.pipelines.pt_utils import KeyDataset
from datasets import (Dataset, DatasetDict,
                      Features, Value, ClassLabel,
                      load_dataset, load_from_disk,
                      concatenate_datasets)
from huggingface_hub import login

# import kagglehub
# from kagglehub import KaggleDatasetAdapter

from pydantic import BaseModel
import openai
from openai import OpenAI
from google import genai
from google.genai import types
from cerebras.cloud.sdk import Cerebras
from mistralai import Mistral

# import distilabel
# from distilabel.pipeline import Pipeline
# from distilabel.models import TransformersLLM
# from distilabel.steps import LoadDataFromFileSystem
# from distilabel.steps.tasks import TextClassification

# login("")

In [None]:
# from google.colab import drive
# drive.mount(GDRIVE_DIR)

## Initial dataset preparation

In [None]:
# Create the dataset directory and download the figshare GitHub-issues dump into it.
# $ROOT_DIR is the notebook variable ROOT_DIR, interpolated by IPython into the shell command.
!mkdir -p $ROOT_DIR/datasets/fossistant/
!wget https://figshare.com/ndownloader/files/35739797 -O $ROOT_DIR/datasets/fossistant/github_issues_figshare_original.json

In [None]:
# Columns shared by both issue sources; everything else is dropped.
ISSUE_COLUMNS = ["repo_name", "issue_number", "title", "body", "labels"]

# Source 1: Hugging Face dataset, train and test splits glued back together.
ds_1 = load_dataset("mlfoundations-dev/github-issues")
ds_1 = concatenate_datasets([ds_1[split] for split in ("train", "test")])
df_1 = ds_1.to_pandas()[ISSUE_COLUMNS]

# Source 2: figshare dump. Repository name and issue number are not stored as
# columns there, so they are pulled out of the API URL of each issue.
df_2 = pd.read_json(ROOT_DIR + "datasets/fossistant/github_issues_figshare_original.json")
url_parts = df_2["url"].str.extract(r"repos\/(.+)\/issues\/(\d+)")
df_2 = df_2.assign(repo_name=url_parts[0], issue_number=url_parts[1])[ISSUE_COLUMNS]

# Optional third source (disabled):
# ds_3 = load_dataset("bigcode/the-stack-github-issues")
# df_3 = ds_3.to_pandas()
# df_3 = df_3[["title", "body"]]

# Stack both sources; the URL-derived issue numbers are strings, so cast to int.
# df = pd.concat([df_1, df_2, df_3], ignore_index=True)
df = pd.concat([df_1, df_2], ignore_index=True).astype({"issue_number": int})

df

In [None]:
# De-duplicate on (repository, issue number), comparing repository names
# case-insensitively through a temporary helper column.
subset = ["repo_name_lower", "issue_number"]
keyed_df = df.assign(repo_name_lower=df["repo_name"].str.lower())

# Duplicate counts before and after, to confirm the drop worked.
display(keyed_df.duplicated(subset=subset).value_counts())
keyed_df = keyed_df.drop_duplicates(subset=subset, ignore_index=True)
display(keyed_df.duplicated(subset=subset).value_counts())

df = keyed_df.drop(columns=["repo_name_lower"])

In [None]:
# Share of issues contributed by each repository (fractions, because normalize=True).
df["repo_name"].value_counts(normalize=True)

In [None]:
# Null counts per column before and after; only a missing body is repaired
# (with an empty string), other columns are left untouched.
display(df.isna().sum())
df = df.fillna({"body": ""})
# df = df.dropna(ignore_index=True)
display(df.isna().sum())

## Rule-based dataset annotation

In [None]:
# Label names that mark an issue as beginner-friendly.
pattern = re.compile(
    r"easy|newbie|begin|starter|started|^minor$|bug.*minor|minor.*fix|p-minor|novice|grab|good.*first|first.*time|low.*fruit|small$|^low$|effort.*low|estimate.*low|task.*low|level.*low|difficulty.*low",
    flags=re.IGNORECASE,
)
find_easy = pattern.search

# Boolean mask over df: True when at least one label of the issue matches.
easy_loc = df["labels"].apply(lambda issue_labels: any(map(find_easy, issue_labels)))

easy_df = df.loc[easy_loc]
easy_df

In [None]:
# Label names that mark an issue as intermediate difficulty.
pattern = re.compile(
    r"intermediate|good.*second|effort.*medium|estimate.*medium|task.*medium|level.*medium|difficulty.*medium",
    flags=re.IGNORECASE,
)
medium_loc = df["labels"].apply(
    lambda issue_labels: any(pattern.search(name) for name in issue_labels)
)

# An issue that also carries an easy label stays in the easy bucket.
medium_df = df.loc[medium_loc & ~easy_loc]
medium_df

In [None]:
# Label names that mark an issue as hard.
# Broader earlier variant, kept for reference:
# pattern = re.compile(r"important|major|breaking|hard|serious|advanced|large|^long$|effort.*long|long.*term|p0|p1|critical|difficult$|^core$|^expert$|effort.*expert|estimate.*expert|task.*expert|level.*expert|difficulty.*expert", re.IGNORECASE)
pattern = re.compile(
    r"hard|serious|advanced|large|difficult$|^expert$|effort.*expert|estimate.*expert|task.*expert|level.*expert|difficulty.*expert",
    flags=re.IGNORECASE,
)
hard_loc = df["labels"].apply(
    lambda issue_labels: any(pattern.search(name) for name in issue_labels)
)

# Easy and medium take precedence over hard when an issue matches several rules.
not_easier_loc = ~(easy_loc | medium_loc)
hard_df = df.loc[hard_loc & not_easier_loc]
hard_df

In [None]:
# pattern = re.compile(r"need.*more|request.*comment", re.IGNORECASE)
# misc_loc = df["labels"].apply(lambda labels: any(pattern.search(label) for label in labels))

# misc_df = df[misc_loc & ~easy_loc & ~medium_loc & ~hard_loc]
# misc_df

In [None]:
# Tally the "request ... comment" style labels found on each issue.
# The matches are collected into a tuple (hashable) rather than a list, because
# value_counts() has to hash every value; issues without a match become None
# and are therefore excluded from the tally. The pattern is compiled once
# instead of once per issue.
rfc_pattern = re.compile(r"request.*comment", re.IGNORECASE)
df["labels"].apply(
    lambda labels: tuple(label for label in labels if rfc_pattern.search(label)) or None
).value_counts()

In [None]:
# Show, untruncated, every issue that carries a "request ... comment" style label.
rfc_pattern = re.compile(r"request.*comment", re.IGNORECASE)
rfc_loc = df["labels"].apply(
    lambda issue_labels: any(rfc_pattern.search(name) for name in issue_labels)
)

with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None):
    display(df[rfc_loc])

In [None]:
# rest_df = df[~easy_loc & ~medium_loc & ~hard_loc & ~misc_loc]
# Everything no rule matched: these issues are labelled later by the LLM.
# rest_df = df[~easy_loc & ~medium_loc & ~hard_loc & ~misc_loc]
rule_labelled_loc = easy_loc | medium_loc | hard_loc
rest_df = df.loc[~rule_labelled_loc].reset_index(drop=True)
rest_df

In [None]:
# Difficulty codes: 0 = easy, 1 = medium, 2 = hard, (3 = misc, unused here),
# 4 = not classified. Rule-matched frames are tagged with model "rule".
easy_df, medium_df, hard_df = (
    frame.assign(difficulty=level, difficulty_model="rule")
    for level, frame in enumerate((easy_df, medium_df, hard_df))
)
# misc_df = misc_df.assign(difficulty=3, difficulty_model="rule")

# Unmatched issues get the "unknown" code and are shuffled deterministically.
rest_df = (
    rest_df
    .assign(difficulty=4, difficulty_model="none")
    .sample(frac=1, random_state=42, ignore_index=True)
)

# merged_df = pd.concat([easy_df, medium_df, hard_df, misc_df, rest_df], ignore_index=True)
merged_df = pd.concat([easy_df, medium_df, hard_df, rest_df], ignore_index=True)
merged_df.to_json(
    ROOT_DIR + "datasets/fossistant/github_issues_rule_based_annotation.jsonl",
    orient="records",
    lines=True,
)
merged_df

## LLM-based dataset annotation

In [None]:
# System prompt for the LLM annotator, written one prompt line per string so
# it can be read and diffed; the lines are joined with "\n" and there is no
# trailing newline.
_SYSTEM_PROMPT_LINES = (
    '# Instruction',
    'Please classify the github issue by assigning the most appropriate labels.',
    'Do not explain your reasoning or provide any additional commentary.',
    'If the text is ambiguous or lacks sufficient information for classification, respond with "unknown".',
    'Provide the label that best describes the text.',
    'Determine the difficulty of the GitHub issue.',
    '',
    '## Labeling the user input',
    'Use the available labels to classify the user query. Analyze the context of each label specifically:',
    'available_labels = [',
    '    "easy",  # A beginner-friendly issue that doesn\'t require much prior experience.',
    '    "medium",  # An issue suitable for contributors with intermediate-level skills and experience.',
    '    "hard",  # A challenging issue that likely requires advanced or specialized expertise.',
    '    "misc",  # An issue that doesn\'t involve direct problem-solving, such as general discussions or non-technical topics, so its difficulty can\'t be assessed.',
    '    "unknown",  # An issue with unclear requirements or scope, making its difficulty hard to determine.',
    ']',
    '',
    '## Examples',
    '### Input',
    '```',
    'Title: A small typo in docs',
    'Body: I found a small typo in docs!',
    'Labels: [\'good first issue\', \'docs\']',
    '```',
    '### Output',
    '```',
    'easy',
    '```',
    '',
    '## Output Format',
    'Now, please give me the labels in as-is raw text format, do not include any other text in your response:',
    '```',
    'label',
    '```',
)
SYSTEM_PROMPT = "\n".join(_SYSTEM_PROMPT_LINES)

# Per-issue user message; filled in with str.format(title=..., body=..., labels=...).
USER_TEMPLATE = "\n".join(("Title: {title}", "Body: {body}", "Labels: {labels}"))

# Echo both prompts, separated by a horizontal rule, for a visual check.
print(SYSTEM_PROMPT, "\n\n---\n\n", USER_TEMPLATE, sep="\n")

In [None]:
def _issue_to_messages(issue):
    """Build the [system, user] chat messages for one issue row."""
    user_text = USER_TEMPLATE.format(
        title=issue["title"], body=issue["body"], labels=issue["labels"]
    )
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
    ]


# Start from the rule-based annotation file and attach a chat prompt to every row.
rule_df = pd.read_json(
    ROOT_DIR + "datasets/fossistant/github_issues_rule_based_annotation.jsonl",
    lines=True,
)
chat_messages = rule_df[["title", "body", "labels"]].apply(_issue_to_messages, axis=1)

# Only the prompt and the current annotation state are carried forward.
df = rule_df.assign(messages=chat_messages).loc[
    :, ["messages", "difficulty", "difficulty_model"]
]

df.to_json(
    ROOT_DIR + "datasets/fossistant/github_issues_llm_based_annotation_input.jsonl",
    orient="records",
    lines=True,
)
df

In [None]:
# Re-read the annotation input file written above. The annotation loop below
# checkpoints to this same path, so this cell also picks up partial progress.
df = pd.read_json(ROOT_DIR + "datasets/fossistant/github_issues_llm_based_annotation_input.jsonl",
                  lines=True)
df

In [None]:
# Label every issue the rule-based pass left unclassified
# (difficulty_model == "none") with an LLM behind an OpenAI-compatible endpoint.
# Rows that already carry a model name are skipped, so the cell can be re-run
# to resume from a checkpoint.
CHECKPOINT_PATH = ROOT_DIR + "datasets/fossistant/github_issues_llm_based_annotation_input.jsonl"
RATE_LIMIT_BACKOFF_START = 1.0  # seconds to wait after every model was rate limited
RATE_LIMIT_BACKOFF_MAX = 60.0   # upper bound for the exponential back-off

client = OpenAI(
    # Gemini through its OpenAI-compatible endpoint:
    # api_key=os.environ.get("GEMINI_API_KEY", ""),
    # base_url="https://generativelanguage.googleapis.com/v1beta/openai/",

    # The key is read from the environment so it never gets saved in the notebook.
    api_key=os.environ.get("CEREBRAS_API_KEY", ""),
    base_url="https://api.cerebras.ai/v1",
)

# Candidate models; on a rate limit the loop rotates to the next entry.
models = [
    # "gemini-2.5-flash-preview-05-20",
    # "gemini-2.0-flash",
    # "gemini-2.0-flash-lite",
    # "gemini-1.5-flash",

    "llama-4-scout-17b-16e-instruct",
]
model_index = 0


class Difficulty(BaseModel):
    """Structured-output schema: a single free-text difficulty label."""

    difficulty: str


# Keywords are looked up, in this order, inside the lower-cased model answer;
# an answer containing none of them is stored as 4 (unknown).
DIFFICULTY_KEYWORDS = (("easy", 0), ("medium", 1), ("hard", 2), ("misc", 3))
UNKNOWN_DIFFICULTY = 4

backoff = RATE_LIMIT_BACKOFF_START

try:
    for index, row in df.iterrows():
        if row["difficulty_model"] != "none":
            continue

        while True:
            model = models[model_index]

            try:
                completion = client.beta.chat.completions.parse(
                    model=model,
                    messages=row["messages"],
                    response_format=Difficulty,
                )
            except openai.RateLimitError:
                print(f"RateLimitError: {model} / Index: {index}")

                model_index = (model_index + 1) % len(models)
                if model_index == 0:
                    # Every model has been tried once (always the case with a
                    # single model): wait before retrying instead of hammering
                    # the API in a tight loop.
                    time.sleep(backoff)
                    backoff = min(backoff * 2, RATE_LIMIT_BACKOFF_MAX)
                continue
            # except openai.BadRequestError:
            #     print(f"BadRequestError: {model} / Index: {index}")
            #     df.loc[index, "difficulty_model"] = model
            #     df.loc[index, "difficulty"] = 5 # Error
            #     break

            backoff = RATE_LIMIT_BACKOFF_START

            # `parsed` can be None (for example when the model refuses to
            # answer); treat that as an unknown difficulty instead of crashing.
            parsed = completion.choices[0].message.parsed
            difficulty = parsed.difficulty.strip().lower() if parsed is not None else ""
            difficulty_index = next(
                (code for keyword, code in DIFFICULTY_KEYWORDS if keyword in difficulty),
                UNKNOWN_DIFFICULTY,
            )

            df.loc[index, "difficulty_model"] = model
            df.loc[index, "difficulty"] = difficulty_index
            break

        if index % 100 == 0:
            df.to_json(CHECKPOINT_PATH, orient="records", lines=True)
            print(f"Saved a checkpoint / Index: {index}")
finally:
    # Persist whatever was annotated, also when the loop is interrupted or an
    # unexpected API error escapes, so at most the current row is lost.
    df.to_json(CHECKPOINT_PATH, orient="records", lines=True)

In [None]:
# Final save of the annotated table, to the same file the annotation loop checkpoints to.
df.to_json(ROOT_DIR + "datasets/fossistant/github_issues_llm_based_annotation_input.jsonl",
           orient="records", lines=True)

In [None]:
# Keep only rows with a usable label: 0-3 are easy/medium/hard/misc, while 4 is
# the "unknown / not classified" code assigned earlier in the notebook.
train_df = df.query("difficulty <= 3")

def to_train_input(messages: list) -> str:
    """Return the user prompt with its trailing "Labels: ..." line removed.

    The classifier must not see the GitHub labels the annotation was derived
    from, so only the "Title: ...\\nBody: ..." part of the user message is kept.

    Args:
        messages: Chat messages as ``[system, user]`` dicts; the text is read
            from ``messages[1]["content"]``.

    Returns:
        The user content up to (not including) the last ``"\\nLabels:"``
        marker, or the whole content when no such marker exists.
    """
    content = messages[1]["content"]
    # rfind: the template appends the labels last, so cut at the final marker
    # even when the issue body itself contains a "Labels:" line.
    labels_idx = content.rfind("\nLabels:")
    if labels_idx == -1:
        # Without this guard content[:-1] would silently drop the last character.
        return content
    return content[:labels_idx]

# Build the training table: model input text plus an integer label column.
# `assign` returns a new frame, which avoids the SettingWithCopyWarning raised
# by adding a column to the filtered slice that `train_df` is at this point.
train_df = (
    train_df
    .assign(text=train_df["messages"].apply(to_train_input))
    .loc[:, ["text", "difficulty"]]
    .rename(columns={"difficulty": "labels"})
    .reset_index(drop=True)
)

train_df.to_json(ROOT_DIR + "datasets/fossistant/github_issues_llm_based_annotation.jsonl",
                 orient="records", lines=True)
train_df

In [None]:
# Re-read the training table (columns "text" and "labels") written by the previous cell.
train_df = pd.read_json(ROOT_DIR + "datasets/fossistant/github_issues_llm_based_annotation.jsonl",
                        lines=True)
train_df

In [None]:
# Schema: raw text plus a class label whose integer codes 0..3 map to
# easy / medium / hard / misc.
label_features = Features({
    "text": Value("string"),
    "labels": ClassLabel(names=["easy", "medium", "hard", "misc"]),
})
ds = Dataset.from_pandas(train_df, preserve_index=False, features=label_features)

# Simple 90/10 alternative (disabled):
# ds = ds.train_test_split(
#     test_size=0.1,
#     stratify_by_column="labels",
#     seed=42,
# )

# Stratified 80/10/10 split: hold out 20 % first, then halve the hold-out
# into validation and test.
split_options = {"stratify_by_column": "labels", "seed": 42}
train_valid_test_ds = ds.train_test_split(test_size=0.2, **split_options)
valid_test_ds = train_valid_test_ds["test"].train_test_split(test_size=0.5, **split_options)

ds = DatasetDict({
    "train": train_valid_test_ds["train"],
    "valid": valid_test_ds["train"],
    "test": valid_test_ds["test"],
})

ds.save_to_disk(ROOT_DIR + "datasets/fossistant/github_issues")
ds

In [None]:
# Show one randomly chosen training example as a quick sanity check.
random.choice(ds["train"])

In [None]:
# Class distribution of the training split (uncomment a line to inspect valid/test instead).
ds["train"].to_pandas()["labels"].value_counts()
# ds["valid"].to_pandas()["labels"].value_counts()
# ds["test"].to_pandas()["labels"].value_counts()

## Post-processing & Saving

In [None]:
# Reload the train/valid/test DatasetDict saved in the previous section.
ds = load_from_disk(ROOT_DIR + "datasets/fossistant/github_issues")
ds

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the model chosen in the configuration cell, capped at 1024 tokens.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.model_max_length = 1024

def tokenize(batch):
    """Tokenize the "text" column of one batch.

    No explicit max_length is passed, so padding="max_length" and
    truncation=True presumably fall back to tokenizer.model_max_length
    (1024, set above) -- TODO confirm against the tokenizer documentation.
    """
    # NOTE(review): return_tensors="pt" may be unnecessary inside Dataset.map,
    # which stores the result in its own format -- confirm before removing.
    return tokenizer(batch["text"], padding="max_length", truncation=True,
                     return_tensors="pt")

# Tokenize every split in batches, drop the raw text column, and save the result.
tokenized_ds = ds.map(tokenize, batched=True, remove_columns=["text"])
tokenized_ds.save_to_disk(ROOT_DIR + "datasets/fossistant/github_issues_tokenized")

# Column names available after tokenization.
tokenized_ds["train"].features.keys()