<a href="https://colab.research.google.com/github/25b3nk/hf-nlp-course/blob/main/course/en/chapter5/section5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

You will need to setup git, adapt your email and name in the following cell.

In [4]:
!git config --global user.email "csbhaskar95@gmail.com"
!git config --global user.name "25b3nk"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [5]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
!pip install requests



## Imports

In [7]:
from datasets import load_dataset

# Creating your own dataset

In [5]:
import requests

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [6]:
response.status_code

200

In [7]:
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/7381',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/7381/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/7381/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/7381/events',
  'html_url': 'https://github.com/huggingface/datasets/issues/7381',
  'id': 2815649092,
  'node_id': 'I_kwDODunzps6n02VE',
  'number': 7381,
  'title': 'Iterating over values of a column in the IterableDataset',
  'user': {'login': 'TopCoder2K',
   'id': 47208659,
   'node_id': 'MDQ6VXNlcjQ3MjA4NjU5',
   'avatar_url': 'https://avatars.githubusercontent.com/u/47208659?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/TopCoder2K',
   'html_url': 'https://github.com/TopCoder2K',
   'followers_url': 'https://api.github.com/users/TopCoder2K/followers',
   'f

In [8]:
from google.colab import userdata

GITHUB_TOKEN = userdata.get('GITHUB_API_KEY')  # Copy your GitHub token here
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [9]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [None]:
# Depending on your internet connection, this can take several minutes to run...
fetch_issues()

  0%|          | 0/100 [00:00<?, ?it/s]

Reached GitHub rate limit. Sleeping for one hour ...
Downloaded all the issues for datasets! Dataset stored at ./datasets-issues.jsonl


In [None]:
!head -n 2 /content/datasets-issues.jsonl

{"url":"https:\/\/api.github.com\/repos\/huggingface\/datasets\/issues\/7378","repository_url":"https:\/\/api.github.com\/repos\/huggingface\/datasets","labels_url":"https:\/\/api.github.com\/repos\/huggingface\/datasets\/issues\/7378\/labels{\/name}","comments_url":"https:\/\/api.github.com\/repos\/huggingface\/datasets\/issues\/7378\/comments","events_url":"https:\/\/api.github.com\/repos\/huggingface\/datasets\/issues\/7378\/events","html_url":"https:\/\/github.com\/huggingface\/datasets\/issues\/7378","id":2802957388,"node_id":"I_kwDODunzps6nEbxM","number":7378,"title":"Allow pushing config version to hub","user":{"login":"momeara","id":129072,"node_id":"MDQ6VXNlcjEyOTA3Mg==","avatar_url":"https:\/\/avatars.githubusercontent.com\/u\/129072?v=4","gravatar_id":"","url":"https:\/\/api.github.com\/users\/momeara","html_url":"https:\/\/github.com\/momeara","followers_url":"https:\/\/api.github.com\/users\/momeara\/followers","following_url":"https:\/\/api.github.com\/users\/momeara\/fol

In [None]:
df = pd.read_json("datasets-issues.jsonl", lines=True)

In [None]:
df.columns

Index(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url',
       'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels',
       'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments',
       'created_at', 'updated_at', 'closed_at', 'author_association',
       'sub_issues_summary', 'active_lock_reason', 'body', 'closed_by',
       'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason',
       'draft', 'pull_request'],
      dtype='object')

In [None]:
# print(len(df[df['closed_at'].isnull()]))
df['closed_at'] = df['closed_at'].fillna(pd.Timestamp('1970-01-01'))
# print(len(df[df['closed_at'].isnull()]))
df.to_json("datasets-issues-cleaned.jsonl", orient="records", lines=True)

In [None]:
print(df.dtypes)

url                                      object
repository_url                           object
labels_url                               object
comments_url                             object
events_url                               object
html_url                                 object
id                                        int64
node_id                                  object
number                                    int64
title                                    object
user                                     object
labels                                   object
state                                    object
locked                                     bool
assignee                                 object
assignees                                object
milestone                                object
comments                                  int64
created_at                  datetime64[ns, UTC]
updated_at                  datetime64[ns, UTC]
closed_at                               

In [None]:
import numpy as np
df['time_diff'] = df[df['closed_at'].notnull()]['closed_at'].astype(np.datetime64[ns]) - df[df['closed_at'].notnull()]['created_at']

In [None]:
from datasets import load_dataset, Features, Value, Dataset

# features = Features({
#     'closed_at': Value('timestamp[s]', id=None),
#     'merged_at': Value('timestamp[s]', id=None)
# })

# issues_dataset = load_dataset("json", data_files="datasets-issues-cleaned.jsonl", split="train")
issues_dataset = Dataset.from_pandas(df)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request'],
    num_rows: 7314
})

In [None]:
sample = issues_dataset.shuffle(seed=666).select(range(3))

# Print out the URL and pull request entries
for url, pr in zip(sample["html_url"], sample["pull_request"]):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

>> URL: https://github.com/huggingface/datasets/pull/3765
>> Pull request: {'diff_url': 'https://github.com/huggingface/datasets/pull/3765.diff', 'html_url': 'https://github.com/huggingface/datasets/pull/3765', 'merged_at': None, 'patch_url': 'https://github.com/huggingface/datasets/pull/3765.patch', 'url': 'https://api.github.com/repos/huggingface/datasets/pulls/3765'}

>> URL: https://github.com/huggingface/datasets/issues/545
>> Pull request: None

>> URL: https://github.com/huggingface/datasets/pull/526
>> Pull request: {'diff_url': 'https://github.com/huggingface/datasets/pull/526.diff', 'html_url': 'https://github.com/huggingface/datasets/pull/526', 'merged_at': '2020-08-24T12:50:42Z', 'patch_url': 'https://github.com/huggingface/datasets/pull/526.patch', 'url': 'https://api.github.com/repos/huggingface/datasets/pulls/526'}



In [None]:
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": False if x["pull_request"] is None else True}
)

Map:   0%|          | 0/7314 [00:00<?, ? examples/s]

In [None]:
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

In [None]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # print(response.status_code)
        # print(response.json())
        return []
    return [r["body"] for r in response.json()]


# Test our function works as expected
get_comments(2792)

[]

In [None]:
# Depending on your internet connection, this can take a few minutes...
issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments(x["number"])}
)

Map:   0%|          | 0/7314 [00:00<?, ? examples/s]

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
issues_with_comments_dataset.push_to_hub("github-issues")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/25b3nk/github-issues/commit/8e3e1d5ac4d7d54a02f0cecab6f3c46f848a948d', commit_message='Upload dataset', commit_description='', oid='8e3e1d5ac4d7d54a02f0cecab6f3c46f848a948d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/25b3nk/github-issues', endpoint='https://huggingface.co', repo_type='dataset', repo_id='25b3nk/github-issues'), pr_revision=None, pr_num=None)

In [None]:
remote_dataset = load_dataset("25b3nk/github-issues", split="train")
remote_dataset

README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.96M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7314 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 7314
})

# Assignment
- Create a dataset with issues from ollama
- Upload the dataset
- Collate the dataset and load it for finetuning
- Train a multi-label classifier and do inference on new issues created or the test set

## Create and upload the dataset
https://github.com/ollama/ollama/issues

In [10]:
fetch_issues(owner="ollama",repo="ollama")

  0%|          | 0/100 [00:00<?, ?it/s]

Reached GitHub rate limit. Sleeping for one hour ...
Downloaded all the issues for ollama! Dataset stored at ./ollama-issues.jsonl


In [13]:
!head -n 2 /content/ollama-issues.jsonl

{"url":"https:\/\/api.github.com\/repos\/ollama\/ollama\/issues\/8691","repository_url":"https:\/\/api.github.com\/repos\/ollama\/ollama","labels_url":"https:\/\/api.github.com\/repos\/ollama\/ollama\/issues\/8691\/labels{\/name}","comments_url":"https:\/\/api.github.com\/repos\/ollama\/ollama\/issues\/8691\/comments","events_url":"https:\/\/api.github.com\/repos\/ollama\/ollama\/issues\/8691\/events","html_url":"https:\/\/github.com\/ollama\/ollama\/pull\/8691","id":2820813762,"node_id":"PR_kwDOJ0Z1Ps6Jf9LD","number":8691,"title":"Fix install_cuda_driver_yum() for dnf5","user":{"login":"FreeCap23","id":62378314,"node_id":"MDQ6VXNlcjYyMzc4MzE0","avatar_url":"https:\/\/avatars.githubusercontent.com\/u\/62378314?v=4","gravatar_id":"","url":"https:\/\/api.github.com\/users\/FreeCap23","html_url":"https:\/\/github.com\/FreeCap23","followers_url":"https:\/\/api.github.com\/users\/FreeCap23\/followers","following_url":"https:\/\/api.github.com\/users\/FreeCap23\/following{\/other_user}","gis

In [14]:
from datasets import load_dataset

issues_dataset = load_dataset("json", data_files="ollama-issues.jsonl", split="train")
issues_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason'],
    num_rows: 8576
})

In [15]:
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": False if x["pull_request"] is None else True}
)

Map:   0%|          | 0/8576 [00:00<?, ? examples/s]

In [16]:
issues_dataset.push_to_hub("ollama-github-issues")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/25b3nk/ollama-github-issues/commit/9cf5821116dec416752735e5667797592bb61dd0', commit_message='Upload dataset', commit_description='', oid='9cf5821116dec416752735e5667797592bb61dd0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/25b3nk/ollama-github-issues', endpoint='https://huggingface.co', repo_type='dataset', repo_id='25b3nk/ollama-github-issues'), pr_revision=None, pr_num=None)

In [17]:
issues_dataset = load_dataset("json", data_files="ollama-issues.jsonl")
issues_dataset

DatasetDict({
    train: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason'],
        num_rows: 8576
    })
})

In [18]:
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": False if x["pull_request"] is None else True}
)

In [20]:
train_test_split = issues_dataset["train"].train_test_split(test_size=0.2)

In [22]:
train_test_split.push_to_hub("ollama-github-issues")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/5.72k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/25b3nk/ollama-github-issues/commit/9e1f69c434f77214462ccc02ea8e0776962a6976', commit_message='Upload dataset', commit_description='', oid='9e1f69c434f77214462ccc02ea8e0776962a6976', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/25b3nk/ollama-github-issues', endpoint='https://huggingface.co', repo_type='dataset', repo_id='25b3nk/ollama-github-issues'), pr_revision=None, pr_num=None)

## Load dataset from hugging face

In [2]:
from datasets import load_dataset

remote_dataset = load_dataset("25b3nk/ollama-github-issues")
remote_dataset

DatasetDict({
    train: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
        num_rows: 6860
    })
    test: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason

In [3]:
remote_dataset["train"][2]["labels"]

[{'id': 5667396200,
  'node_id': 'LA_kwDOJ0Z1Ps8AAAABUc2aaA',
  'url': 'https://api.github.com/repos/ollama/ollama/labels/feature%20request',
  'name': 'feature request',
  'color': 'a2eeef',
  'default': False,
  'description': 'New feature or request'},
 {'id': 5860134234,
  'node_id': 'LA_kwDOJ0Z1Ps8AAAABXUqNWg',
  'url': 'https://api.github.com/repos/ollama/ollama/labels/windows',
  'name': 'windows',
  'color': '0052CC',
  'default': False,
  'description': ''}]

In [4]:
filtered_dataset = remote_dataset.filter(lambda x: x["is_pull_request"] == False).filter(lambda x: x["body"] is not None)

In [5]:
filtered_dataset

DatasetDict({
    train: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
        num_rows: 4484
    })
    test: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason

In [6]:
all_labels = set()
for labels in filtered_dataset["train"]["labels"]:
  for label in labels:
    all_labels.add(label["name"])

In [7]:
label2id = {label: i for i, label in enumerate(all_labels)}
id2label = {i: label for i, label in enumerate(all_labels)}

In [8]:
import numpy as np

def encode_labels(example):
    labels = example['labels']  # Split string into individual labels
    label_ids = [label2id[label["name"]] for label in labels if label["name"] in label2id]

    # Create a multi-hot vector with the length of all unique labels
    multi_hot = np.zeros(len(all_labels), dtype=int)
    multi_hot[label_ids] = 1
    example['multi_hot_labels'] = multi_hot.tolist()  # Convert back to a list to save in dataset
    example['label_ids'] = label_ids
    return example

In [9]:
encoded_dataset = filtered_dataset.map(encode_labels)

Map:   0%|          | 0/4484 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

## Ready the tokenizer & tokenize the dataset

In [10]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [11]:
encoded_dataset["train"].features["body"]
tokenizer(encoded_dataset["train"][0]["body"], truncation=True)

{'input_ids': [101, 1045, 4384, 2008, 1996, 19330, 10278, 2050, 2544, 12057, 2004, 8946, 2121, 11661, 2038, 2042, 7172, 2000, 1014, 1012, 1015, 1012, 2654, 1998, 2947, 2323, 2448, 2732, 16044, 2099, 2475, 1998, 19073, 4275, 1011, 1045, 2572, 2145, 2025, 2383, 6735, 2770, 2216, 1010, 19330, 10278, 2050, 2074, 19119, 1012, 1012, 1012, 2572, 1045, 4394, 2242, 1029, 16770, 1024, 1013, 1013, 19351, 8428, 1012, 4012, 1013, 2632, 3501, 12881, 13213, 2629, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
def tokenize_github_issues(examples):
  return tokenizer(examples["body"], truncation=True, padding=True)

In [14]:
tokenized_datasets = encoded_dataset.map(tokenize_github_issues, batched=True)

Map:   0%|          | 0/4484 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

## Getting training setup

In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request', 'multi_hot_labels', 'label_ids', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4484
    })
    test: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', '

In [16]:
tokenized_datasets = tokenized_datasets.remove_columns(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'])

In [17]:
tokenized_datasets = tokenized_datasets.rename_column("multi_hot_labels", "labels")

In [18]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'label_ids', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4484
    })
    test: Dataset({
        features: ['labels', 'label_ids', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1152
    })
})

In [19]:
import torch

tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
# tokenized_datasets = tokenized_datasets.map(lambda x: {'labels': torch.FloatTensor(x['labels'])})

In [20]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=2, collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=1, collate_fn=data_collator
)

In [22]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(all_labels),id2label=id2label, label2id=label2id, problem_type="multi_label_classification")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
for batch in train_dataloader:
  val = {k: v.shape for k, v in batch.items()}
  print(f"{val}")
  batch['labels'] = batch['labels'].float()
  outputs = model(**batch)
  print(outputs.loss, outputs.logits.shape)
  break

{'labels': torch.Size([2, 35]), 'input_ids': torch.Size([2, 512]), 'attention_mask': torch.Size([2, 512])}
tensor(0.6849, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>) torch.Size([2, 35])


In [43]:
# from transformers import TrainingArguments

# # Specify training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     logging_dir="./logs",
# )



In [46]:
# from transformers import Trainer

# # Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["test"],
# )

In [None]:
# # Train the model
# trainer.train()

In [28]:
import torch
from transformers import AdamW
from sklearn.metrics import accuracy_score, f1_score

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Custom loss function (BCEWithLogitsLoss for multi-label classification)
loss_fn = torch.nn.BCEWithLogitsLoss()

# Function for calculating metrics
def compute_metrics(preds, labels):
    sigmoid_preds = torch.sigmoid(preds).cpu().numpy()
    sigmoid_preds = (sigmoid_preds > 0.5).astype(int)  # Convert logits to 0 or 1

    labels = labels.cpu().numpy()
    acc = accuracy_score(labels, sigmoid_preds)
    f1 = f1_score(labels, sigmoid_preds, average='micro')

    return {"accuracy": acc, "f1": f1}




In [None]:
from tqdm import tqdm

# Training loop
epochs = 3

for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()

        # Move batch to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].float().to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs} completed.")

    # Evaluate the model after each epoch
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].float().to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.append(outputs.logits)
            true_labels.append(labels)

    preds = torch.cat(preds)
    true_labels = torch.cat(true_labels)
    metrics = compute_metrics(preds, true_labels)

    print(f"Validation metrics: {metrics}")


  0%|          | 2/2242 [00:34<10:19:40, 16.60s/it]