In [1]:
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig, XLNetTokenizer, BertTokenizer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch
import re
from huggingface_hub import HfApi, login
from dotenv import load_dotenv
import shutil
import gc
from tokenizers import Tokenizer
import time

### Log in to the Hugging Face Hub

In [2]:
# Load environment variables via python-dotenv, then read the Hub access token.
load_dotenv()
token = os.environ.get("HUGGINGFACE_TOKEN")

# Authenticate once; `token` and `api` are reused by prepare_and_push_model().
login(token=token)
api = HfApi()

### Configuration and Helper Functions

In [3]:
# --- Staging directory used while assembling a repo before upload -----------
TEMP_DIR = "temp"
TEMP_MODEL_DIR = TEMP_DIR + "/model"

# --- Tokenizer / base model selection ----------------------------------------
# Supported TOKENIZER keys: llama3, wordlevel, sentencepiece, wordpiece, deepseek, qwen
TOKENIZER = 'qwen'
# HF_MODEL_NAME = "meta-llama/Llama-3.2-1B"
# HF_MODEL_NAME = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'
HF_MODEL_NAME = 'Qwen/Qwen3-1.7B'

# --- Device ------------------------------------------------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if DEVICE.type == "cuda":
    # Drop any cached allocations, then cap this process at 80% of GPU memory.
    torch.cuda.empty_cache()
    torch.cuda.set_per_process_memory_fraction(0.8)

# --- Train/test combinations to upload ---------------------------------------
# Three groups of {'train': [project], 'test': [project]} entries. main() uses
# the group index to build the repo-name prefix.
#   group 0: train and test on the same project
#   groups 1 and 2: train on one project, test on another
# Comment a name in/out to include or skip that model.
MODELS = [
    [
        {'train': [name], 'test': [name]}
        for name in (
            # 'appceleratorstudio',
            # 'aptanastudio',
            # 'bamboo',
            # 'clover',
            # 'datamanagement',
            # 'duracloud',
            # 'jirasoftware',
            # 'mesos',
            # 'moodle',
            # 'mule',
            # 'mulestudio',
            # 'springxd',
            # 'talenddataquality',
            # 'talendesb',
            # 'titanium',
            'usergrid',
        )
    ],
    [
        {'train': [src], 'test': [dst]}
        for src, dst in (
            ('mesos', 'usergrid'),
            ('usergrid', 'mesos'),
            ('appceleratorstudio', 'aptanastudio'),
            ('appceleratorstudio', 'titanium'),
            ('titanium', 'appceleratorstudio'),
            ('aptanastudio', 'titanium'),
            ('mule', 'mulestudio'),
            ('mulestudio', 'mule'),
        )
    ],
    [
        {'train': [src], 'test': [dst]}
        for src, dst in (
            ('clover', 'usergrid'),
            ('talendesb', 'mesos'),
            ('talenddataquality', 'aptanastudio'),
            ('mule', 'titanium'),
            ('talenddataquality', 'appceleratorstudio'),
            ('mulestudio', 'titanium'),
            ('appceleratorstudio', 'mulestudio'),
            ('appceleratorstudio', 'mule'),
        )
    ],
]

def readme(base_model_name, train_project, test_project, epochs, batch_size, training_time, mae, mdae, repo_url, tokenizer_name=None):
  """Build the README.md (model card) text for one uploaded model.

  Args:
    base_model_name: Hub id written to the `base_model` front-matter field.
    train_project: Name of the project the model was trained on.
    test_project: Name of the project the model was evaluated on.
    epochs: Best training epoch, rendered as "<epochs> / 20 epochs".
    batch_size: Training batch size.
    training_time: Training time in seconds (rendered with 3 decimals).
    mae: Mean absolute error (rendered with 3 decimals).
    mdae: Median absolute error (rendered with 3 decimals).
    repo_url: Hub repo id inserted into the usage snippet.
    tokenizer_name: Tokenizer key ('llama3', 'wordlevel', 'sentencepiece',
      'wordpiece', 'deepseek', 'qwen') that selects the tokenizer section,
      license and display names. Defaults to the module-level TOKENIZER.
      Unrecognised keys fall back to the Llama 3.2 defaults with no
      tokenizer section.

  Returns:
    The model card as one string: YAML front matter followed by Markdown.
  """
  if tokenizer_name is None:
    tokenizer_name = TOKENIZER

  # Same-project models list a single dataset and use the bare project name.
  if train_project == test_project:
    project_name = train_project
    datasets = f"- {train_project}"
  else:
    project_name = f"{train_project} - {test_project}"
    datasets = f"- {train_project}\n- {test_project}"

  # Defaults describe the Llama 3.2 1B base; the deepseek/qwen branches override them.
  tokenizer = ''
  tokenizerImports = ''
  tokenizerUtilization = ''
  licence = 'llama3.2'
  name = 'llama-3.2-1b-story-point-estimation'
  modelName = 'LLAMA 3'
  baseModel = 'Llama 3.2 1B'

  if tokenizer_name == 'llama3':
    tokenizerImports = "from transformers import AutoTokenizer"
    tokenizerUtilization = f"""tokenizer = AutoTokenizer.from_pretrained("{repo_url}")"""
  elif tokenizer_name == 'wordlevel':
    tokenizer = '- Tokenizer: SP Word Level'
    tokenizerImports = "from tokenizers import Tokenizer"
    tokenizerUtilization = f"""tokenizer = Tokenizer.from_pretrained("{repo_url}")"""
  elif tokenizer_name == 'sentencepiece':
    tokenizer = '- Tokenizer: SP SentencePiece'
    tokenizerImports = "from transformers import XLNetTokenizer"
    tokenizerUtilization = """tokenizer = XLNetTokenizer('spm_tokenizer.model', padding_side='right')"""
  elif tokenizer_name == 'wordpiece':
    tokenizer = '- Tokenizer: SP WordPiece'
    tokenizerImports = "from transformers import BertTokenizer"
    tokenizerUtilization = """tokenizer = BertTokenizer('vocab.txt')"""
  elif tokenizer_name == 'deepseek':
    tokenizer = '- Tokenizer: DeepSeek BPE Tokenizer'
    tokenizerImports = "from transformers import AutoTokenizer"
    tokenizerUtilization = f"""tokenizer = AutoTokenizer.from_pretrained("{repo_url}")"""
    licence = 'apache-2.0'
    name = 'DeepSeek-R1-Distill-Qwen-1.5B-story-point-estimation'
    modelName = 'DeepSeek R1 Qwen'
    baseModel = 'DeepSeek R1 Distill Qwen 1.5B'
  elif tokenizer_name == 'qwen':
    tokenizer = '- Tokenizer: Qwen BPE Tokenizer'
    tokenizerImports = "from transformers import AutoTokenizer"
    tokenizerUtilization = f"""tokenizer = AutoTokenizer.from_pretrained("{repo_url}")"""
    licence = 'apache-2.0'
    name = 'Qwen3-story-point-estimation'
    modelName = 'Qwen 3'
    baseModel = 'Qwen 3'

  # The usage snippet references torch.float16, so it must import torch itself.
  return f"""---
license: {licence}
language:
- en
base_model:
- {base_model_name}
pipeline_tag: text-classification
library_name: transformers
tags:
- regression
- story-point-estimation
- software-engineering
datasets:
{datasets}
metrics:
- mae
- mdae

model-index:
- name: {name}
  results:
  - task:
      type: regression
      name: Story Point Estimation
    dataset:
      type: {test_project}
      name: {test_project} Dataset
      split: test
    metrics:
      - type: mae
        value: {mae:.3f}
        name: Mean Absolute Error (MAE)
      - type: mdae
        value: {mdae:.3f}
        name: Median Absolute Error (MdAE)
---
# {modelName} Story Point Estimator - {project_name}

This model is fine-tuned on issue descriptions from {train_project} and tested on {test_project} for story point estimation.

## Model Details
- Base Model: {baseModel}
- Training Project: {train_project}
- Test Project: {test_project}
- Task: Story Point Estimation (Regression)
- Architecture: PEFT (LoRA)
{tokenizer}

- Input: Issue titles
- Output: Story point estimation (continuous value)

## Usage
```python
import torch
from transformers import AutoModelForSequenceClassification
from peft import PeftConfig, PeftModel
{tokenizerImports}

# Load peft config model
config = PeftConfig.from_pretrained("{repo_url}")

# Load tokenizer and model
{tokenizerUtilization}
base_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=1,
    torch_dtype=torch.float16,
    device_map='auto'
)
model = PeftModel.from_pretrained(base_model, "{repo_url}")

# Prepare input text
text = "Your issue description here"
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=20, padding="max_length")

# Get prediction
outputs = model(**inputs)
story_points = outputs.logits.item()
```

## Training Details
- Fine-tuning method: LoRA (Low-Rank Adaptation)
- Sequence length: 20 tokens
- Best training epoch: {epochs} / 20 epochs
- Batch size: {batch_size}
- Training time: {training_time:.3f} seconds
- Mean Absolute Error (MAE): {mae:.3f}
- Median Absolute Error (MdAE): {mdae:.3f}
"""

def prepare_and_push_model(
    base_model_name="meta-llama/Llama-3.2-1B",
    trained_model_path="./models/aptanastudio_aptanastudio_epo_X",
    new_model_name="DEVCamiloSepulveda/0-LLAMA3SP-appceleratorstudio",
    train_project="mesos",
    test_project="usergrid",
    mae=0.0,
    mdae=0.0,
    training_time=0.0,
    epochs=0,
    batch_size=32
):
  """Stage one fine-tuned model in TEMP_MODEL_DIR and upload it to the Hub.

  Steps: write the README, rebuild the 4-bit quantized base model with LoRA
  adapters, load the trained weights from `trained_model_path`, stage the
  tokenizer files selected by the module-level TOKENIZER, save the model,
  delete any existing Hub repo named `new_model_name`, recreate it and upload
  the staging folder (retrying up to 5 times). The staging folder is removed
  after a successful upload.

  Args:
    base_model_name: Hub id written into the README front matter.
      NOTE(review): the weights are loaded from the module-level
      HF_MODEL_NAME, not from this argument - keep the two in sync.
    trained_model_path: Path of the checkpoint passed to `torch.load`.
    new_model_name: Target Hub repo id. An existing repo with this id is
      deleted first.
    train_project: Training project name (README only).
    test_project: Test project name (README only).
    mae: Mean absolute error (README only).
    mdae: Median absolute error (README only).
    training_time: Training time in seconds (README only).
    epochs: Best training epoch (README only).
    batch_size: Training batch size (README only).

  Raises:
    RuntimeError: if the upload still fails after the last retry.
  """
  gc.collect()
  torch.cuda.empty_cache()

  # Start from an empty staging directory. shutil is used instead of shelling
  # out to `rm -rf`, so this does not depend on a POSIX shell.
  shutil.rmtree(TEMP_MODEL_DIR, ignore_errors=True)
  os.makedirs(TEMP_MODEL_DIR, exist_ok=True)

  # Create README.md
  with open(f"{TEMP_MODEL_DIR}/README.md", "w", encoding="utf-8") as f:
    f.write(readme(base_model_name, train_project, test_project, epochs, batch_size, training_time, mae, mdae, new_model_name))

  # Rebuild the quantized base model + LoRA adapters, then load the trained weights.
  quantization_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type='nf4',
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.float16,
  )
  lora_config = LoraConfig(
      r=8,
      lora_alpha=16,
      target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
      lora_dropout=0.1,
      bias='none',
      task_type='SEQ_CLS'
  )
  MODEL = AutoModelForSequenceClassification.from_pretrained(
      HF_MODEL_NAME,
      quantization_config=quantization_config,
      num_labels=1,
      torch_dtype=torch.float16,
      device_map='auto',
      low_cpu_mem_usage=True
  )
  MODEL = prepare_model_for_kbit_training(MODEL)
  MODEL = get_peft_model(MODEL, lora_config)
  MODEL.gradient_checkpointing_enable()
  MODEL.enable_input_require_grads()

  state_dict = torch.load(trained_model_path, map_location=DEVICE, weights_only=True)
  # strict=False tolerates key mismatches; report ignored checkpoint keys so a
  # wrong checkpoint is not uploaded silently.
  load_result = MODEL.load_state_dict(state_dict, strict=False)
  if load_result.unexpected_keys:
    print(f"Warning: {len(load_result.unexpected_keys)} checkpoint keys did not match the model and were ignored.")

  # Stage tokenizer files and set the model's pad token id.
  tokenizer = None
  if TOKENIZER == 'llama3':
    print('using llama3 tokenizer!')
    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME, add_prefix_space=True)

    # This branch reuses EOS as the padding token.
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token
    MODEL.config.pad_token_id = tokenizer.pad_token_id

    tokenizer.save_pretrained(f"./{TEMP_MODEL_DIR}")
  elif TOKENIZER == 'wordlevel':
    print('using wordlevel tokenizer!')
    tokenizer = Tokenizer.from_file('all_tokenizers/sp_word_level/wordlevel.json')
    MODEL.config.pad_token_id = 3

    shutil.copy('all_tokenizers/sp_word_level/wordlevel.json',
            os.path.join(TEMP_MODEL_DIR, 'tokenizer.json'))
  elif TOKENIZER == 'sentencepiece':
    print('using sentencepiece tokenizer!')
    tokenizer = XLNetTokenizer('all_tokenizers/sp_sentence_piece/spm_tokenizer.model', padding_side='right')
    MODEL.config.pad_token_id = 0

    shutil.copy('all_tokenizers/sp_sentence_piece/spm_tokenizer.model',
            os.path.join(TEMP_MODEL_DIR, 'spm_tokenizer.model'))
    shutil.copy('all_tokenizers/sp_sentence_piece/spm_tokenizer.vocab',
            os.path.join(TEMP_MODEL_DIR, 'spm_tokenizer.vocab'))
  elif TOKENIZER == 'wordpiece':
    print('using wordpiece tokenizer!')
    tokenizer = BertTokenizer('all_tokenizers/sp_word_piece/vocab.txt')
    MODEL.config.pad_token_id = 0

    shutil.copy('all_tokenizers/sp_word_piece/vocab.txt',
            os.path.join(TEMP_MODEL_DIR, 'vocab.txt'))
  elif TOKENIZER == 'deepseek':
    print('using deepseek tokenizer!')
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
    MODEL.config.pad_token_id = tokenizer.pad_token_id

    # The generated README loads the tokenizer from the uploaded repo, so the
    # tokenizer files must be staged here as in the qwen branch.
    tokenizer.save_pretrained(f"./{TEMP_MODEL_DIR}")
  elif TOKENIZER == 'qwen':
    print('using qwen tokenizer!')
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")
    MODEL.config.pad_token_id = tokenizer.pad_token_id

    tokenizer.save_pretrained(f"./{TEMP_MODEL_DIR}")
  else:
    print(f"Warning: unknown TOKENIZER {TOKENIZER!r}; no tokenizer files will be uploaded.")

  # Original author's note: use_cache must be False here or it crashes.
  MODEL.config.use_cache = False
  MODEL.config.pretraining_tp = 1

  # NOTE(review): unconditional .cuda() requires a CUDA device.
  MODEL.cuda()

  MODEL.save_pretrained(f"./{TEMP_MODEL_DIR}")
  # Also save the full state dict alongside the adapter files.
  torch.save(MODEL.state_dict(), f"./{TEMP_MODEL_DIR}/pytorch_model.bin")

  # Release model and tokenizer memory before the upload.
  MODEL.cpu()
  del MODEL
  del tokenizer
  gc.collect()
  torch.cuda.empty_cache()

  # Delete existing repo and recreate
  try:
    api.delete_repo(repo_id=new_model_name, token=token)
  except Exception as e:
    print(f"Repo might not exist or error deleting: {e}")

  # Create repo
  api.create_repo(
    repo_id=new_model_name,
    repo_type="model",
    token=token
  )

  MAX_RETRIES = 5
  RETRY_DELAY = 5  # seconds between attempts

  for attempt in range(1, MAX_RETRIES + 1):
    try:
      # Push model
      api.upload_folder(
          folder_path=TEMP_MODEL_DIR,
          repo_id=new_model_name,
          token=token
      )
      print("Upload successful.")
      break
    except Exception as e:
      print(f"Upload failed (attempt {attempt}/{MAX_RETRIES}): {e}")
      if attempt == MAX_RETRIES:
        raise RuntimeError(f"Upload failed after {MAX_RETRIES} attempts.") from e
      time.sleep(RETRY_DELAY)

  # Delete temp dir
  shutil.rmtree(TEMP_MODEL_DIR, ignore_errors=True)
  gc.collect()
  torch.cuda.empty_cache()

### Upload the trained models

In [None]:
def main():
    global MODELS, TOKENIZER, HF_MODEL_NAME
    project = 'LLAMA3SP'

    if TOKENIZER == 'llama3':
        caracter = "0"
    elif TOKENIZER == 'wordlevel':
        caracter = "2"
    elif TOKENIZER == 'sentencepiece':
        caracter = "7"
    elif TOKENIZER == 'wordpiece':
        caracter = "6"
    elif TOKENIZER == 'deepseek':
        caracter = "1"
        project = 'DeepSeekR1SP'
    elif TOKENIZER == 'qwen':
        caracter = "3"
        project = 'Qwen3SP'

    for i, model in enumerate(MODELS):
        identifier = caracter * (i + 1)
        for j, data in enumerate(model):
            train_project = data['train'][0]
            test_project = data['test'][0]
            if (i == 0):
                model_name = train_project;
            else:
                model_name = f"{train_project}-{test_project}"
            model_name = f"DEVCamiloSepulveda/{identifier}-{project}-{model_name}"
            print(f"Uploading model {model_name}...")
            # Open the file in results to upload the model
            with open(f"./results/{train_project}_{test_project}.txt", "r") as f:
                model_results = f.read()
                mae, mdae, training_time, epochs, batch_size = (
                    float(re.search(r"MAE:\s*([\d.]+)", model_results).group(1)),
                    float(re.search(r"MdAE:\s*([\d.]+)", model_results).group(1)),
                    float(re.search(r"training time:\s*([\d.]+)", model_results).group(1)),
                    int(re.search(r"Epochs:\s*(\d+)", model_results).group(1)),
                    int(re.search(r"batch size:\s*(\d+)", model_results).group(1))
                )
            trained_model_path = os.path.join(f"./models/", f"{train_project}_{test_project}_epo_{epochs}")
            print(f"Model path: {trained_model_path}")
            gc.collect()
            torch.cuda.empty_cache()
            prepare_and_push_model(
                base_model_name=HF_MODEL_NAME,
                trained_model_path=trained_model_path,
                new_model_name=model_name,
                train_project=train_project,
                test_project=test_project,
                mae=mae,
                mdae=mdae,
                training_time=training_time,
                epochs=epochs,
                batch_size=batch_size
            )

# Run the upload only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()