# Llama3SP - Model Inspection Notebook

#### Hello!  Welcome to the model inspection notebook, we provides the scripts for model loading and model inference on testing data of model training process for all of the models mentioned in our experiments in the paper. 

##### Attention!!!
##### Before interacting with this notebook, you may want to install a few dependencies [HERE](#dependencies).
##### Also, make sure to run the [Static Methods](#static-method) cell, then you are good to go
#### The models are categorized by the experiment scenario, please follow the link as follows to reach the specific section

### 1. [Within Project Models](#within_project)
### 2. [Cross Project Models](#cross_project)

<a id='dependencies'></a>
## Dependencies Installation
#### run the cell below to install the dependencies

In [None]:
%!pip install transformers
%!pip install peft
%!pip install torch
%!pip install tokenizers
%!pip install captum

<a id='static-method'></a>
## Static Methods

In [1]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from Llama3SP import LlamaForSequenceClassification as Llama3SP
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    XLNetTokenizer,
    BertTokenizer,
    BitsAndBytesConfig,
)
from peft import (
    PeftModel,
    PeftConfig,
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from tokenizers import Tokenizer
from pathlib import Path

import torch
import pandas as pd
import numpy as np
import os
import gc
import shutil
import re

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Tokenizer Mapping
TOKENIZER_MAPPING = {"#0": "llama3", "#00": "llama3", "#000": "llama3",
                     "#2": "sp_word_level", "#22": "sp_word_level", "#222": "sp_word_level",
                     "#6": "wordpiece_sp", "#66": "wordpiece_sp", "#666": "wordpiece_sp",
                     "#7": "sentencepiece_sp", "#77": "sentencepiece_sp", "#777": "sentencepiece_sp"}

# pad token ID mapping
PAD_TOKEN_ID_MAPPING = {"llama3": 128001, "sp_word_level": 3, "wordpiece_sp": 0, "sentencepiece_sp": 0}
# static global vars
global DYNAMIC_BATCH, DEVICE
DYNAMIC_BATCH = True
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if torch.cuda.is_available():
    # set up to release cache memory when possible
    torch.cuda.empty_cache()
    # set up more conservative memory limits  
    torch.cuda.set_per_process_memory_fraction(0.8)  # Use only 80% of GPU memory
SEQUENCE_LEN = 20
# dynamic global vars
global PAD_TOKEN_ID, BATCH_SIZE_RATIO, BATCH_SIZE, TOKENIZER, WITHIN_PROJECT, TEXT, KEY, TOK, MODEL, PROJECT_ID
PAD_TOKEN_ID = None
BATCH_SIZE_RATIO = None
BATCH_SIZE = None
TOKENIZER = None
WITHIN_PROJECT = None
TEXT = None
KEY = None
TOK = None
MODEL = None
PROJECT_ID = None


def tokenization(text_list, path):
    global TOKENIZER, SEQUENCE_LEN, MODEL, TOK, PAD_TOKEN_ID
    
    if TOKENIZER == 'llama3':
        print('using llama3 tokenizer!')
        tokenizer = AutoTokenizer.from_pretrained(path)

        tokenizer.pad_token = tokenizer.eos_token
        MODEL.config.pad_token_id = MODEL.config.eos_token_id 
        
        # ensure that no sequence exceeds SEQUENCE_LEN
        encoded_dict = tokenizer.batch_encode_plus(
            text_list,
            max_length=SEQUENCE_LEN,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True,
            return_token_type_ids=False
        )
        
        # Verify dimensions
        if encoded_dict['input_ids'].shape[1] > SEQUENCE_LEN:
            print(f"Warning: Truncating sequences to {SEQUENCE_LEN} tokens")
            encoded_dict['input_ids'] = encoded_dict['input_ids'][:, :SEQUENCE_LEN]
            if 'attention_mask' in encoded_dict:
                encoded_dict['attention_mask'] = encoded_dict['attention_mask'][:, :SEQUENCE_LEN]
        
        TOK = tokenizer
        return encoded_dict
    elif TOKENIZER == 'sp_word_level':
        print('using word-level tokenizer!')
        tokenizer = Tokenizer.from_pretrained(path)
        encoded_sentences = {'input_ids':[]}
        for sentence in text_list:
            encoded = tokenizer.encode(sentence)
            encoded = encoded.ids
            if len(encoded) > SEQUENCE_LEN:
                encoded = encoded[:SEQUENCE_LEN]
            elif len(encoded) < SEQUENCE_LEN:
                padding = SEQUENCE_LEN - len(encoded)
                for _ in range(padding):
                    encoded.append(3)
            encoded_sentences['input_ids'].append(encoded)
        tokenizer.pad_token_id = PAD_TOKEN_ID
        MODEL.config.pad_token_id = PAD_TOKEN_ID

        TOK = tokenizer
        return encoded_sentences
    elif TOKENIZER == 'sentencepiece_sp':
        print('using sentencepiece tokenizer!')
        tokenizer = XLNetTokenizer('all_tokenizers/sp_sentence_piece/spm_tokenizer.model', padding_side='right')

        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token
        # update some model configs
        # must use .cache = False as below or it crashes from my experience
        MODEL.config.pad_token_id = tokenizer.pad_token_id
        MODEL.config.use_cache = False
        MODEL.config.pretraining_tp = 1
        return tokenizer.batch_encode_plus(text_list, truncation=True, max_length=SEQUENCE_LEN, padding='max_length')
    elif TOKENIZER == 'wordpiece_sp':
        print('using wordpiece tokenizer!')
        tokenizer = BertTokenizer('all_tokenizers/sp_word_piece/vocab.txt')

        MODEL.config.pad_token_id = tokenizer.pad_token_id
        MODEL.config.use_cache = False
        MODEL.config.pretraining_tp = 1
        return tokenizer.batch_encode_plus(text_list, truncation=True, max_length=SEQUENCE_LEN, padding='max_length')


def prepare_dataframe(file_name):
    data = pd.read_csv(file_name)
    # some rows have no description, fill blank to avoid Null
    data = data.fillna(' ')
    d = {'text': data['title'], 'label': data['storypoint'], 'issuekey': data['issuekey']}
    return pd.DataFrame(data=d)


def prepare_dataloader(seq, y, sampler_type):
    global BATCH_SIZE
    tensor_dataset = TensorDataset(seq, y)
    if sampler_type == 'random':
        sampler = RandomSampler(tensor_dataset)
    elif sampler_type == 'sequential':
        sampler = SequentialSampler(tensor_dataset)
    dataloader = DataLoader(tensor_dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataloader


def load_trained_model(model_id, project_name):
    global WITHIN_PROJECT, MODEL, PROJECT_ID

    local = False
    try:
        int(model_id[1:])
        local = False
    except:
        local = True

    if local:
        print("Loading model from local...")

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
        lora_config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
            lora_dropout=0.1,
            bias='none',
            task_type='SEQ_CLS'
        )
        HF_MODEL_NAME = "meta-llama/Llama-3.2-1B"
        MODEL = AutoModelForSequenceClassification.from_pretrained(
            HF_MODEL_NAME,
            quantization_config=quantization_config,
            num_labels=1,
            torch_dtype=torch.float16,
            device_map='auto',
            low_cpu_mem_usage=True,
            pad_token_id=PAD_TOKEN_ID,
        )
        MODEL = prepare_model_for_kbit_training(MODEL)
        MODEL = get_peft_model(MODEL, lora_config)
        MODEL.gradient_checkpointing_enable()
        MODEL.enable_input_require_grads()

        state_dict = torch.load(model_id, map_location=DEVICE, weights_only=True)
        MODEL.load_state_dict(state_dict, strict=False)

    else:
        print("Loading model from Hugging Face...")

        if WITHIN_PROJECT:
            path = "DEVCamiloSepulveda/" + model_id[1:] + "-LLAMA3SP-" + project_name
        else:
            path = "DEVCamiloSepulveda/" + model_id[1:] + "-LLAMA3SP-" + project_name.split("_")[0] + "-" + project_name.split("_")[1]
        PROJECT_ID = path

        # Load the model configuration
        config = PeftConfig.from_pretrained(path)

        # Load the original base model
        base_model = AutoModelForSequenceClassification.from_pretrained(
            config.base_model_name_or_path,
            num_labels=1,
            torch_dtype=torch.float16,
            device_map='auto'
        )

        # Load the LoRA adapters
        MODEL = PeftModel.from_pretrained(base_model, path)
    
    return MODEL


def prepare_test_dataloader(file_name, model_id, project_name):
    global WITHIN_PROJECT, BATCH_SIZE, BATCH_SIZE_RATIO, TEXT, KEY, MODEL

    global WITHIN_PROJECT, MODEL
    if WITHIN_PROJECT:
        path = "DEVCamiloSepulveda/" + model_id[1:] + "-LLAMA3SP-" + project_name
    else:
        path = "DEVCamiloSepulveda/" + model_id[1:] + "-LLAMA3SP-" + project_name.split("_")[0] + "-" + project_name.split("_")[1]
    path = "meta-llama/Llama-3.2-1B"

    if WITHIN_PROJECT:
        # calculate the batch size
        df = prepare_dataframe(file_name)
        BATCH_SIZE = min(int(int(len(df['text'][:int(len(df)*0.6)])) * BATCH_SIZE_RATIO), 512)
        print("Batch Size: ", BATCH_SIZE)
        # prepare testing data
        test_text = df['text'][int(len(df)*0.8):]
        TEXT = test_text
        KEY = df['issuekey'][int(len(df)*0.8):]
        test_labels = df['label'][int(len(df)*0.8):]
        tokens_test = tokenization(test_text.tolist(), path)
        test_seq = torch.tensor(tokens_test['input_ids'])
        test_y = torch.tensor(test_labels.tolist()).type(torch.LongTensor)
        test_dataloader = prepare_dataloader(test_seq, test_y, sampler_type='sequential')
    else:
        # calculate the batch size based on training data
        df = prepare_dataframe('sp_dataset/marked_data/' + file_name[0] + '.csv')
        BATCH_SIZE = min(int(int(len(df['text'][:int(len(df)*0.6)])) * BATCH_SIZE_RATIO), 512)
        # prepare testing data
        df = prepare_dataframe('sp_dataset/marked_data/' + file_name[1] + '.csv')
        test_text = df['text']
        TEXT = test_text
        KEY = df['issuekey']
        test_labels = df['label']
        tokens_test = tokenization(test_text.tolist(), path)
        test_seq = torch.tensor(tokens_test['input_ids'])
        test_y = torch.tensor(test_labels.tolist()).type(torch.LongTensor)
        test_dataloader = prepare_dataloader(test_seq, test_y, sampler_type='sequential')         
    return test_dataloader


def calculate_mae(predictions, targets):
    return np.mean(np.abs(predictions - targets))


def do_inference(trained_model, test_dataloader):
    global TEXT, KEY
    global XAI
    predictions = []
    true_labels = []
    try:
        for batch in test_dataloader:
            batch = tuple(t.to(DEVICE) for t in batch)
            b_input_ids, b_labels = batch
            with torch.no_grad():
                logits = trained_model(b_input_ids)
            logits = logits['logits'].detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            predictions.append(logits)
            true_labels.append(label_ids)
        
        # Flatten predictions and true_labels for MAE calculation
        flat_predictions = np.concatenate(predictions, axis=0).flatten()
        flat_true_labels = np.concatenate(true_labels, axis=0).flatten()
        
        # Calculate MAE of the model (MAE_p)
        MAE_p = calculate_mae(flat_predictions, flat_true_labels)
        
        # Generate random predictions and calculate MAE of random predictions (MAE_Po)
        random_predictions = np.random.rand(1000, len(flat_true_labels))  # 1000 ejecuciones de predicciones aleatorias
        MAE_Po = np.mean([calculate_mae(pred, flat_true_labels) for pred in random_predictions])
        
        # Calculate Standard Accuracy (SA)
        SA = (1 - (MAE_p / MAE_Po)) * 100
        
        # Calculate Median Absolute Error (MdAE)
        MdAE = np.median(np.abs(flat_predictions - flat_true_labels))
        
        return MAE_p, MdAE, SA
        
    except Exception as e:
        print(f"Unexpected error in do_inference: {e}")
        import traceback
        traceback.print_exc()
        return None


def main(model_id, project_name):
    global WITHIN_PROJECT, BATCH_SIZE_RATIO, TOKENIZER, PAD_TOKEN_ID, MODEL
    # define tokenizer based on model ID
    try:
        TOKENIZER = TOKENIZER_MAPPING[model_id]
    except:
        TOKENIZER = 'llama3'
    PAD_TOKEN_ID = PAD_TOKEN_ID_MAPPING[TOKENIZER]

    if len(project_name.split('_')) == 1:
        WITHIN_PROJECT = True
        BATCH_SIZE_RATIO = 0.3
        print('within project inference using model ' + model_id + ' for project ' + project_name)
        file_name = 'sp_dataset/marked_data/' + project_name + '.csv'
    else:
        WITHIN_PROJECT = False
        BATCH_SIZE_RATIO = 0.4
        training_project = project_name.split('_')[0]
        testing_project = project_name.split('_')[1]
        print('cross project inference using model ' + model_id + ' trained on ' + training_project 
              + ' for project ' + testing_project)
        file_name = (training_project, testing_project)
    trained_model = load_trained_model(model_id, project_name)
    
    trained_model.to(DEVICE)
    trained_model.eval()
    
    test_dataloader = prepare_test_dataloader(file_name, model_id, project_name)
    predictions = do_inference(trained_model, test_dataloader)
    
    return predictions


def clean_hf_cache(model_id=None):
    """
    Cleans the cache of files downloaded from Hugging Face.
    
    Args:
        model_id (str, optional): Specific model ID to clean.
                                  If not specified, cleans the entire cache.
    """
    # Use the correct cache path
    cache_path = os.path.join(str(Path.home()), '.cache', 'huggingface', 'hub')
    
    if not os.path.exists(cache_path):
        print(f"Cache directory not found at: {cache_path}")
        return
        
    if model_id:
        # If a model is specified, look for its specific directory
        model_path = os.path.join(cache_path, 'models--' + model_id.replace('/', '--'))
        if os.path.exists(model_path):
            print(f"Deleting cache for model: {model_id}")
            try:
                shutil.rmtree(model_path)
                print(f"Cache successfully deleted for: {model_id}")
            except Exception as e:
                print(f"Error deleting cache: {e}")
        else:
            print(f"Cache not found for model: {model_id}")
    else:
        # Clean the entire cache
        print(f"Deleting all Hugging Face cache at: {cache_path}")
        try:
            for item in os.listdir(cache_path):
                item_path = os.path.join(cache_path, item)
                if os.path.isdir(item_path):
                    shutil.rmtree(item_path)
                else:
                    os.remove(item_path)
            print("Cache completely deleted")
        except Exception as e:
            print(f"Error deleting cache: {e}")


def clean_gpu_memory():
    """
    Clean GPU memory by releasing cache memory and unused tensors
    """
    global MODEL

    del MODEL

    # Release PyTorch cache memory
    torch.cuda.empty_cache()
    
    # Release tensor memory
    if torch.cuda.is_available():
        # Get the current device
        device = torch.cuda.current_device()
        
        # Synchronize the device to ensure all operations are complete
        torch.cuda.synchronize(device)
        
        # Force garbage collection
        gc.collect()
        
        # Release all tensors assigned to the device
        torch.cuda.empty_cache()
        
        # Reset all CUDA devices
        torch.cuda.reset_peak_memory_stats(device)
        torch.cuda.reset_accumulated_memory_stats(device)


def local_model_inference(models):
    maes = []
    mdaes = []
    standard_accuracies = []
    for i, model in enumerate(models):
        train_project = model['train']
        test_project = model['test']
        model_name = f"{train_project}_{test_project}"
        # Open the file in results to upload the model
        with open(f"./results/{train_project}_{test_project}.txt", "r") as f:
            model_results = f.read()
            mae, mdae, training_time, epochs, batch_size = (
                float(re.search(r"MAE:\s*([\d.]+)", model_results).group(1)),
                float(re.search(r"MdAE:\s*([\d.]+)", model_results).group(1)),
                float(re.search(r"training time:\s*([\d.]+)", model_results).group(1)),
                int(re.search(r"Epochs:\s*(\d+)", model_results).group(1)),
                int(re.search(r"batch size:\s*(\d+)", model_results).group(1))
            )

        mae, mdae, sa = main(f"./models/{model_name}_epo_{epochs}", train_project)
        mae = round(mae, 2)
        mdae = round(mdae, 2)
        sa = round(sa, 2)
        maes.append(mae)
        mdaes.append(mdae)
        standard_accuracies.append(sa)

        print(f"Model: {model_name}, MAE: {mae}, MdAE: {mdae}, SA: {sa}")

        clean_gpu_memory()
    return maes, mdaes, standard_accuracies


def create_df(maes, mdaes, saes, models):
    data = [
        {
            'Project': train_proj,
            'MAE': mae,
            'MdAE': mdae,
            'SA': sa,
        }
        if (train_proj := model['train']) == (test_proj := model['test'])
        else {
            'Train': train_proj,
            'Test': test_proj,
            'MAE': mae,
            'MdAE': mdae,
            'SA': sa,
        }
        for model, mae, mdae, sa in zip(models, maes, mdaes, saes)
    ]
    return pd.DataFrame(data)

In [2]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.set_per_process_memory_fraction(0.8)

PROJECTS = [
    [
        {'train': 'appceleratorstudio', 'test': 'appceleratorstudio'},
        {'train': 'aptanastudio', 'test': 'aptanastudio'},
        {'train': 'bamboo', 'test': 'bamboo'},
        {'train': 'clover', 'test': 'clover'},
        {'train': 'datamanagement', 'test': 'datamanagement'},
        {'train': 'duracloud', 'test': 'duracloud'},
        {'train': 'jirasoftware', 'test': 'jirasoftware'},
        {'train': 'mesos', 'test': 'mesos'},
        {'train': 'moodle', 'test': 'moodle'},
        {'train': 'mule', 'test': 'mule'},
        {'train': 'mulestudio', 'test': 'mulestudio'},
        {'train': 'springxd', 'test': 'springxd'},
        {'train': 'talenddataquality', 'test': 'talenddataquality'},
        {'train': 'talendesb', 'test': 'talendesb'},
        {'train': 'titanium', 'test': 'titanium'},
        {'train': 'usergrid', 'test': 'usergrid'},
    ],
    [
        {'train': 'mesos', 'test': 'usergrid'},
        {'train': 'usergrid', 'test': 'mesos'},
        {'train': 'appceleratorstudio', 'test': 'aptanastudio'},
        {'train': 'appceleratorstudio', 'test': 'titanium'},
        {'train': 'titanium', 'test': 'appceleratorstudio'},
        {'train': 'aptanastudio', 'test': 'titanium'},
        {'train': 'mule', 'test': 'mulestudio'},
        {'train': 'mulestudio', 'test': 'mule'}
    ],
    [
        {'train': 'clover', 'test': 'usergrid'},
        {'train': 'talendesb', 'test': 'mesos'},
        {'train': 'talenddataquality', 'test': 'aptanastudio'},
        {'train': 'mule', 'test': 'titanium'},
        {'train': 'talenddataquality', 'test': 'appceleratorstudio'},
        {'train': 'mulestudio', 'test': 'titanium'},
        {'train': 'appceleratorstudio', 'test': 'mulestudio'},
        {'train': 'appceleratorstudio', 'test': 'mule'}
    ]
]

MODELS = {
    "#0": "Llama3.2",
    "#2": "Llama3.2+SPWordLevel",
    "#6": "Llama3.2+SPWordPiece",
    "#7": "Llama3.2+SPSentencePiece"
}

WITHIN_PROJECTS = PROJECTS[0]
CROSS_PROJECTS = PROJECTS[1:]
ALL_CROSS_PROJECTS = [item for sublist in PROJECTS[1:] for item in sublist]

<a id='within_project'></a>
## Within Projects Models

#### There are two parts under Within Project Model section, follow the link to reach the section:
#### 1. [Training Process Inspection](#within_project_tb)
#### 2. [Model Testing](#within_project_model_testing)

#### Different models are available for cross project estimation as follows: 

#### #0 - Llama3.2 Auto Tokenizer + Llama3.2
#### #2 - Word-level Story Point Tokenizer + Llama3.2
#### #6 - WordPiece Story Point Tokenizer + Llama3.2
#### #7 - SentencePiece Story Point Tokenizer + Llama3.2 

<a id='within_project_model_testing'></a>
### Model Testing

##### Run the cell below to do inference on all testing datasets using all the uploaded models on Hugging Face

In [3]:
llama3_within_df = pd.DataFrame({'Project': [item['train'] for item in WITHIN_PROJECTS]}, dtype='float64')
llama3_within_df_mdae = llama3_within_df.copy()
llama3_within_df_sa = llama3_within_df.copy()

for model in MODELS:
    maes = []
    mdmaes = []
    standard_accuracies = []
    
    for project in WITHIN_PROJECTS:
        print(f"Running inference for project: {project['train']}")
        mae, mdae, sa = main(model, project['train'])

        mae = round(mae, 2)
        mdae = round(mdae, 2)
        sa = round(sa, 2)
        maes.append(mae)
        mdmaes.append(mdae)
        standard_accuracies.append(sa)
        print(f"MAE for project {project['train']}: {mae}")
        print(f"MDAE for project {project['train']}: {mdae}")
        print(f"Standard Accuracy for project {project['train']}: {sa}")
        
        model_id = f"DEVCamiloSepulveda/{model[1:]}-LLAMA3SP-{project['train']}"
        # clean_hf_cache(model_id)
        clean_gpu_memory()
    llama3_within_df[MODELS[model]] = maes
    llama3_within_df[MODELS[model]] = llama3_within_df[MODELS[model]].apply(lambda x: round(x, 2))
    llama3_within_df_mdae[MODELS[model]] = mdmaes
    llama3_within_df_mdae[MODELS[model]] = llama3_within_df_mdae[MODELS[model]].apply(lambda x: round(x, 2))
    llama3_within_df_sa[MODELS[model]] = standard_accuracies
    llama3_within_df_sa[MODELS[model]] = llama3_within_df_sa[MODELS[model]].apply(lambda x: round(x, 2))

print("All projects processed")

llama3_within_df

  llama3_within_df = pd.DataFrame({'Project': [item['train'] for item in WITHIN_PROJECTS]}, dtype='float64')


Running inference for project: appceleratorstudio
within project inference using model #0 for project appceleratorstudio
Loading model from Hugging Face...


  _ = torch.tensor([0], device=i)
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using llama3 tokenizer!


  test_seq = torch.tensor(tokens_test['input_ids'])


MAE for project appceleratorstudio: 1.65
MDAE for project appceleratorstudio: 1.45
Standard Accuracy for project appceleratorstudio: 63.86
Running inference for project: aptanastudio
within project inference using model #0 for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  149
using llama3 tokenizer!
MAE for project aptanastudio: 3.74
MDAE for project aptanastudio: 2.9
Standard Accuracy for project aptanastudio: 44.37
Running inference for project: bamboo
within project inference using model #0 for project bamboo
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  93
using llama3 tokenizer!
MAE for project bamboo: 1.09
MDAE for project bamboo: 1.05
Standard Accuracy for project bamboo: 20.44
Running inference for project: clover
within project inference using model #0 for project clover
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  69
using llama3 tokenizer!
MAE for project clover: 4.08
MDAE for project clover: 2.02
Standard Accuracy for project clover: 8.59
Running inference for project: datamanagement
within project inference using model #0 for project datamanagement
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using llama3 tokenizer!
MAE for project datamanagement: 6.45
MDAE for project datamanagement: 3.54
Standard Accuracy for project datamanagement: 16.7
Running inference for project: duracloud
within project inference using model #0 for project duracloud
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  119
using llama3 tokenizer!
MAE for project duracloud: 1.05
MDAE for project duracloud: 0.89
Standard Accuracy for project duracloud: 19.37
Running inference for project: jirasoftware
within project inference using model #0 for project jirasoftware
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  63
using llama3 tokenizer!
MAE for project jirasoftware: 2.05
MDAE for project jirasoftware: 1.59
Standard Accuracy for project jirasoftware: 34.87
Running inference for project: mesos
within project inference using model #0 for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  302
using llama3 tokenizer!
MAE for project mesos: 1.38
MDAE for project mesos: 1.1
Standard Accuracy for project mesos: 38.67
Running inference for project: moodle
within project inference using model #0 for project moodle
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  209
using llama3 tokenizer!
MAE for project moodle: 11.61
MDAE for project moodle: 8.72
Standard Accuracy for project moodle: -74.63
Running inference for project: mule
within project inference using model #0 for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  159
using llama3 tokenizer!
MAE for project mule: 2.65
MDAE for project mule: 2.27
Standard Accuracy for project mule: 35.72
Running inference for project: mulestudio
within project inference using model #0 for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  131
using llama3 tokenizer!
MAE for project mulestudio: 3.7
MDAE for project mulestudio: 2.65
Standard Accuracy for project mulestudio: 42.59
Running inference for project: springxd
within project inference using model #0 for project springxd
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using llama3 tokenizer!
MAE for project springxd: 2.07
MDAE for project springxd: 1.79
Standard Accuracy for project springxd: 25.47
Running inference for project: talenddataquality
within project inference using model #0 for project talenddataquality
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  248
using llama3 tokenizer!
MAE for project talenddataquality: 3.79
MDAE for project talenddataquality: 3.45
Standard Accuracy for project talenddataquality: -29.97
Running inference for project: talendesb
within project inference using model #0 for project talendesb
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  156
using llama3 tokenizer!
MAE for project talendesb: 1.08
MDAE for project talendesb: 0.86
Standard Accuracy for project talendesb: 30.09
Running inference for project: titanium
within project inference using model #0 for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  405
using llama3 tokenizer!
MAE for project titanium: 2.46
MDAE for project titanium: 1.84
Standard Accuracy for project titanium: 48.23
Running inference for project: usergrid
within project inference using model #0 for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  86
using llama3 tokenizer!
MAE for project usergrid: 1.51
MDAE for project usergrid: 1.28
Standard Accuracy for project usergrid: 38.45
Running inference for project: appceleratorstudio
within project inference using model #2 for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using word-level tokenizer!
MAE for project appceleratorstudio: 1.5
MDAE for project appceleratorstudio: 1.09
Standard Accuracy for project appceleratorstudio: 67.29
Running inference for project: aptanastudio
within project inference using model #2 for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  149
using word-level tokenizer!
MAE for project aptanastudio: 3.76
MDAE for project aptanastudio: 2.96
Standard Accuracy for project aptanastudio: 44.05
Running inference for project: bamboo
within project inference using model #2 for project bamboo
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  93
using word-level tokenizer!
MAE for project bamboo: 1.43
MDAE for project bamboo: 1.12
Standard Accuracy for project bamboo: -3.79
Running inference for project: clover
within project inference using model #2 for project clover
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  69
using word-level tokenizer!
MAE for project clover: 4.9
MDAE for project clover: 3.13
Standard Accuracy for project clover: -9.83
Running inference for project: datamanagement
within project inference using model #2 for project datamanagement
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using word-level tokenizer!
MAE for project datamanagement: 16.79
MDAE for project datamanagement: 15.9
Standard Accuracy for project datamanagement: -116.75
Running inference for project: duracloud
within project inference using model #2 for project duracloud
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  119
using word-level tokenizer!
MAE for project duracloud: 1.37
MDAE for project duracloud: 1.12
Standard Accuracy for project duracloud: -4.53
Running inference for project: jirasoftware
within project inference using model #2 for project jirasoftware
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  63
using word-level tokenizer!
MAE for project jirasoftware: 2.32
MDAE for project jirasoftware: 1.95
Standard Accuracy for project jirasoftware: 26.38
Running inference for project: mesos
within project inference using model #2 for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  302
using word-level tokenizer!
MAE for project mesos: 1.51
MDAE for project mesos: 1.38
Standard Accuracy for project mesos: 32.94
Running inference for project: moodle
within project inference using model #2 for project moodle
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  209
using word-level tokenizer!
MAE for project moodle: 11.64
MDAE for project moodle: 12.64
Standard Accuracy for project moodle: -75.04
Running inference for project: mule
within project inference using model #2 for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  159
using word-level tokenizer!
MAE for project mule: 2.69
MDAE for project mule: 2.31
Standard Accuracy for project mule: 34.83
Running inference for project: mulestudio
within project inference using model #2 for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  131
using word-level tokenizer!
MAE for project mulestudio: 3.56
MDAE for project mulestudio: 2.55
Standard Accuracy for project mulestudio: 44.75
Running inference for project: springxd
within project inference using model #2 for project springxd
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using word-level tokenizer!
MAE for project springxd: 1.93
MDAE for project springxd: 1.52
Standard Accuracy for project springxd: 30.6
Running inference for project: talenddataquality
within project inference using model #2 for project talenddataquality
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  248
using word-level tokenizer!
MAE for project talenddataquality: 4.67
MDAE for project talenddataquality: 4.56
Standard Accuracy for project talenddataquality: -60.21
Running inference for project: talendesb
within project inference using model #2 for project talendesb
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  156
using word-level tokenizer!
MAE for project talendesb: 1.71
MDAE for project talendesb: 1.48
Standard Accuracy for project talendesb: -10.36
Running inference for project: titanium
within project inference using model #2 for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  405
using word-level tokenizer!
MAE for project titanium: 2.34
MDAE for project titanium: 1.66
Standard Accuracy for project titanium: 50.8
Running inference for project: usergrid
within project inference using model #2 for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  86
using word-level tokenizer!
MAE for project usergrid: 1.52
MDAE for project usergrid: 1.31
Standard Accuracy for project usergrid: 38.1
Running inference for project: appceleratorstudio
within project inference using model #6 for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using wordpiece tokenizer!
MAE for project appceleratorstudio: 1.67
MDAE for project appceleratorstudio: 1.37
Standard Accuracy for project appceleratorstudio: 63.46
Running inference for project: aptanastudio
within project inference using model #6 for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  149
using wordpiece tokenizer!
MAE for project aptanastudio: 4.04
MDAE for project aptanastudio: 2.95
Standard Accuracy for project aptanastudio: 39.9
Running inference for project: bamboo
within project inference using model #6 for project bamboo
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  93
using wordpiece tokenizer!
MAE for project bamboo: 1.17
MDAE for project bamboo: 1.05
Standard Accuracy for project bamboo: 15.34
Running inference for project: clover
within project inference using model #6 for project clover
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  69
using wordpiece tokenizer!
MAE for project clover: 3.84
MDAE for project clover: 1.76
Standard Accuracy for project clover: 13.88
Running inference for project: datamanagement
within project inference using model #6 for project datamanagement
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using wordpiece tokenizer!
MAE for project datamanagement: 6.78
MDAE for project datamanagement: 3.96
Standard Accuracy for project datamanagement: 12.39
Running inference for project: duracloud
within project inference using model #6 for project duracloud
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  119
using wordpiece tokenizer!
MAE for project duracloud: 1.29
MDAE for project duracloud: 1.33
Standard Accuracy for project duracloud: 1.52
Running inference for project: jirasoftware
within project inference using model #6 for project jirasoftware
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  63
using wordpiece tokenizer!
MAE for project jirasoftware: 2.13
MDAE for project jirasoftware: 1.73
Standard Accuracy for project jirasoftware: 32.32
Running inference for project: mesos
within project inference using model #6 for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  302
using wordpiece tokenizer!
MAE for project mesos: 1.19
MDAE for project mesos: 0.88
Standard Accuracy for project mesos: 47.17
Running inference for project: moodle
within project inference using model #6 for project moodle
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  209
using wordpiece tokenizer!
MAE for project moodle: 13.81
MDAE for project moodle: 14.3
Standard Accuracy for project moodle: -107.67
Running inference for project: mule
within project inference using model #6 for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  159
using wordpiece tokenizer!
MAE for project mule: 2.8
MDAE for project mule: 2.45
Standard Accuracy for project mule: 32.16
Running inference for project: mulestudio
within project inference using model #6 for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  131
using wordpiece tokenizer!
MAE for project mulestudio: 3.84
MDAE for project mulestudio: 2.61
Standard Accuracy for project mulestudio: 40.44
Running inference for project: springxd
within project inference using model #6 for project springxd
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using wordpiece tokenizer!
MAE for project springxd: 1.85
MDAE for project springxd: 1.47
Standard Accuracy for project springxd: 33.44
Running inference for project: talenddataquality
within project inference using model #6 for project talenddataquality
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  248
using wordpiece tokenizer!
MAE for project talenddataquality: 3.75
MDAE for project talenddataquality: 3.62
Standard Accuracy for project talenddataquality: -28.72
Running inference for project: talendesb
within project inference using model #6 for project talendesb
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  156
using wordpiece tokenizer!
MAE for project talendesb: 1.05
MDAE for project talendesb: 0.84
Standard Accuracy for project talendesb: 31.96
Running inference for project: titanium
within project inference using model #6 for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  405
using wordpiece tokenizer!
MAE for project titanium: 2.47
MDAE for project titanium: 1.95
Standard Accuracy for project titanium: 48.03
Running inference for project: usergrid
within project inference using model #6 for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  86
using wordpiece tokenizer!
MAE for project usergrid: 1.37
MDAE for project usergrid: 1.13
Standard Accuracy for project usergrid: 44.02
Running inference for project: appceleratorstudio
within project inference using model #7 for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using sentencepiece tokenizer!
MAE for project appceleratorstudio: 1.64
MDAE for project appceleratorstudio: 1.46
Standard Accuracy for project appceleratorstudio: 64.25
Running inference for project: aptanastudio
within project inference using model #7 for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  149
using sentencepiece tokenizer!
MAE for project aptanastudio: 3.74
MDAE for project aptanastudio: 2.67
Standard Accuracy for project aptanastudio: 44.35
Running inference for project: bamboo
within project inference using model #7 for project bamboo
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  93
using sentencepiece tokenizer!
MAE for project bamboo: 0.97
MDAE for project bamboo: 0.83
Standard Accuracy for project bamboo: 29.34
Running inference for project: clover
within project inference using model #7 for project clover
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  69
using sentencepiece tokenizer!
MAE for project clover: 3.82
MDAE for project clover: 1.42
Standard Accuracy for project clover: 14.35
Running inference for project: datamanagement
within project inference using model #7 for project datamanagement
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using sentencepiece tokenizer!
MAE for project datamanagement: 7.17
MDAE for project datamanagement: 3.43
Standard Accuracy for project datamanagement: 7.39
Running inference for project: duracloud
within project inference using model #7 for project duracloud
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  119
using sentencepiece tokenizer!
MAE for project duracloud: 1.1
MDAE for project duracloud: 1.02
Standard Accuracy for project duracloud: 15.58
Running inference for project: jirasoftware
within project inference using model #7 for project jirasoftware
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  63
using sentencepiece tokenizer!
MAE for project jirasoftware: 1.9
MDAE for project jirasoftware: 1.49
Standard Accuracy for project jirasoftware: 39.69
Running inference for project: mesos
within project inference using model #7 for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  302
using sentencepiece tokenizer!
MAE for project mesos: 1.29
MDAE for project mesos: 1.08
Standard Accuracy for project mesos: 42.65
Running inference for project: moodle
within project inference using model #7 for project moodle
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  209
using sentencepiece tokenizer!
MAE for project moodle: 11.22
MDAE for project moodle: 9.06
Standard Accuracy for project moodle: -68.73
Running inference for project: mule
within project inference using model #7 for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  159
using sentencepiece tokenizer!
MAE for project mule: 2.51
MDAE for project mule: 2.32
Standard Accuracy for project mule: 39.2
Running inference for project: mulestudio
within project inference using model #7 for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  131
using sentencepiece tokenizer!
MAE for project mulestudio: 3.81
MDAE for project mulestudio: 2.56
Standard Accuracy for project mulestudio: 40.95
Running inference for project: springxd
within project inference using model #7 for project springxd
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using sentencepiece tokenizer!
MAE for project springxd: 1.7
MDAE for project springxd: 1.33
Standard Accuracy for project springxd: 38.81
Running inference for project: talenddataquality
within project inference using model #7 for project talenddataquality
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  248
using sentencepiece tokenizer!
MAE for project talenddataquality: 3.91
MDAE for project talenddataquality: 3.74
Standard Accuracy for project talenddataquality: -34.27
Running inference for project: talendesb
within project inference using model #7 for project talendesb
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  156
using sentencepiece tokenizer!
MAE for project talendesb: 1.0
MDAE for project talendesb: 0.79
Standard Accuracy for project talendesb: 35.27
Running inference for project: titanium
within project inference using model #7 for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  405
using sentencepiece tokenizer!
MAE for project titanium: 2.32
MDAE for project titanium: 1.57
Standard Accuracy for project titanium: 51.29
Running inference for project: usergrid
within project inference using model #7 for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  86
using sentencepiece tokenizer!
MAE for project usergrid: 1.37
MDAE for project usergrid: 1.25
Standard Accuracy for project usergrid: 44.14
All projects processed


Unnamed: 0,Project,Llama3.2,Llama3.2+SPWordLevel,Llama3.2+SPWordPiece,Llama3.2+SPSentencePiece
0,appceleratorstudio,1.65,1.5,1.67,1.64
1,aptanastudio,3.74,3.76,4.04,3.74
2,bamboo,1.09,1.43,1.17,0.97
3,clover,4.08,4.9,3.84,3.82
4,datamanagement,6.45,16.79,6.78,7.17
5,duracloud,1.05,1.37,1.29,1.1
6,jirasoftware,2.05,2.32,2.13,1.9
7,mesos,1.38,1.51,1.19,1.29
8,moodle,11.61,11.64,13.81,11.22
9,mule,2.65,2.69,2.8,2.51


##### Optional: Save the results

You can save the results of the DataFrame to a CSV file for further analysis or record-keeping. Run the following script to save the `llama3_within_df` DataFrame to a CSV file.

In [13]:
# Save the DataFrame to a CSV file
llama3_within_df.to_csv('./data_model_analysis/Llama3_within_results.csv', index=False)#

print("CSV file 'Llama3_within_results.csv' created successfully.")

CSV file 'Llama3_within_results.csv' created successfully.


In [4]:
llama3_within_df_mdae.to_csv('./data_model_analysis/Llama3_within_results_mdae.csv', index=False)
llama3_within_df_sa.to_csv('./data_model_analysis/Llama3_within_results_sa.csv', index=False)

print("CSV files 'Llama3_within_results_mdae.csv' and 'Llama3_within_results_sa.csv' created successfully.")

##### Run the cell below to do inference on testing dataset using **local** trained model on all within projects

In [3]:
print("Loading models from local on Within projects inference...")
        
within_maes, within_mdaes, within_saes = local_model_inference(WITHIN_PROJECTS)
within_df = create_df(within_maes, within_mdaes, within_saes, WITHIN_PROJECTS)

within_df

Loading models from local on Within projects inference...
within project inference using model ./models/appceleratorstudio_appceleratorstudio_epo_1 for project appceleratorstudio
Loading model from local...


  _ = torch.tensor([0], device=i)
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using llama3 tokenizer!


  test_seq = torch.tensor(tokens_test['input_ids'])


Model: appceleratorstudio_appceleratorstudio, MAE: 1.59, MdAE: 1.34, SA: 65.28
within project inference using model ./models/aptanastudio_aptanastudio_epo_12 for project aptanastudio
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  149
using llama3 tokenizer!
Model: aptanastudio_aptanastudio, MAE: 3.89, MdAE: 3.1, SA: 42.05
within project inference using model ./models/bamboo_bamboo_epo_17 for project bamboo
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  93
using llama3 tokenizer!
Model: bamboo_bamboo, MAE: 1.11, MdAE: 1.01, SA: 19.5
within project inference using model ./models/clover_clover_epo_2 for project clover
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  69
using llama3 tokenizer!
Model: clover_clover, MAE: 3.99, MdAE: 2.61, SA: 10.45
within project inference using model ./models/datamanagement_datamanagement_epo_1 for project datamanagement
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using llama3 tokenizer!
Model: datamanagement_datamanagement, MAE: 5.84, MdAE: 2.94, SA: 24.53
within project inference using model ./models/duracloud_duracloud_epo_3 for project duracloud
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  119
using llama3 tokenizer!
Model: duracloud_duracloud, MAE: 1.01, MdAE: 0.7, SA: 22.61
within project inference using model ./models/jirasoftware_jirasoftware_epo_3 for project jirasoftware
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  63
using llama3 tokenizer!
Model: jirasoftware_jirasoftware, MAE: 2.38, MdAE: 1.82, SA: 24.54
within project inference using model ./models/mesos_mesos_epo_0 for project mesos
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  302
using llama3 tokenizer!
Model: mesos_mesos, MAE: 1.27, MdAE: 1.03, SA: 43.57
within project inference using model ./models/moodle_moodle_epo_17 for project moodle
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  209
using llama3 tokenizer!
Model: moodle_moodle, MAE: 13.9, MdAE: 12.57, SA: -109.09
within project inference using model ./models/mule_mule_epo_15 for project mule
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  159
using llama3 tokenizer!
Model: mule_mule, MAE: 2.76, MdAE: 2.55, SA: 33.16
within project inference using model ./models/mulestudio_mulestudio_epo_1 for project mulestudio
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  131
using llama3 tokenizer!
Model: mulestudio_mulestudio, MAE: 3.91, MdAE: 3.31, SA: 39.3
within project inference using model ./models/springxd_springxd_epo_6 for project springxd
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using llama3 tokenizer!
Model: springxd_springxd, MAE: 1.87, MdAE: 1.69, SA: 32.57
within project inference using model ./models/talenddataquality_talenddataquality_epo_7 for project talenddataquality
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  248
using llama3 tokenizer!
Model: talenddataquality_talenddataquality, MAE: 4.54, MdAE: 4.28, SA: -55.81
within project inference using model ./models/talendesb_talendesb_epo_7 for project talendesb
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  156
using llama3 tokenizer!
Model: talendesb_talendesb, MAE: 1.06, MdAE: 0.81, SA: 31.27
within project inference using model ./models/titanium_titanium_epo_1 for project titanium
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  405
using llama3 tokenizer!
Model: titanium_titanium, MAE: 2.6, MdAE: 1.98, SA: 45.25
within project inference using model ./models/usergrid_usergrid_epo_14 for project usergrid
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  86
using llama3 tokenizer!
Model: usergrid_usergrid, MAE: 1.47, MdAE: 1.06, SA: 39.79


Unnamed: 0,Project,MAE,MdAE,SA
0,appceleratorstudio,1.59,1.34,65.28
1,aptanastudio,3.89,3.1,42.05
2,bamboo,1.11,1.01,19.5
3,clover,3.99,2.61,10.45
4,datamanagement,5.84,2.94,24.53
5,duracloud,1.01,0.7,22.61
6,jirasoftware,2.38,1.82,24.54
7,mesos,1.27,1.03,43.57
8,moodle,13.9,12.57,-109.09
9,mule,2.76,2.55,33.16


##### Optional: Save the results

You can save the results of the DataFrame to a CSV file for further analysis or record-keeping. Run the following script to save the `within_df` DataFrame to a CSV file.

In [4]:
# Save the DataFrame to a CSV file
within_df.to_csv('./data_model_analysis/Llama3SP_within_results.csv', index=False)

print("CSV file 'within_project_results.csv' created successfully.")

CSV file 'within_project_results.csv' created successfully.


<a id='cross_project'></a>
## Cross Projects Models

#### There are two parts under Cross Project Model section, follow the link to reach the section:
#### 1. [Training Process Inspection](#cross_project_tb)
#### 2. [Model Testing](#cross_project_model_testing)

##### Different models are available for cross project estimation as follows:

### Cross project - within repository models
#### #00 - Llama3.2 Auto Tokenizer
#### #22 - Word-level Story Point Tokenizer + Llama3.2
#### #66 - WordPiece Story Point Tokenizer + Llama3.2
#### #77 - SentencePiece Story Point Tokenizer + Llama3.2  

### Cross project - cross repository models
#### #000 - Llama3.2 Auto Tokenizer
#### #222 - Word-level Story Point Tokenizer + Llama3.2
#### #666 - WordPiece Story Point Tokenizer + Llama3.2
#### #777 - SentencePiece Story Point Tokenizer + Llama3.2 

<a id='cross_project_model_testing'></a>
### Model Testing

##### Run the cell below to do inference on all testing datasets using all the uploaded models on Hugging Face

In [5]:
llama3_cross_df = pd.DataFrame(
    {
        'Train': [item['train'] for item in ALL_CROSS_PROJECTS],
        'Test': [item['test'] for item in ALL_CROSS_PROJECTS]
    }
    , dtype='float64'
)
llama3_cross_df_mdae = llama3_cross_df.copy()
llama3_cross_df_sa = llama3_cross_df.copy()

for model in MODELS:
    maes = []
    mdmaes = []
    standard_accuracies = []

    for i, projects in enumerate(CROSS_PROJECTS):
        for project in projects:
            model_id = model
            caracter = model[1:]
            caracter = caracter * (i + 2)
            model_id = f"#{caracter}"

            print(f"Running inference for project trained on {project['train']} and tested on {project['test']}")
            mae, mdae, sa = main(model_id, f"{project['train']}_{project['test']}" )
            
            mae = round(mae, 2)
            mdae = round(mdae, 2)
            sa = round(sa, 2)
            maes.append(mae)
            mdmaes.append(mdae)
            standard_accuracies.append(sa)
            print(f"MAE for project {project['train']}: {mae}")
            print(f"MDAE for project {project['train']}: {mdae}")
            print(f"Standard Accuracy for project {project['train']}: {sa}")

            hf_model_id = f"DEVCamiloSepulveda/{model[1:]}-LLAMA3SP-{project['train']}"
            # clean_hf_cache(hf_model_id)
            clean_gpu_memory()
    llama3_cross_df[MODELS[model]] = maes
    llama3_cross_df[MODELS[model]] = llama3_cross_df[MODELS[model]].apply(lambda x: round(x, 2))
    llama3_cross_df_mdae[MODELS[model]] = mdmaes
    llama3_cross_df_mdae[MODELS[model]] = llama3_cross_df_mdae[MODELS[model]].apply(lambda x: round(x, 2))
    llama3_cross_df_sa[MODELS[model]] = standard_accuracies
    llama3_cross_df_sa[MODELS[model]] = llama3_cross_df_sa[MODELS[model]].apply(lambda x: round(x, 2))


print("All projects processed")

llama3_cross_df

  llama3_cross_df = pd.DataFrame(


Running inference for project trained on mesos and tested on usergrid
cross project inference using model #00 trained on mesos for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!


  test_seq = torch.tensor(tokens_test['input_ids'])


MAE for project mesos: 1.34
MDAE for project mesos: 1.05
Standard Accuracy for project mesos: 43.23
Running inference for project trained on usergrid and tested on mesos
cross project inference using model #00 trained on usergrid for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project usergrid: 1.74
MDAE for project usergrid: 1.34
Standard Accuracy for project usergrid: 32.85
Running inference for project trained on appceleratorstudio and tested on aptanastudio
cross project inference using model #00 trained on appceleratorstudio for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project appceleratorstudio: 4.36
MDAE for project appceleratorstudio: 3.12
Standard Accuracy for project appceleratorstudio: 42.05
Running inference for project trained on appceleratorstudio and tested on titanium
cross project inference using model #00 trained on appceleratorstudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project appceleratorstudio: 3.36
MDAE for project appceleratorstudio: 2.32
Standard Accuracy for project appceleratorstudio: 42.19
Running inference for project trained on titanium and tested on appceleratorstudio
cross project inference using model #00 trained on titanium for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project titanium: 2.55
MDAE for project titanium: 1.88
Standard Accuracy for project titanium: 50.29
Running inference for project trained on aptanastudio and tested on titanium
cross project inference using model #00 trained on aptanastudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project aptanastudio: 3.78
MDAE for project aptanastudio: 2.78
Standard Accuracy for project aptanastudio: 34.99
Running inference for project trained on mule and tested on mulestudio
cross project inference using model #00 trained on mule for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project mule: 3.57
MDAE for project mule: 2.53
Standard Accuracy for project mule: 39.44
Running inference for project trained on mulestudio and tested on mule
cross project inference using model #00 trained on mulestudio for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project mulestudio: 2.98
MDAE for project mulestudio: 2.54
Standard Accuracy for project mulestudio: 34.85
Running inference for project trained on clover and tested on usergrid
cross project inference using model #000 trained on clover for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project clover: 2.03
MDAE for project clover: 1.67
Standard Accuracy for project clover: 13.91
Running inference for project trained on talendesb and tested on mesos
cross project inference using model #000 trained on talendesb for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project talendesb: 1.52
MDAE for project talendesb: 1.01
Standard Accuracy for project talendesb: 41.39
Running inference for project trained on talenddataquality and tested on aptanastudio
cross project inference using model #000 trained on talenddataquality for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project talenddataquality: 4.53
MDAE for project talenddataquality: 3.48
Standard Accuracy for project talenddataquality: 39.69
Running inference for project trained on mule and tested on titanium
cross project inference using model #000 trained on mule for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project mule: 3.48
MDAE for project mule: 2.13
Standard Accuracy for project mule: 40.14
Running inference for project trained on talenddataquality and tested on appceleratorstudio
cross project inference using model #000 trained on talenddataquality for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project talenddataquality: 2.72
MDAE for project talenddataquality: 2.08
Standard Accuracy for project talenddataquality: 47.12
Running inference for project trained on mulestudio and tested on titanium
cross project inference using model #000 trained on mulestudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project mulestudio: 3.73
MDAE for project mulestudio: 2.79
Standard Accuracy for project mulestudio: 35.84
Running inference for project trained on appceleratorstudio and tested on mulestudio
cross project inference using model #000 trained on appceleratorstudio for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project appceleratorstudio: 3.41
MDAE for project appceleratorstudio: 2.39
Standard Accuracy for project appceleratorstudio: 42.15
Running inference for project trained on appceleratorstudio and tested on mule
cross project inference using model #000 trained on appceleratorstudio for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using llama3 tokenizer!
MAE for project appceleratorstudio: 2.9
MDAE for project appceleratorstudio: 2.35
Standard Accuracy for project appceleratorstudio: 36.61
Running inference for project trained on mesos and tested on usergrid
cross project inference using model #22 trained on mesos for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project mesos: 1.4
MDAE for project mesos: 1.16
Standard Accuracy for project mesos: 40.39
Running inference for project trained on usergrid and tested on mesos
cross project inference using model #22 trained on usergrid for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project usergrid: 1.73
MDAE for project usergrid: 1.29
Standard Accuracy for project usergrid: 33.22
Running inference for project trained on appceleratorstudio and tested on aptanastudio
cross project inference using model #22 trained on appceleratorstudio for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project appceleratorstudio: 5.23
MDAE for project appceleratorstudio: 3.69
Standard Accuracy for project appceleratorstudio: 30.43
Running inference for project trained on appceleratorstudio and tested on titanium
cross project inference using model #22 trained on appceleratorstudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project appceleratorstudio: 3.77
MDAE for project appceleratorstudio: 2.2
Standard Accuracy for project appceleratorstudio: 35.25
Running inference for project trained on titanium and tested on appceleratorstudio
cross project inference using model #22 trained on titanium for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project titanium: 2.64
MDAE for project titanium: 1.98
Standard Accuracy for project titanium: 48.6
Running inference for project trained on aptanastudio and tested on titanium
cross project inference using model #22 trained on aptanastudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project aptanastudio: 3.64
MDAE for project aptanastudio: 2.65
Standard Accuracy for project aptanastudio: 37.35
Running inference for project trained on mule and tested on mulestudio
cross project inference using model #22 trained on mule for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project mule: 4.07
MDAE for project mule: 2.54
Standard Accuracy for project mule: 30.94
Running inference for project trained on mulestudio and tested on mule
cross project inference using model #22 trained on mulestudio for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project mulestudio: 3.07
MDAE for project mulestudio: 2.69
Standard Accuracy for project mulestudio: 32.95
Running inference for project trained on clover and tested on usergrid
cross project inference using model #222 trained on clover for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project clover: 2.53
MDAE for project clover: 2.39
Standard Accuracy for project clover: -7.49
Running inference for project trained on talendesb and tested on mesos
cross project inference using model #222 trained on talendesb for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project talendesb: 1.95
MDAE for project talendesb: 1.63
Standard Accuracy for project talendesb: 24.52
Running inference for project trained on talenddataquality and tested on aptanastudio
cross project inference using model #222 trained on talenddataquality for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project talenddataquality: 4.69
MDAE for project talenddataquality: 3.26
Standard Accuracy for project talenddataquality: 37.58
Running inference for project trained on mule and tested on titanium
cross project inference using model #222 trained on mule for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project mule: 3.68
MDAE for project mule: 2.16
Standard Accuracy for project mule: 36.65
Running inference for project trained on talenddataquality and tested on appceleratorstudio
cross project inference using model #222 trained on talenddataquality for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project talenddataquality: 2.66
MDAE for project talenddataquality: 2.11
Standard Accuracy for project talenddataquality: 48.22
Running inference for project trained on mulestudio and tested on titanium
cross project inference using model #222 trained on mulestudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project mulestudio: 3.73
MDAE for project mulestudio: 2.61
Standard Accuracy for project mulestudio: 35.89
Running inference for project trained on appceleratorstudio and tested on mulestudio
cross project inference using model #222 trained on appceleratorstudio for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project appceleratorstudio: 3.54
MDAE for project appceleratorstudio: 2.44
Standard Accuracy for project appceleratorstudio: 39.96
Running inference for project trained on appceleratorstudio and tested on mule
cross project inference using model #222 trained on appceleratorstudio for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using word-level tokenizer!
MAE for project appceleratorstudio: 2.89
MDAE for project appceleratorstudio: 2.36
Standard Accuracy for project appceleratorstudio: 36.88
Running inference for project trained on mesos and tested on usergrid
cross project inference using model #66 trained on mesos for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project mesos: 1.38
MDAE for project mesos: 1.03
Standard Accuracy for project mesos: 41.3
Running inference for project trained on usergrid and tested on mesos
cross project inference using model #66 trained on usergrid for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project usergrid: 1.6
MDAE for project usergrid: 1.26
Standard Accuracy for project usergrid: 38.01
Running inference for project trained on appceleratorstudio and tested on aptanastudio
cross project inference using model #66 trained on appceleratorstudio for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project appceleratorstudio: 4.28
MDAE for project appceleratorstudio: 2.74
Standard Accuracy for project appceleratorstudio: 43.11
Running inference for project trained on appceleratorstudio and tested on titanium
cross project inference using model #66 trained on appceleratorstudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project appceleratorstudio: 3.41
MDAE for project appceleratorstudio: 2.29
Standard Accuracy for project appceleratorstudio: 41.44
Running inference for project trained on titanium and tested on appceleratorstudio
cross project inference using model #66 trained on titanium for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project titanium: 2.36
MDAE for project titanium: 1.66
Standard Accuracy for project titanium: 54.07
Running inference for project trained on aptanastudio and tested on titanium
cross project inference using model #66 trained on aptanastudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project aptanastudio: 4.11
MDAE for project aptanastudio: 3.75
Standard Accuracy for project aptanastudio: 29.36
Running inference for project trained on mule and tested on mulestudio
cross project inference using model #66 trained on mule for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project mule: 3.43
MDAE for project mule: 2.34
Standard Accuracy for project mule: 41.84
Running inference for project trained on mulestudio and tested on mule
cross project inference using model #66 trained on mulestudio for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project mulestudio: 3.09
MDAE for project mulestudio: 2.46
Standard Accuracy for project mulestudio: 32.61
Running inference for project trained on clover and tested on usergrid
cross project inference using model #666 trained on clover for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project clover: 2.07
MDAE for project clover: 1.58
Standard Accuracy for project clover: 11.85
Running inference for project trained on talendesb and tested on mesos
cross project inference using model #666 trained on talendesb for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project talendesb: 1.6
MDAE for project talendesb: 1.0
Standard Accuracy for project talendesb: 38.26
Running inference for project trained on talenddataquality and tested on aptanastudio
cross project inference using model #666 trained on talenddataquality for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project talenddataquality: 4.54
MDAE for project talenddataquality: 3.16
Standard Accuracy for project talenddataquality: 39.65
Running inference for project trained on mule and tested on titanium
cross project inference using model #666 trained on mule for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project mule: 3.33
MDAE for project mule: 2.29
Standard Accuracy for project mule: 42.67
Running inference for project trained on talenddataquality and tested on appceleratorstudio
cross project inference using model #666 trained on talenddataquality for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project talenddataquality: 2.98
MDAE for project talenddataquality: 2.38
Standard Accuracy for project talenddataquality: 41.94
Running inference for project trained on mulestudio and tested on titanium
cross project inference using model #666 trained on mulestudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project mulestudio: 3.51
MDAE for project mulestudio: 2.52
Standard Accuracy for project mulestudio: 39.6
Running inference for project trained on appceleratorstudio and tested on mulestudio
cross project inference using model #666 trained on appceleratorstudio for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project appceleratorstudio: 3.39
MDAE for project appceleratorstudio: 2.53
Standard Accuracy for project appceleratorstudio: 42.49
Running inference for project trained on appceleratorstudio and tested on mule
cross project inference using model #666 trained on appceleratorstudio for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using wordpiece tokenizer!
MAE for project appceleratorstudio: 3.01
MDAE for project appceleratorstudio: 2.73
Standard Accuracy for project appceleratorstudio: 34.23
Running inference for project trained on mesos and tested on usergrid
cross project inference using model #77 trained on mesos for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project mesos: 1.05
MDAE for project mesos: 0.67
Standard Accuracy for project mesos: 55.22
Running inference for project trained on usergrid and tested on mesos
cross project inference using model #77 trained on usergrid for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project usergrid: 1.56
MDAE for project usergrid: 0.97
Standard Accuracy for project usergrid: 39.53
Running inference for project trained on appceleratorstudio and tested on aptanastudio
cross project inference using model #77 trained on appceleratorstudio for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project appceleratorstudio: 4.31
MDAE for project appceleratorstudio: 2.69
Standard Accuracy for project appceleratorstudio: 42.62
Running inference for project trained on appceleratorstudio and tested on titanium
cross project inference using model #77 trained on appceleratorstudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project appceleratorstudio: 3.3
MDAE for project appceleratorstudio: 2.39
Standard Accuracy for project appceleratorstudio: 43.19
Running inference for project trained on titanium and tested on appceleratorstudio
cross project inference using model #77 trained on titanium for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project titanium: 2.36
MDAE for project titanium: 1.76
Standard Accuracy for project titanium: 54.14
Running inference for project trained on aptanastudio and tested on titanium
cross project inference using model #77 trained on aptanastudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project aptanastudio: 3.78
MDAE for project aptanastudio: 2.91
Standard Accuracy for project aptanastudio: 35.0
Running inference for project trained on mule and tested on mulestudio
cross project inference using model #77 trained on mule for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project mule: 3.65
MDAE for project mule: 1.99
Standard Accuracy for project mule: 38.13
Running inference for project trained on mulestudio and tested on mule
cross project inference using model #77 trained on mulestudio for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project mulestudio: 3.31
MDAE for project mulestudio: 2.86
Standard Accuracy for project mulestudio: 27.74
Running inference for project trained on clover and tested on usergrid
cross project inference using model #777 trained on clover for project usergrid
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project clover: 1.48
MDAE for project clover: 1.25
Standard Accuracy for project clover: 37.04
Running inference for project trained on talendesb and tested on mesos
cross project inference using model #777 trained on talendesb for project mesos
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project talendesb: 1.66
MDAE for project talendesb: 1.06
Standard Accuracy for project talendesb: 35.73
Running inference for project trained on talenddataquality and tested on aptanastudio
cross project inference using model #777 trained on talenddataquality for project aptanastudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project talenddataquality: 4.73
MDAE for project talenddataquality: 3.21
Standard Accuracy for project talenddataquality: 37.03
Running inference for project trained on mule and tested on titanium
cross project inference using model #777 trained on mule for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project mule: 3.31
MDAE for project mule: 2.15
Standard Accuracy for project mule: 43.13
Running inference for project trained on talenddataquality and tested on appceleratorstudio
cross project inference using model #777 trained on talenddataquality for project appceleratorstudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project talenddataquality: 2.49
MDAE for project talenddataquality: 1.93
Standard Accuracy for project talenddataquality: 51.43
Running inference for project trained on mulestudio and tested on titanium
cross project inference using model #777 trained on mulestudio for project titanium
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project mulestudio: 3.97
MDAE for project mulestudio: 3.45
Standard Accuracy for project mulestudio: 31.78
Running inference for project trained on appceleratorstudio and tested on mulestudio
cross project inference using model #777 trained on appceleratorstudio for project mulestudio
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project appceleratorstudio: 3.53
MDAE for project appceleratorstudio: 2.46
Standard Accuracy for project appceleratorstudio: 40.16
Running inference for project trained on appceleratorstudio and tested on mule
cross project inference using model #777 trained on appceleratorstudio for project mule
Loading model from Hugging Face...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using sentencepiece tokenizer!
MAE for project appceleratorstudio: 2.77
MDAE for project appceleratorstudio: 2.44
Standard Accuracy for project appceleratorstudio: 39.48
All projects processed


Unnamed: 0,Train,Test,Llama3.2,Llama3.2+SPWordLevel,Llama3.2+SPWordPiece,Llama3.2+SPSentencePiece
0,mesos,usergrid,1.34,1.4,1.38,1.05
1,usergrid,mesos,1.74,1.73,1.6,1.56
2,appceleratorstudio,aptanastudio,4.36,5.23,4.28,4.31
3,appceleratorstudio,titanium,3.36,3.77,3.41,3.3
4,titanium,appceleratorstudio,2.55,2.64,2.36,2.36
5,aptanastudio,titanium,3.78,3.64,4.11,3.78
6,mule,mulestudio,3.57,4.07,3.43,3.65
7,mulestudio,mule,2.98,3.07,3.09,3.31
8,clover,usergrid,2.03,2.53,2.07,1.48
9,talendesb,mesos,1.52,1.95,1.6,1.66


##### Optional: Save the results

You can save the results of the DataFrame to a CSV file for further analysis or record-keeping. Run the following script to save the `llama3_cross_df` DataFrame to a CSV file.

In [6]:
# Save the DataFrtaFrame to a CSV file
llama3_cross_df.to_csv('./data_model_analysis/Llama3_cross_results.csv', index=False)

print("CSV file 'Llama3_cross_results.csv' created successfully.")

CSV file 'Llama3_cross_results.csv' created successfully.


In [6]:
llama3_cross_df_mdae.to_csv('./data_model_analysis/Llama3_cross_results_mdae.csv', index=False)
llama3_cross_df_sa.to_csv('./data_model_analysis/Llama3_cross_results_sa.csv', index=False)

print("CSV file 'Llama3_cross_results_mdae.csv' and 'Llama3_cross_results_sa.csv' created successfully.")

CSV file 'Llama3_cross_results_mdae.csv' and 'Llama3_cross_results_sa.csv' created successfully.


##### Run the cell below to do inference on testing dataset using **local** trained model on all cross projects

In [7]:
print("Loading models from local on Cross projects inference...")

cross_projects = [item for sublist in PROJECTS[1:] for item in sublist]
cross_maes, cross_mdaes, cross_saes = local_model_inference(cross_projects)
cross_df = create_df(cross_maes, cross_mdaes, cross_saes, cross_projects)

cross_df

Loading models from local on Cross projects inference...
within project inference using model ./models/mesos_usergrid_epo_6 for project mesos
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  302
using llama3 tokenizer!


  test_seq = torch.tensor(tokens_test['input_ids'])


Model: mesos_usergrid, MAE: 1.37, MdAE: 1.06, SA: 39.19
within project inference using model ./models/usergrid_mesos_epo_4 for project usergrid
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  86
using llama3 tokenizer!
Model: usergrid_mesos, MAE: 1.45, MdAE: 1.2, SA: 40.64
within project inference using model ./models/appceleratorstudio_aptanastudio_epo_0 for project appceleratorstudio
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using llama3 tokenizer!
Model: appceleratorstudio_aptanastudio, MAE: 1.53, MdAE: 1.23, SA: 66.62
within project inference using model ./models/appceleratorstudio_titanium_epo_0 for project appceleratorstudio
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using llama3 tokenizer!
Model: appceleratorstudio_titanium, MAE: 1.7, MdAE: 1.45, SA: 62.93
within project inference using model ./models/titanium_appceleratorstudio_epo_0 for project titanium
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  405
using llama3 tokenizer!
Model: titanium_appceleratorstudio, MAE: 2.65, MdAE: 2.19, SA: 44.29
within project inference using model ./models/aptanastudio_titanium_epo_2 for project aptanastudio
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  149
using llama3 tokenizer!
Model: aptanastudio_titanium, MAE: 3.48, MdAE: 2.8, SA: 48.17
within project inference using model ./models/mule_mulestudio_epo_16 for project mule
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  159
using llama3 tokenizer!
Model: mule_mulestudio, MAE: 2.73, MdAE: 2.52, SA: 33.71
within project inference using model ./models/mulestudio_mule_epo_6 for project mulestudio
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  131
using llama3 tokenizer!
Model: mulestudio_mule, MAE: 3.87, MdAE: 3.04, SA: 39.99
within project inference using model ./models/clover_usergrid_epo_8 for project clover
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  69
using llama3 tokenizer!
Model: clover_usergrid, MAE: 4.17, MdAE: 2.55, SA: 6.6
within project inference using model ./models/talendesb_mesos_epo_15 for project talendesb
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  156
using llama3 tokenizer!
Model: talendesb_mesos, MAE: 1.07, MdAE: 0.87, SA: 30.58
within project inference using model ./models/talenddataquality_aptanastudio_epo_3 for project talenddataquality
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  248
using llama3 tokenizer!
Model: talenddataquality_aptanastudio, MAE: 3.71, MdAE: 3.4, SA: -27.29
within project inference using model ./models/mule_titanium_epo_13 for project mule
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  159
using llama3 tokenizer!
Model: mule_titanium, MAE: 2.68, MdAE: 2.33, SA: 35.1
within project inference using model ./models/talenddataquality_appceleratorstudio_epo_0 for project talenddataquality
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  248
using llama3 tokenizer!
Model: talenddataquality_appceleratorstudio, MAE: 3.92, MdAE: 3.91, SA: -34.41
within project inference using model ./models/mulestudio_titanium_epo_1 for project mulestudio
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  131
using llama3 tokenizer!
Model: mulestudio_titanium, MAE: 3.85, MdAE: 3.12, SA: 40.32
within project inference using model ./models/appceleratorstudio_mulestudio_epo_0 for project appceleratorstudio
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using llama3 tokenizer!
Model: appceleratorstudio_mulestudio, MAE: 1.74, MdAE: 1.54, SA: 61.97
within project inference using model ./models/appceleratorstudio_mule_epo_2 for project appceleratorstudio
Loading model from local...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Size:  512
using llama3 tokenizer!
Model: appceleratorstudio_mule, MAE: 1.91, MdAE: 1.71, SA: 58.32


Unnamed: 0,Train,Test,MAE,MdAE,SA
0,mesos,usergrid,1.37,1.06,39.19
1,usergrid,mesos,1.45,1.2,40.64
2,appceleratorstudio,aptanastudio,1.53,1.23,66.62
3,appceleratorstudio,titanium,1.7,1.45,62.93
4,titanium,appceleratorstudio,2.65,2.19,44.29
5,aptanastudio,titanium,3.48,2.8,48.17
6,mule,mulestudio,2.73,2.52,33.71
7,mulestudio,mule,3.87,3.04,39.99
8,clover,usergrid,4.17,2.55,6.6
9,talendesb,mesos,1.07,0.87,30.58


##### Optional: Save the results

You can save the results of the DataFrame to a CSV file for further analysis or record-keeping. Run the following script to save the `cross_df` DataFrame to a CSV file.

In [8]:
# Save the DataFrtaFrame to a CSV file
cross_df.to_csv('./data_model_analysis/Llama3SP_cross_results.csv', index=False)

print("CSV file 'Llama3SP_cross_results.csv' created successfully.")

CSV file 'Llama3SP_cross_results.csv' created successfully.
