In [9]:
pip install --upgrade transformers torch


Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading torch-2.4.0-cp39-none-macosx_11_0_arm64.whl.metadata (26 kB)
Downloading transformers-4.44.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading torch-2.4.0-cp39-none-macosx_11_0_arm64.whl (62.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: torch, transformers
  Attempting uninstall: torch
    Found existing installation: torch 2.3.1
    Uninstalling torch-2.3.1:
      Successfully uninstalled torch-2.3.1
  Attemptin

In [5]:
import torch

# Flag is True only when a CUDA-enabled GPU is available. MPS (Apple silicon) is
# NOT checked here, so on an MPS-only or CPU-only machine this evaluates to False.
# The TrainingArguments visible later in this notebook hard-code fp16=False and
# do not read this flag.
use_fp16 = torch.cuda.is_available()
use_fp16

False

In [1]:
import torch

# Pick the compute device in priority order: CUDA GPU, then Apple MPS, then CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
import torch.nn.functional as F
from transformers import TrainingArguments


# Download NLTK stop words
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the dataset
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run
# on another machine without editing this line.
file_path = "/Users/aaryanshah/Oncampus-Job/NLP_Gal/data/TrainingSet 1(Deals).csv"
df = pd.read_csv(file_path)

# Free-text columns that are concatenated into a single model input per deal
text_columns = ['Target Business Description', 'M&A Headline', 'Deal Synopsis']
# Missing cells are dropped; remaining cells are cast to str before joining so a
# non-string value (e.g. a cell parsed as a number) cannot make str.join raise
# TypeError. Cells that are already strings are unchanged by the cast.
df['combined_text'] = df[text_columns].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

# Normalise raw deal text before keyword matching and tokenisation
def clean_text(text):
    """Return a lower-cased, whitespace-normalised copy of ``text``.

    In order: lower-case the input, delete URLs (``http(s)://...`` or
    ``www....``), delete ``<...>`` tag-like spans, delete runs of digits,
    turn line breaks into spaces, collapse every whitespace run to a single
    space and strip leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text (may be empty).
    """
    cleaned = text.lower()
    # Removal passes; the order matters (URLs go before tags and digits).
    for pattern in (r'https?://\S+|www\.\S+', r'<.*?>', r'\d+'):
        cleaned = re.sub(pattern, '', cleaned)
    for line_break in ('\n', '\r'):
        cleaned = cleaned.replace(line_break, ' ')
    return re.sub(r'\s+', ' ', cleaned).strip()

# Element-wise cleaning of the combined text; one cleaned string per row.
df['cleaned_text'] = df['combined_text'].map(clean_text)

# Define score columns
score_columns = ['Singh', 'Arora', 'Neilsen', 'RH', 'Uche', 'Edrick']

# Blank out, per row, the single score that lies farthest from the row median
def remove_outlier_manual(row):
    """Return a copy of ``row`` with its most extreme score replaced by NaN.

    Only the columns listed in ``score_columns`` are considered. Missing scores
    are ignored. If more than one score is present, the score with the largest
    absolute distance from the median of the present scores is set to NaN; on
    ties the first such column (in ``score_columns`` order) is chosen. Rows
    with zero or one score are returned unchanged.

    The input row is never modified: ``DataFrame.apply`` does not support
    functions that mutate the object they are given, so the change is made on
    a copy.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row containing at least the ``score_columns`` labels.

    Returns
    -------
    pandas.Series
        The row, with at most one score column set to NaN.
    """
    # Cast to float so median/abs work even when the row Series is object dtype
    # (which happens when the row also carries text columns).
    scores = row[score_columns].dropna().astype(float)
    if len(scores) > 1:  # Need at least two scores to compare
        deviations = (scores - scores.median()).abs()
        row = row.copy()
        # idxmax returns the label of the first maximum, i.e. the first tied column
        row.at[deviations.idxmax()] = np.nan
    return row

# Apply the outlier removal function
df_cleaned = df.apply(remove_outlier_manual, axis=1)

# Calculate the average of the remaining scores for each row
df_cleaned['Average'] = df_cleaned[score_columns].mean(axis=1)

# Drop rows that have no usable score at all
df_cleaned = df_cleaned.dropna(subset=['Average'])

# Keep only rows whose average lies in [1, 5]. The explicit copy makes
# df_cleaned an independent frame, so the column assignments that follow write
# to it directly instead of to a slice of the previous frame
# (avoids SettingWithCopyWarning and ambiguous writes).
df_cleaned = df_cleaned[df_cleaned['Average'].between(1, 5)].copy()

# Turn the average score into a 0-indexed class label. astype(int) truncates
# toward zero, so e.g. 3.8 -> 3 -> label 2, and 5.0 -> label 4.
target_column = 'Average'
df_cleaned['labels'] = df_cleaned[target_column].astype(int) - 1  # Make sure labels are 0-indexed for classification


# Domain-specific vocabulary: "core" terms.
# Stored as a set so keyword_features can intersect it with a text's tokens.
# Entries are lower-case single tokens without punctuation.
core_keywords = {
    "crude", "shale", "hydrocarbon", "methane", "petroleum", "gas", "oil",
    "gasoline", "diesel", "ethanol", "propane", "petrol", "coke", "gasohol",
    "coal", "lng", "barrel", "condensate", "benzene", "bitumen", "asphalt",
    "coking", "butane", "kerosene", "bopd", "butene", "fossil", "liqufied",
    "dlng", "fuel", "engine", "oilgas", "petrolium", "petroli", "petroleos",
    "gasrelated", "gasrec", "petroluem", "petrolera", "gallon", "gasfueled",
    "gasinvest", "water", "fluid", "liquide", "grease", "liquor", "rock",
    "gasket", "condes", "octanex", "toilet", "carbide", "catalytic", "vapor",
    "coolant", "boilers", "hydrogenfuelled", "lignite", "pine", "pollution",
    "pygas", "mortar", "exxon", "exxonmobil", "chevron", "cement", "waste",
    "concrete", "hydrogen", "liquefied", "hydrothermal", "liquid", "sewage",
    "plastic", "polypropylene", "polyethylene", "polymer", "propylene",
    "polystyrene", "polyester", "copolymer", "polyvinyl", "xylene", "polysterene",
    "polyurethane", "polyethlyene", "polystyrol", "polythene", "polyprophylene",
    "polythylene", "polybutylene", "polyblend", "petrolube", "corplubricants",
    "petroquimica", "chemical", "lubricant", "petrochemical", "compound",
    "pharmaceutical", "fertilizer", "ethylene", "pvc", "additive", "catalyst",
    "lubricating", "olefin", "lube", "butadiene", "aromatics", "thermoplastic",
    "textile", "solvent", "thiochemicals", "bauxite", "mtbe", "acetylene",
    "oleochemicals", "acetate", "ammonia", "ammonium", "aromatic", "bicarbonate",
    "biobutanol", "chlorochemicals", "ethylbenzene", "fertiliser", "herbicide",
    "isophtheric", "isopropyl", "oleochemical", "mbpd", "petrochemistry",
    "phosphate", "phosphorus", "butadine", "cfc", "lubricants", "olefins",
    "polymerization", "pyrolysis", "chemicals", "inorganic", "organic",
    "synthetic", "composite", "lubricated", "hydrated", "hydrochloric",
    "nitrogenous", "reaction", "drug", "hydrate", "foam", "synthesis",
    "chemistry", "hydrogenrich", "minerales", "contaminant", "mineral",
    "uranium", "silicate", "production", "manufacture", "produce",
    "construction", "distributor", "marketing", "supply", "building",
    "producing", "manufacturing", "trading", "advertising", "supplying",
    "constructing", "selling", "fabrication", "purchasing", "procurement",
    "sourcing", "develop", "developing", "sell", "maintenance", "consulting",
    "handling", "distributing", "welding", "imaging", "repairing", "billing",
    "sales", "hr", "wholesale", "business", "manufacturer", "industrial",
    "technology", "industry", "market", "producer", "financial", "acquisition",
    "produced", "vendor", "acquires", "revenue", "marketed", "industriale",
    "factory", "build", "stock", "purchase", "processed", "enterprise",
    "trader", "constructed", "supplier", "industria", "making", "make",
    "built", "engineered", "composed", "acquiring", "industrials",
    "corpfilling", "produkte", "corporate", "delivering", "development",
    "equipment", "establishment", "management", "operate", "operates",
    "operating", "operation", "service", "software", "services", "design",
    "maintaining", "manage", "managed", "manages", "developer", "machinery",
    "repair", "commodity", "servicing", "distributes", "specializing",
    "coordination", "designing", "distribute", "exchange", "growth",
    "ltdmarketing", "machining", "maintain", "managing", "operational",
    "partnership", "farming", "program", "proprietary", "woodworking",
    "deves", "strategic", "training", "assembly", "resale", "resells",
    "tech", "branded", "commercially", "contracting", "cooperative",
    "developement", "organizing", "engineer", "finishing", "formation",
    "foundry", "programs", "quality", "operations", "planning", "productive",
    "prospecting", "merger", "inventory", "company", "gmbhs", "llc", "corp",
    "subsidiary", "corporation", "warehousing", "transportation", "vehicle",
    "transport", "automotive", "delivery", "logistics", "cargo", "car",
    "shipping", "truck", "packing", "hauling", "railway", "logistical",
    "trucking", "harbor", "seaport", "distribution", "storage", "port",
    "container", "warehouse", "bunker", "ship", "maersk", "railroad",
    "tankers", "sea", "coast", "marine", "submarine", "maritime", "travel",
    "automobile", "transit", "passenger", "ford", "freight", "parcel",
    "bus", "drive", "truckstop", "tow", "destination", "vehicular", "traffic",
    "towing", "transporte", "facility", "infrastructure", "transmission",
    "packaging", "aircraft", "courier", "subway", "trucks", "carrier",
    "deliver", "highway", "postal", "route", "drives", "airport", "place",
    "stations", "airborne", "package", "tanker", "transportadora",
    "trolley", "vessel", "carport", "dealership", "fleet", "shipment",
    "carriers", "facilities", "cars", "landfill", "military", "crew",
    "tacoma", "airline", "offshore", "aviation", "retail", "retailer",
    "supermarket", "shop", "shopping", "store", "station", "retailing",
    "sale", "merchant", "commerce", "customer", "wholesaler", "dealer",
    "catalogue", "cataloguer", "shopsouvenir", "refinery", "gasification",
    "distillation", "cracking", "refueling", "fueling", "exploration",
    "mining", "refining", "engineering", "processing", "drilling",
    "midstream", "downstream", "refine", "upstream", "extraction",
    "regasification", "refineria", "discovery", "exploring", "pumping",
    "explore", "explores", "drill", "drilled", "refinement", "surveying",
    "sensing", "lcm", "directional", "rotary", "blowout", "alkylation",
    "isomerization", "stratigraphy", "petrology", "sedimentology",
    "stratigraphic", "hydrocracking", "hydrotreating", "hydrodesulfurization",
    "geology", "sagd", "dehydrogenation", "css", "fcc", "psa", "drainage",
    "refined", "sands", "mapping", "process", "study", "hydraulic", "flow",
    "geological", "refines", "blending", "remediation", "reprocess",
    "concentrated", "convey", "pressure", "expertise", "processes",
    "project", "studies", "gi", "limestone", "liquefying", "plating",
    "scientific", "ground", "seismic", "sand", "pipeline", "oilfield",
    "pipe", "rig", "reservoir", "wellhead", "well", "pipelines", "borehole",
    "lidar", "wellbore", "drillships", "rigs", "bops", "eor", "mwd", "lwd",
    "ipr", "basin", "farm", "lp", "tube", "pumped", "tubing", "inflow",
    "tank", "farms", "creek", "deepwater", "rifle", "valve", "leak",
    "cylinder", "cavern", "coalbed", "sink", "ductor", "finpipe",
    "shalebased", "tanks", "pumps", "mud", "piped", "piston", "plumbing",
    "pump", "combustion", "gasfired", "energy", "electric", "electricity",
    "mw", "battery", "turbine", "capacity", "heating", "megawatt",
    "cogeneration", "electrical", "grid", "generator", "gigawatthours",
    "highvoltage", "inverters", "turbines", "generated", "generates",
    "furnace", "unit", "heat", "thermal", "compressor", "efficiency",
    "generating", "galvanized", "regulator", "outlet", "exhaust", "units",
    "furnaces", "power", "inverter", "powergen", "powertrain", "propulsion",
    "micropower", "molten", "tesla", "emission", "reactors"

    # Add more core keywords as necessary...
}

# Domain-specific vocabulary: "emerging" terms.
# Same conventions as the core list: a set of lower-case, punctuation-free tokens.
emerging_keywords = {
    "solar", "photovoltaic", "solarbased", "solarworld", "solarpack",
    "solares", "solarkonzept", "sunlight", "sunpower", "solarbridge",
    "wind", "renewable", "renewables", "sustainable", "hydroelectric",
    "cleantech", "wave", "hydro", "recycled", "lightsource", "ecoplastic",
    "windpower", "eco", "ecooils", "seaenergy", "tidal", "electrolysis",
    "otec", "ocean", "plant", "pv", "hydros", "energetici", "natureworks",
    "terra", "atmospheric", "cloud", "energetica", "energeticas",
    "environment", "environmentally", "plants", "nature", "environmental",
    "nuclear", "biofuel", "biofuels", "bioenergia", "biocatalyst",
    "biological", "bioenergy", "bio", "bioethanol", "biosev",
    "regenerative", "bioengineering", "biopower", "regeneration",
    "biogaz", "biotech", "wastetoenergy", "bioleo", "bioforming",
    "bioproducts", "bioreactor", "biochem", "biosolutions", "biogas",
    "biomass", "biodiesel", "sugarcane", "biorefineries", "biochemical",
    "biochemtex", "biogazowa", "bioprocesses", "biorefinery", "lfg",
    "biotechnology", "botanical", "enzyme"
    # Add more emerging keywords as necessary...
}

# Create Features Based on Keyword Presence
def keyword_features(text):
    """Count how many distinct core / emerging keywords occur in ``text``.

    The text is split on whitespace after punctuation has been removed, so a
    token such as ``"oil,"`` matches ``"oil"`` and ``"gas-fired"`` matches the
    vocabulary entry ``"gasfired"``. Without this step, tokens with attached
    punctuation never match, because clean_text leaves punctuation in place
    while the keyword sets hold punctuation-free forms.

    Parameters
    ----------
    text : str
        Cleaned (lower-cased) text.

    Returns
    -------
    pandas.Series
        Two integers in positional order: the number of distinct
        ``core_keywords`` present, then the number of distinct
        ``emerging_keywords`` present. Repeated occurrences of the same
        keyword count once.
    """
    # Drop every character that is neither a word character nor whitespace.
    tokens = set(re.sub(r'[^\w\s]', '', text).split())
    core_count = len(core_keywords & tokens)
    emerging_count = len(emerging_keywords & tokens)
    return pd.Series([core_count, emerging_count])

# Attach the two keyword-count features to the frame (one row of counts per text)
df_cleaned[['core_count', 'emerging_count']] = df_cleaned['cleaned_text'].apply(keyword_features)

# Pre-tokenize every cleaned text once, up front, so each CV fold can reuse the
# encodings. Each entry is the tokenizer output for one text, padded/truncated
# to max_len tokens and returned as PyTorch tensors.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_len = 512
tokenized_texts = [
    tokenizer.encode_plus(
        cleaned,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    for cleaned in df_cleaned['cleaned_text']
]

# Dataset wrapper around the pre-tokenized texts, labels and keyword counts
class CustomTextDataset(Dataset):
    """Map-style dataset that serves pre-tokenized examples as dicts of tensors.

    Parameters
    ----------
    tokenized_data : sequence of mappings
        One mapping per example with ``'input_ids'`` and ``'attention_mask'``
        tensors; their leading dimension is squeezed away before stacking.
    labels : sequence
        One label per example.
    core_counts, emerging_counts : sequence of numbers
        Keyword-count features per example; stored as float32 tensors.
    """

    def __init__(self, tokenized_data, labels, core_counts, emerging_counts):
        def stack_field(field):
            # Squeeze dim 0 of each example's tensor, then stack the examples
            # into one tensor along a new leading dimension.
            return torch.stack([encoding[field].squeeze(0) for encoding in tokenized_data])

        self.input_ids = stack_field('input_ids')
        self.attention_mask = stack_field('attention_mask')
        self.labels = torch.tensor(labels)
        self.core_counts = torch.tensor(core_counts, dtype=torch.float32)
        self.emerging_counts = torch.tensor(emerging_counts, dtype=torch.float32)

    def __len__(self):
        """Number of examples (length of the label tensor)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict keyed by field name."""
        fields = (
            ('input_ids', self.input_ids),
            ('attention_mask', self.attention_mask),
            ('labels', self.labels),
            ('core_count', self.core_counts),
            ('emerging_count', self.emerging_counts),
        )
        return {name: values[idx] for name, values in fields}

# RoBERTa classifier with a class-weighted loss; its forward() also accepts the
# keyword-count fields produced by CustomTextDataset.

class CustomRobertaForSequenceClassification(RobertaForSequenceClassification):
    """RoBERTa sequence classifier trained with class-weighted cross-entropy.

    ``forward`` accepts ``core_count`` and ``emerging_count`` so that batches
    containing those keys can be passed straight to the model, but the counts
    are deliberately not mixed into the logits.

    The previous implementation added
    ``count * (0.1 + 0.05 * [label == 4] + 0.05 * [label == 5])`` to the logits
    whenever labels were supplied. That term was removed because:

    * it was the same value for every class within an example, so it cancels
      in softmax / cross-entropy / argmax and never changed loss or prediction;
    * it was computed from the ground-truth labels, which a forward pass must
      not depend on (and ``label == 5`` cannot occur with 0-4 labels);
    * it was applied only when labels were present, so the returned logits
      differed between labelled and unlabelled calls.

    To make the keyword counts influence predictions they would have to
    contribute a class-dependent, label-independent term (for example through
    a learned projection); that is a modelling change and is not done here.
    """

    def forward(self, input_ids, attention_mask=None, labels=None, core_count=None, emerging_count=None):
        """Run the classifier and, when labels are given, compute the weighted loss.

        Parameters
        ----------
        input_ids, attention_mask : torch.Tensor
            Forwarded unchanged to ``RobertaForSequenceClassification.forward``.
        labels : torch.Tensor, optional
            Class indices; when provided a loss is returned as well.
        core_count, emerging_count : torch.Tensor, optional
            Accepted for interface compatibility; currently unused (see class
            docstring).

        Returns
        -------
        torch.Tensor or tuple
            ``logits`` when ``labels`` is None, otherwise ``(loss, logits)``.
        """
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        if labels is None:
            return logits

        # Fixed per-class loss weights (one entry per class, so this requires
        # num_labels == 5); the last class is weighted most heavily.
        custom_weights = torch.tensor([1.0, 1.2, 1.2, 1.5, 2.0], dtype=torch.float, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=custom_weights)
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss, logits)



# Cross-validation setup: 5 stratified folds, shuffled with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 0

# Prepare results storage
all_fold_results = []

# Training arguments (shared by every fold); mixed precision is disabled.
#
# Warmup: only warmup_steps is set. The original configuration also passed
# warmup_ratio=0.1, but TrainingArguments ignores warmup_ratio whenever
# warmup_steps > 0, so that argument had no effect and was removed.
# NOTE(review): the recorded run has 889 optimizer steps per fold and its logged
# learning rate only reaches 2e-05 at epoch 3.94, i.e. warmup covers more than
# half of training. If ~10% warmup was intended, replace warmup_steps=500 with
# warmup_ratio=0.1 (this changes the schedule and therefore the results).
training_args = TrainingArguments(
    output_dir='./results',  # Directory for model checkpoints
    num_train_epochs=7,  # Train for 7 epochs
    per_device_train_batch_size=8,  # Per-device batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    warmup_steps=500,  # Linear learning-rate warmup over the first 500 optimizer steps
    weight_decay=0.01,  # Weight decay for regularization
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=50,  # Log training metrics every 50 optimizer steps
    evaluation_strategy="epoch",  # Evaluate model at the end of each epoch
    learning_rate=2e-5,  # Peak learning rate reached at the end of warmup
    lr_scheduler_type='linear',  # Linear decay after warmup
    fp16=False,  # Disable mixed precision training
    gradient_accumulation_steps=2,  # Accumulate 2 batches per optimizer step (8 x 2 examples per device)
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="eval_loss",  # Use evaluation loss to determine the best model
    save_total_limit=2  # Limit the total amount of checkpoints; older ones are deleted
)



def compute_fold_metrics(eval_pred):
    """Compute accuracy and support-weighted F1 for one evaluation pass.

    Parameters
    ----------
    eval_pred
        Object with ``predictions`` (per-class scores, one row per example)
        and ``label_ids`` (true class indices), as passed by ``Trainer``.

    Returns
    -------
    dict
        ``{'accuracy': float, 'f1': float}``.
    """
    predicted = np.argmax(eval_pred.predictions, axis=1)
    report = classification_report(
        eval_pred.label_ids,
        predicted,
        # Fix the label set explicitly: without it, classification_report
        # raises when a class is absent from both the labels and predictions,
        # because the number of observed classes no longer matches target_names.
        labels=list(range(5)),
        target_names=[str(i+1) for i in range(5)],
        output_dict=True,
        # A class that is never predicted has undefined precision; score it 0
        # (the value already used by default) without emitting a warning.
        zero_division=0,
    )
    return {
        'accuracy': accuracy_score(eval_pred.label_ids, predicted),
        'f1': report['weighted avg']['f1-score'],
    }


# NOTE(review): each fold's held-out split is used both as the Trainer's
# eval_dataset (which drives early stopping and best-checkpoint selection) and
# as the split whose scores are reported, so the reported numbers are
# optimistic. A separate validation split taken from the training fold would
# remove that bias.
for train_index, test_index in skf.split(df_cleaned['cleaned_text'], df_cleaned['labels']):
    fold += 1
    print(f"\nFold {fold}")

    # Create datasets (indices are positional, hence list indexing and .iloc)
    train_dataset = CustomTextDataset(
        [tokenized_texts[i] for i in train_index],
        df_cleaned['labels'].iloc[train_index].tolist(),
        df_cleaned['core_count'].iloc[train_index].tolist(),
        df_cleaned['emerging_count'].iloc[train_index].tolist()
    )
    test_dataset = CustomTextDataset(
        [tokenized_texts[i] for i in test_index],
        df_cleaned['labels'].iloc[test_index].tolist(),
        df_cleaned['core_count'].iloc[test_index].tolist(),
        df_cleaned['emerging_count'].iloc[test_index].tolist()
    )

    # Initialize a fresh model for each fold with weighted loss
    model = CustomRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)

    # Trainer with early stopping callback
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_fold_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Fold {fold} - Accuracy: {eval_result['eval_accuracy']}")
    print(f"Fold {fold} - F1-Score: {eval_result['eval_f1']}")

    all_fold_results.append({
        'fold': fold,
        'accuracy': eval_result['eval_accuracy'],
        'f1': eval_result['eval_f1']
    })

# Print one summary line per fold, collecting the scores for the averages below
fold_accuracies = []
fold_f1_scores = []
for result in all_fold_results:
    fold_accuracies.append(result['accuracy'])
    fold_f1_scores.append(result['f1'])
    print(f"Fold {result['fold']} - Accuracy: {result['accuracy']}, F1-Score: {result['f1']}")

# Unweighted mean of the per-fold scores
avg_accuracy = np.mean(fold_accuracies)
avg_f1 = np.mean(fold_f1_scores)
print(f"\nAverage Accuracy: {avg_accuracy}")
print(f"Average F1-Score: {avg_f1}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aaryanshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Fold 1


Some weights of CustomRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/889 [00:00<?, ?it/s]

{'loss': 1.6019, 'grad_norm': 3.533015251159668, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.39}
{'loss': 1.3696, 'grad_norm': 5.096282482147217, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.79}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.0518385171890259, 'eval_accuracy': 0.6094674556213018, 'eval_f1': 0.46158197006613294, 'eval_runtime': 28.4526, 'eval_samples_per_second': 17.819, 'eval_steps_per_second': 1.125, 'epoch': 1.0}
{'loss': 1.0785, 'grad_norm': 7.578855037689209, 'learning_rate': 6e-06, 'epoch': 1.18}
{'loss': 0.8371, 'grad_norm': 8.223987579345703, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.57}
{'loss': 0.7646, 'grad_norm': 7.4581403732299805, 'learning_rate': 1e-05, 'epoch': 1.97}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7563526034355164, 'eval_accuracy': 0.7159763313609467, 'eval_f1': 0.6845190133348386, 'eval_runtime': 24.159, 'eval_samples_per_second': 20.986, 'eval_steps_per_second': 1.325, 'epoch': 2.0}
{'loss': 0.7301, 'grad_norm': 19.088003158569336, 'learning_rate': 1.2e-05, 'epoch': 2.36}
{'loss': 0.6682, 'grad_norm': 9.154130935668945, 'learning_rate': 1.4e-05, 'epoch': 2.76}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7014633417129517, 'eval_accuracy': 0.7396449704142012, 'eval_f1': 0.7207743851837118, 'eval_runtime': 25.498, 'eval_samples_per_second': 19.884, 'eval_steps_per_second': 1.255, 'epoch': 3.0}
{'loss': 0.6005, 'grad_norm': 11.35035514831543, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.15}
{'loss': 0.5792, 'grad_norm': 11.842965126037598, 'learning_rate': 1.8e-05, 'epoch': 3.54}
{'loss': 0.5879, 'grad_norm': 10.356588363647461, 'learning_rate': 2e-05, 'epoch': 3.94}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.7531223297119141, 'eval_accuracy': 0.7435897435897436, 'eval_f1': 0.7312949024982321, 'eval_runtime': 23.427, 'eval_samples_per_second': 21.642, 'eval_steps_per_second': 1.366, 'epoch': 4.0}
{'loss': 0.4855, 'grad_norm': 27.87413215637207, 'learning_rate': 1.74293059125964e-05, 'epoch': 4.33}
{'loss': 0.5421, 'grad_norm': 9.233284950256348, 'learning_rate': 1.4858611825192803e-05, 'epoch': 4.72}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.7131755948066711, 'eval_accuracy': 0.73767258382643, 'eval_f1': 0.7393242515753313, 'eval_runtime': 23.9998, 'eval_samples_per_second': 21.125, 'eval_steps_per_second': 1.333, 'epoch': 5.0}
{'loss': 0.3933, 'grad_norm': 4.338791847229004, 'learning_rate': 1.2287917737789203e-05, 'epoch': 5.12}
{'loss': 0.3486, 'grad_norm': 35.08097839355469, 'learning_rate': 9.717223650385606e-06, 'epoch': 5.51}
{'loss': 0.37, 'grad_norm': 11.659290313720703, 'learning_rate': 7.1465295629820055e-06, 'epoch': 5.91}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.7039790153503418, 'eval_accuracy': 0.7633136094674556, 'eval_f1': 0.7661079350686514, 'eval_runtime': 24.2307, 'eval_samples_per_second': 20.924, 'eval_steps_per_second': 1.321, 'epoch': 6.0}
{'loss': 0.2947, 'grad_norm': 14.355135917663574, 'learning_rate': 4.575835475578407e-06, 'epoch': 6.3}
{'loss': 0.2456, 'grad_norm': 11.556453704833984, 'learning_rate': 2.0051413881748076e-06, 'epoch': 6.69}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.7707707285881042, 'eval_accuracy': 0.7652859960552268, 'eval_f1': 0.7664964720517852, 'eval_runtime': 23.2276, 'eval_samples_per_second': 21.827, 'eval_steps_per_second': 1.378, 'epoch': 7.0}
{'train_runtime': 2819.8115, 'train_samples_per_second': 5.027, 'train_steps_per_second': 0.315, 'train_loss': 0.6582898235428319, 'epoch': 7.0}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 - Accuracy: 0.7396449704142012
Fold 1 - F1-Score: 0.7207743851837118

Fold 2


Some weights of CustomRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/889 [00:00<?, ?it/s]

{'loss': 1.5946, 'grad_norm': 3.4140305519104004, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.39}
{'loss': 1.3652, 'grad_norm': 7.586578845977783, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.79}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.1705008745193481, 'eval_accuracy': 0.6094674556213018, 'eval_f1': 0.46158197006613294, 'eval_runtime': 24.3335, 'eval_samples_per_second': 20.835, 'eval_steps_per_second': 1.315, 'epoch': 1.0}
{'loss': 1.1933, 'grad_norm': 14.963207244873047, 'learning_rate': 6e-06, 'epoch': 1.18}
{'loss': 0.8945, 'grad_norm': 7.3716139793396, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.57}
{'loss': 0.8189, 'grad_norm': 7.62677526473999, 'learning_rate': 1e-05, 'epoch': 1.97}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.8117831945419312, 'eval_accuracy': 0.7080867850098619, 'eval_f1': 0.6770593816758359, 'eval_runtime': 23.8576, 'eval_samples_per_second': 21.251, 'eval_steps_per_second': 1.341, 'epoch': 2.0}
{'loss': 0.7489, 'grad_norm': 22.392475128173828, 'learning_rate': 1.2e-05, 'epoch': 2.36}
{'loss': 0.6931, 'grad_norm': 5.736989974975586, 'learning_rate': 1.4e-05, 'epoch': 2.76}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.718282163143158, 'eval_accuracy': 0.7218934911242604, 'eval_f1': 0.7299347514499277, 'eval_runtime': 24.0806, 'eval_samples_per_second': 21.054, 'eval_steps_per_second': 1.329, 'epoch': 3.0}
{'loss': 0.6495, 'grad_norm': 13.466558456420898, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.15}
{'loss': 0.5851, 'grad_norm': 25.43783187866211, 'learning_rate': 1.8e-05, 'epoch': 3.54}
{'loss': 0.6418, 'grad_norm': 9.57933521270752, 'learning_rate': 2e-05, 'epoch': 3.94}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.6371483206748962, 'eval_accuracy': 0.7435897435897436, 'eval_f1': 0.7524888197694655, 'eval_runtime': 23.5035, 'eval_samples_per_second': 21.571, 'eval_steps_per_second': 1.361, 'epoch': 4.0}
{'loss': 0.4856, 'grad_norm': 20.007740020751953, 'learning_rate': 1.74293059125964e-05, 'epoch': 4.33}
{'loss': 0.4851, 'grad_norm': 4.887363433837891, 'learning_rate': 1.4858611825192803e-05, 'epoch': 4.72}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.6993793249130249, 'eval_accuracy': 0.7554240631163708, 'eval_f1': 0.7533253600442557, 'eval_runtime': 23.1543, 'eval_samples_per_second': 21.897, 'eval_steps_per_second': 1.382, 'epoch': 5.0}
{'loss': 0.4791, 'grad_norm': 12.985546112060547, 'learning_rate': 1.2287917737789203e-05, 'epoch': 5.12}
{'loss': 0.305, 'grad_norm': 13.249201774597168, 'learning_rate': 9.717223650385606e-06, 'epoch': 5.51}
{'loss': 0.3407, 'grad_norm': 12.86470890045166, 'learning_rate': 7.1465295629820055e-06, 'epoch': 5.91}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.762715220451355, 'eval_accuracy': 0.7672583826429981, 'eval_f1': 0.7684960315361515, 'eval_runtime': 22.7268, 'eval_samples_per_second': 22.308, 'eval_steps_per_second': 1.408, 'epoch': 6.0}
{'loss': 0.2607, 'grad_norm': 18.80027198791504, 'learning_rate': 4.575835475578407e-06, 'epoch': 6.3}
{'loss': 0.2082, 'grad_norm': 10.20966911315918, 'learning_rate': 2.0051413881748076e-06, 'epoch': 6.69}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.8170561790466309, 'eval_accuracy': 0.7593688362919132, 'eval_f1': 0.7568416532285164, 'eval_runtime': 22.595, 'eval_samples_per_second': 22.439, 'eval_steps_per_second': 1.416, 'epoch': 7.0}
{'train_runtime': 2391.3036, 'train_samples_per_second': 5.928, 'train_steps_per_second': 0.372, 'train_loss': 0.6728727868491002, 'epoch': 7.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Fold 2 - Accuracy: 0.7435897435897436
Fold 2 - F1-Score: 0.7524888197694655

Fold 3


Some weights of CustomRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/889 [00:00<?, ?it/s]

{'loss': 1.5977, 'grad_norm': 2.7441699504852295, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.39}
{'loss': 1.4025, 'grad_norm': 8.70174503326416, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.79}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.1527999639511108, 'eval_accuracy': 0.6126482213438735, 'eval_f1': 0.4654925211191196, 'eval_runtime': 22.6165, 'eval_samples_per_second': 22.373, 'eval_steps_per_second': 1.415, 'epoch': 1.0}
{'loss': 1.1742, 'grad_norm': 4.826243877410889, 'learning_rate': 6e-06, 'epoch': 1.18}
{'loss': 0.8451, 'grad_norm': 8.929620742797852, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.57}
{'loss': 0.7681, 'grad_norm': 11.2863130569458, 'learning_rate': 1e-05, 'epoch': 1.97}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7418118119239807, 'eval_accuracy': 0.7450592885375494, 'eval_f1': 0.7268291407690205, 'eval_runtime': 26.4641, 'eval_samples_per_second': 19.12, 'eval_steps_per_second': 1.209, 'epoch': 2.0}
{'loss': 0.7062, 'grad_norm': 17.02804946899414, 'learning_rate': 1.2e-05, 'epoch': 2.36}
{'loss': 0.6479, 'grad_norm': 7.491434097290039, 'learning_rate': 1.4e-05, 'epoch': 2.76}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.6957205533981323, 'eval_accuracy': 0.7509881422924901, 'eval_f1': 0.71967013971288, 'eval_runtime': 68.0844, 'eval_samples_per_second': 7.432, 'eval_steps_per_second': 0.47, 'epoch': 3.0}
{'loss': 0.6394, 'grad_norm': 6.098319053649902, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.15}
{'loss': 0.6078, 'grad_norm': 11.340021133422852, 'learning_rate': 1.8e-05, 'epoch': 3.54}
{'loss': 0.5915, 'grad_norm': 12.52563190460205, 'learning_rate': 2e-05, 'epoch': 3.94}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.6626803874969482, 'eval_accuracy': 0.7628458498023716, 'eval_f1': 0.7496164792013364, 'eval_runtime': 29.7952, 'eval_samples_per_second': 16.983, 'eval_steps_per_second': 1.074, 'epoch': 4.0}
{'loss': 0.5135, 'grad_norm': 27.625699996948242, 'learning_rate': 1.74293059125964e-05, 'epoch': 4.33}
{'loss': 0.5341, 'grad_norm': 9.56180477142334, 'learning_rate': 1.4858611825192803e-05, 'epoch': 4.72}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7120953798294067, 'eval_accuracy': 0.7648221343873518, 'eval_f1': 0.7615529514068896, 'eval_runtime': 27.5241, 'eval_samples_per_second': 18.384, 'eval_steps_per_second': 1.163, 'epoch': 5.0}
{'loss': 0.4362, 'grad_norm': 8.148985862731934, 'learning_rate': 1.2287917737789203e-05, 'epoch': 5.12}
{'loss': 0.395, 'grad_norm': 14.570697784423828, 'learning_rate': 9.717223650385606e-06, 'epoch': 5.51}
{'loss': 0.2988, 'grad_norm': 12.05713939666748, 'learning_rate': 7.1465295629820055e-06, 'epoch': 5.91}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.807100772857666, 'eval_accuracy': 0.7430830039525692, 'eval_f1': 0.7367651621583413, 'eval_runtime': 30.9542, 'eval_samples_per_second': 16.347, 'eval_steps_per_second': 1.034, 'epoch': 6.0}
{'loss': 0.2943, 'grad_norm': 10.870992660522461, 'learning_rate': 4.575835475578407e-06, 'epoch': 6.3}
{'loss': 0.2452, 'grad_norm': 15.95904541015625, 'learning_rate': 2.0051413881748076e-06, 'epoch': 6.69}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.806674063205719, 'eval_accuracy': 0.7490118577075099, 'eval_f1': 0.7466997785126545, 'eval_runtime': 38.1888, 'eval_samples_per_second': 13.25, 'eval_steps_per_second': 0.838, 'epoch': 7.0}
{'train_runtime': 5242.2085, 'train_samples_per_second': 2.705, 'train_steps_per_second': 0.17, 'train_loss': 0.6684219416134537, 'epoch': 7.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Fold 3 - Accuracy: 0.7628458498023716
Fold 3 - F1-Score: 0.7496164792013364

Fold 4


Some weights of CustomRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/889 [00:00<?, ?it/s]

{'loss': 1.5952, 'grad_norm': 5.519248962402344, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.39}
{'loss': 1.3631, 'grad_norm': 6.604671001434326, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.79}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.1483759880065918, 'eval_accuracy': 0.6126482213438735, 'eval_f1': 0.4654925211191196, 'eval_runtime': 22.6517, 'eval_samples_per_second': 22.338, 'eval_steps_per_second': 1.413, 'epoch': 1.0}
{'loss': 1.1414, 'grad_norm': 12.80813217163086, 'learning_rate': 6e-06, 'epoch': 1.18}
{'loss': 0.8981, 'grad_norm': 10.222712516784668, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.57}
{'loss': 0.7806, 'grad_norm': 6.732129096984863, 'learning_rate': 1e-05, 'epoch': 1.97}


  0%|          | 0/32 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7823296785354614, 'eval_accuracy': 0.7272727272727273, 'eval_f1': 0.7192996340441216, 'eval_runtime': 341.3288, 'eval_samples_per_second': 1.482, 'eval_steps_per_second': 0.094, 'epoch': 2.0}
{'loss': 0.6856, 'grad_norm': 13.266212463378906, 'learning_rate': 1.2e-05, 'epoch': 2.36}
{'loss': 0.7135, 'grad_norm': 10.422714233398438, 'learning_rate': 1.4e-05, 'epoch': 2.76}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.7482283115386963, 'eval_accuracy': 0.7292490118577075, 'eval_f1': 0.7238912493861531, 'eval_runtime': 23.2897, 'eval_samples_per_second': 21.726, 'eval_steps_per_second': 1.374, 'epoch': 3.0}
{'loss': 0.6302, 'grad_norm': 11.530143737792969, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.15}
{'loss': 0.5706, 'grad_norm': 22.487537384033203, 'learning_rate': 1.8e-05, 'epoch': 3.54}
{'loss': 0.5901, 'grad_norm': 29.152694702148438, 'learning_rate': 2e-05, 'epoch': 3.94}


  0%|          | 0/32 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# perform hyperparameter tuning
#
# Grid-search over learning rate, number of epochs and train batch size.
# Every configuration is trained on the same stratified training split and
# scored on the same held-out validation split, so the accuracies are
# comparable and are not measured on data the model was trained on.

# Local imports so this cell does not depend on names that the notebook's
# import cell may not provide.
from sklearn.model_selection import ParameterGrid, train_test_split
from torch.utils.data import Subset

NUM_LABELS = 5             # number of target classes passed to the model head
TUNING_SEED = 42           # fixes the train/validation split across re-runs
TUNING_VAL_FRACTION = 0.2  # share of the samples held out for validation

# Define hyperparameter search space
param_space = {
    'learning_rate': [2e-5, 3e-5, 4e-5],
    'num_train_epochs': [3, 4, 5],
    'per_device_train_batch_size': [8, 16, 32]
}

# Build the full dataset once; it does not depend on the hyperparameters.
tuning_labels = df_cleaned['labels'].tolist()
tuning_full_dataset = CustomTextDataset(
    tokenized_texts,
    tuning_labels,
    df_cleaned['core_count'].tolist(),
    df_cleaned['emerging_count'].tolist()
)

# Stratified hold-out split on sample indices. Subset only relies on
# len()/integer indexing of the wrapped dataset, so no knowledge of how
# CustomTextDataset stores its tensors is required.
tuning_train_idx, tuning_val_idx = train_test_split(
    np.arange(len(tuning_labels)),
    test_size=TUNING_VAL_FRACTION,
    stratify=tuning_labels,
    random_state=TUNING_SEED,
)
train_dataset = Subset(tuning_full_dataset, tuning_train_idx.tolist())
test_dataset = Subset(tuning_full_dataset, tuning_val_idx.tolist())


def tuning_compute_metrics(p):
    """Return accuracy and weighted F1 for one evaluation pass.

    Args:
        p: prediction object handed over by ``Trainer``; ``p.predictions`` is
            reduced with argmax over axis 1 and compared to ``p.label_ids``.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'`` (weighted-average F1).
    """
    predicted = np.argmax(p.predictions, axis=1)
    report = classification_report(
        p.label_ids,
        predicted,
        # Pin the label set so target_names always matches, even if a class
        # is absent from this evaluation split.
        labels=list(range(NUM_LABELS)),
        target_names=[str(i + 1) for i in range(NUM_LABELS)],
        output_dict=True,
        # Same numeric result as the default, without the repeated
        # undefined-metric warnings for classes that were never predicted.
        zero_division=0,
    )
    return {
        'accuracy': accuracy_score(p.label_ids, predicted),
        'f1': report['weighted avg']['f1-score'],
    }


def make_tuning_trainer(model, params):
    """Build a ``Trainer`` for ``model`` using one hyperparameter combination.

    Args:
        model: freshly initialised model to fine-tune.
        params: dict with keys ``'learning_rate'``, ``'num_train_epochs'`` and
            ``'per_device_train_batch_size'``.

    Returns:
        A ``Trainer`` wired to ``train_dataset`` / ``test_dataset`` with
        per-epoch evaluation, checkpointing and early stopping.
    """
    training_args = TrainingArguments(
        output_dir='./results',  # Directory for storing model checkpoints
        num_train_epochs=params['num_train_epochs'],
        per_device_train_batch_size=params['per_device_train_batch_size'],
        per_device_eval_batch_size=16,  # Batch size for evaluation
        # Only a ratio-based warmup: a fixed warmup_steps=500 would take
        # precedence over the ratio and can exceed the whole run for the
        # shorter configurations of the grid.
        warmup_ratio=0.1,
        weight_decay=0.01,  # Weight decay for regularization
        logging_dir='./logs',  # Directory for storing logs
        logging_steps=50,  # Log training metrics every 50 steps
        evaluation_strategy="epoch",  # Evaluate at the end of each epoch
        learning_rate=params['learning_rate'],
        lr_scheduler_type='linear',  # Learning rate scheduler
        # Mixed precision is only requested when a CUDA device is present;
        # hard-coding fp16=True fails on CPU/MPS machines.
        fp16=torch.cuda.is_available(),
        gradient_accumulation_steps=2,  # Larger effective batch size
        save_strategy="epoch",  # Save a checkpoint at the end of each epoch
        load_best_model_at_end=True,  # Reload the best checkpoint after training
        metric_for_best_model="eval_loss",  # Best checkpoint = lowest eval loss
        save_total_limit=2  # Keep at most two checkpoints on disk
    )
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=tuning_compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
    )


# Perform hyperparameter tuning
best_accuracy = float('-inf')  # guarantees best_params is set after the first run
best_params = None
tuning_results = []  # one record per configuration, kept separate from the CV results

for run_idx, params in enumerate(ParameterGrid(param_space), start=1):
    print(f"Running with parameters: {params}")

    # Initialize a fresh model for each configuration
    model = CustomRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=NUM_LABELS)
    trainer = make_tuning_trainer(model, params)

    # Train the model
    trainer.train()

    # Evaluate the model on the held-out validation split
    eval_result = trainer.evaluate()

    # Remember the best configuration seen so far
    if eval_result['eval_accuracy'] > best_accuracy:
        best_accuracy = eval_result['eval_accuracy']
        best_params = params

    print(f"Run {run_idx} - Accuracy: {eval_result['eval_accuracy']}")
    print(f"Run {run_idx} - F1-Score: {eval_result['eval_f1']}")

    tuning_results.append({
        'run': run_idx,
        'params': params,
        'accuracy': eval_result['eval_accuracy'],
        'f1': eval_result['eval_f1']
    })

print(f"Best parameters: {best_params}")
print(f"Best accuracy: {best_accuracy}")

# Set up the final model with the best hyperparameters
model = CustomRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=NUM_LABELS)

# Trainer with early stopping callback; this only constructs the trainer,
# it does not start training.
trainer = make_tuning_trainer(model, best_params)