In [None]:
!pip install datasets



In [None]:
!pip install evaluate



In [None]:
from datasets import Dataset
from transformers import BartTokenizer
from transformers import BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import pandas as pd
import torch
import evaluate

In [None]:
# Select the compute device: CUDA when torch can see a GPU, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(
    "GPU is available - Using GPU"
    if device.type == 'cuda'
    else "GPU is not available - Using CPU"
)

GPU is available - Using GPU


In [None]:
def display_feature_list(features, feature_type):
    """
    Print a heading for ``feature_type`` followed by the feature names.

    The names in ``features`` are joined with ', ' on a single line; when
    ``features`` is empty (or otherwise falsy) the literal 'None' is printed.
    """
    print(f"\n{feature_type} Features: ")

    if features:
        print(', '.join(features))
    else:
        print('None')

def describe_df(df):
    """
    Print a quick overview of ``df`` and show its first and last five rows.

    Reports the shape, the number of samples, the per-column count of missing
    values and the number of duplicated rows, then lists the object-dtype
    columns via ``display_feature_list``. The names of those columns are also
    stored in the module-level ``categorical_features`` list.
    """
    global categorical_features

    # Gather every column whose dtype is 'object'
    object_columns = []
    for column in df.columns:
        if df[column].dtype == 'object':
            object_columns.append(column)
    categorical_features = object_columns

    kind = type(df).__name__
    n_rows = df.shape[0]

    print(f"\n{kind} shape: {df.shape}")
    print(f"\n{n_rows:,.0f} samples")
    print(f'\nMissing Data: \n{df.isnull().sum()}')
    print(f'\nDuplicates: {df.duplicated().sum()}')

    display_feature_list(categorical_features, 'Categorical')

    # Same heading/preview pattern for both ends of the frame
    for label, take in (('Head', df.head), ('Tail', df.tail)):
        print(f'\n{kind} {label}: \n')
        display(take(5))

In [None]:
# Load the dataset
# Reads the CSV into the DataFrame `df`; the path is relative to the current working directory.
# NOTE(review): presumably expects Google Drive to be mounted under ./drive - no mount step is
# visible in this notebook; confirm before a fresh run.
df = pd.read_csv('drive/MyDrive/IRWA_Project/Datasets/email_dataset.csv')

In [None]:
# Preview the first rows of the loaded frame (rendered as the cell output)
df.head()

Unnamed: 0,Abstract,Topic
0,We propose an adversarial training procedure...,Causal Image Generation
1,Multiple automakers have in development or i...,Ramp Merging in Autonomous Driving Systems
2,We propose a probabilistic model for interpr...,Single-Cell RNA Sequencing Analysis
3,We study the problem of generalized uniformi...,Uniformity Testing in Discrete Probability Dis...
4,"Specialized classifiers, namely those dedica...",Ensemble Methods for Specialized Classifiers


In [None]:
# Columns
# Keep the column labels as a plain Python list and print them
column_names = df.columns.tolist()
print(column_names)

['Abstract', 'Topic']


In [None]:
# Info on the Dataset
# Prints shape, missing values, duplicate count and the object-dtype columns,
# then displays the head and tail of the frame.
describe_df(df)


DataFrame shape: (1000, 2)

1,000 samples

Missing Data: 
Abstract    0
Topic       0
dtype: int64

Duplicates: 0

Categorical Features: 
Abstract, Topic

DataFrame Head: 



Unnamed: 0,Abstract,Topic
0,We propose an adversarial training procedure...,Causal Image Generation
1,Multiple automakers have in development or i...,Ramp Merging in Autonomous Driving Systems
2,We propose a probabilistic model for interpr...,Single-Cell RNA Sequencing Analysis
3,We study the problem of generalized uniformi...,Uniformity Testing in Discrete Probability Dis...
4,"Specialized classifiers, namely those dedica...",Ensemble Methods for Specialized Classifiers



DataFrame Tail: 



Unnamed: 0,Abstract,Topic
995,This paper presents an automated approach fo...,Interpretable Feature Recommendation in Signal...
996,Traditional vision-based hand gesture recogn...,Microwave-based Hand Gesture Recognition
997,Generative models such as Variational Auto E...,Latent Space Operations in Generative Models
998,A grand challenge of the 21st century cosmol...,Cosmological Parameter Estimation with Machine...
999,A central task in the field of quantum compu...,Quantum Generative Models for Machine Learning


In [None]:
# Convert to Hugging face dataset
# Wraps the pandas DataFrame in a `datasets.Dataset` so it can be split and mapped.
dataset = Dataset.from_pandas(df)

In [None]:
# Split into train and test data
# 20% of the rows are held out. The fixed seed makes the shuffle deterministic, so the
# same rows land in each split on every run and results stay comparable.
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test['train']
test_dataset = train_test['test']

# Modeling

In [None]:
# Load the pre-trained BART tokenizer
# Uses the tokenizer that belongs to the 'facebook/bart-base' checkpoint.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]



In [None]:
# Tokenization function
def tokenize_function(examples):
    """
    Tokenize abstracts and topics for sequence-to-sequence training.

    Parameters
    ----------
    examples : mapping
        Holds an 'Abstract' entry (model input text) and a 'Topic' entry
        (target text). Each may be a list of strings (batched mapping) or a
        single string.

    Returns
    -------
    The tokenizer output for the abstracts, padded/truncated to 512 tokens,
    with an added 'labels' entry containing the topic token ids
    padded/truncated to 128 tokens. Padding positions in 'labels' are set to
    -100, the label value the Transformers loss ignores, so the model is not
    trained (or scored) on predicting padding.
    """
    inputs = tokenizer(examples['Abstract'], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples['Topic'], max_length=128, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        # Swap every padding id for the ignore index
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = targets['input_ids']
    if label_ids and isinstance(label_ids[0], int):
        # Single (non-batched) example: one flat list of ids
        inputs['labels'] = mask_padding(label_ids)
    else:
        inputs['labels'] = [mask_padding(sequence) for sequence in label_ids]
    return inputs

In [None]:
# Tokenize the dataset
# Run the batched tokenizer over the train split first, then over the test split
train_tokenized, test_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Load the BART model for conditional generation
# Starts from the pre-trained 'facebook/bart-base' weights, the same checkpoint as the tokenizer.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
# Set up the training arguments
# The per-epoch evaluation option is called `eval_strategy` in newer transformers releases
# and `evaluation_strategy` in older ones; use whichever field the installed version defines.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in Seq2SeqTrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    # Only request fp16 mixed precision when a CUDA device is present, so the
    # CPU fallback announced earlier in the notebook can still train.
    fp16=torch.cuda.is_available(),
    # Evaluate at the end of every epoch
    **{_eval_strategy_key: 'epoch'},
)



In [None]:
# Set up the Trainer
# Training uses the tokenized train split; the tokenized test split serves as the evaluation set.
# No compute_metrics function is passed here.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Train the model
# Runs fine-tuning with the configured arguments; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.205982
2,No log,0.111889
3,1.381200,0.108769


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=600, training_loss=1.1660851844151814, metrics={'train_runtime': 14820.8957, 'train_samples_per_second': 0.162, 'train_steps_per_second': 0.04, 'total_flos': 731683749888000.0, 'train_loss': 1.1660851844151814, 'epoch': 3.0})

In [None]:
# Evaluate the model on validation set
# Evaluates on the eval_dataset given to the trainer (the tokenized test split) and
# prints the returned metrics dict.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.10876873135566711, 'eval_runtime': 328.0626, 'eval_samples_per_second': 0.61, 'eval_steps_per_second': 0.152, 'epoch': 3.0}


In [None]:
# Saving model
# Model and tokenizer are written to the same local folder so it can be reloaded as one checkpoint.
directory = "Email_Subject_Generate"
trainer.save_model(directory)

# Saving model tokenizer
# The returned tuple of written file paths is shown as the cell result.
tokenizer.save_pretrained(directory)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('Email_Subject_Generate/tokenizer_config.json',
 'Email_Subject_Generate/special_tokens_map.json',
 'Email_Subject_Generate/vocab.json',
 'Email_Subject_Generate/merges.txt',
 'Email_Subject_Generate/added_tokens.json')

In [None]:
from huggingface_hub import notebook_login

# Shows the interactive Hugging Face login widget so that later Hub uploads can authenticate.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# NOTE(review): HfApi, HfFolder and Repository are imported but not used in this cell.
from huggingface_hub import HfApi, HfFolder, Repository

# Target repository on the Hugging Face Hub
model_name = "dasunFdo25/bart_email_subject_trained"

# Reload the fine-tuned checkpoint from the local save folder; this rebinds the
# global `model` and `tokenizer`.
model = AutoModelForSeq2SeqLM.from_pretrained("Email_Subject_Generate")
tokenizer = AutoTokenizer.from_pretrained("Email_Subject_Generate")

# Upload the weights and the tokenizer files to the Hub repository
model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dasunFdo25/bart_email_subject_trained/commit/fb4d944d3bda5ce815ebc86bd2336b1bb93d222d', commit_message='Upload tokenizer', commit_description='', oid='fb4d944d3bda5ce815ebc86bd2336b1bb93d222d', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Hub repository that holds the fine-tuned checkpoint
model_name = "dasunFdo25/bart_email_subject_trained"

# Load the model and tokenizer from the Hugging Face Hub
# (rebinds the global `model` and `tokenizer` that generate_subject reads)
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Function to generate an email subject based on the email body
def generate_subject(abstract, max_input_length=512, max_subject_length=128, num_beams=4):
    """
    Generate a subject line for ``abstract`` with the global ``model`` and ``tokenizer``.

    Parameters
    ----------
    abstract : str
        Text to summarise into a subject.
    max_input_length : int, default 512
        Inputs longer than this many tokens are truncated.
    max_subject_length : int, default 128
        Upper bound on the length of the generated sequence.
    num_beams : int, default 4
        Beam width used for beam search.

    Returns
    -------
    str
        The decoded first generated sequence with special tokens removed.
    """
    # A single text needs no padding; skipping it avoids encoding 512 positions
    # for a short input.
    inputs = tokenizer(abstract, return_tensors="pt", truncation=True, max_length=max_input_length)

    # Move the tensors to the device the model lives on so generation also
    # works after the model has been moved to a GPU.
    target_device = model.device
    input_ids = inputs["input_ids"].to(target_device)
    attention_mask = inputs["attention_mask"].to(target_device)

    # Pass the attention mask explicitly instead of leaving generate() to infer it.
    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_subject_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [None]:
# Example
# Generate a subject line for a sample email body and print it.
abstract = "I hope this email finds you well. I wanted to update you on our project’s progress. We are on track to meet our deadlines. Please review the attached document, and let me know if you have any questions. Looking forward to your feedback."
subject = generate_subject(abstract)
print("Generated Subject:", subject)

Generated Subject: Evaluating the Progress of Our Project
