In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-an

In [2]:
import pandas as pd
from datasets import Dataset

In [3]:
# Load your dataset
df = pd.read_csv('drive/MyDrive/IRWA_Project/Datasets/email_dataset.csv')

In [4]:
df.head()

Unnamed: 0,Abstract,Topic
0,We propose an adversarial training procedure...,Causal Image Generation
1,Multiple automakers have in development or i...,Ramp Merging in Autonomous Driving Systems
2,We propose a probabilistic model for interpr...,Single-Cell RNA Sequencing Analysis
3,We study the problem of generalized uniformi...,Uniformity Testing in Discrete Probability Dis...
4,"Specialized classifiers, namely those dedica...",Ensemble Methods for Specialized Classifiers


In [5]:
# Convert to Hugging face dataset
dataset = Dataset.from_pandas(df)

In [6]:
# Split into train and test data
train_test = dataset.train_test_split(test_size=0.2)
train_dataset = train_test['train']
test_dataset = train_test['test']

In [7]:
from transformers import BartTokenizer

# Load the pre-trained BART tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]



In [9]:
# Tokenization function
def tokenize_function(examples):
    inputs = tokenizer(examples['Abstract'], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples['Topic'], max_length=128, truncation=True, padding="max_length")
    inputs['labels'] = targets['input_ids']
    return inputs

In [10]:
# Tokenize the dataset
train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [11]:
from transformers import BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load the BART model for conditional generation
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [12]:
from transformers import BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load the BART model for conditional generation
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

In [13]:
# Set up the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',                   # output directory for model checkpoints
    evaluation_strategy='epoch',              # evaluate after each epoch
    learning_rate=2e-5,                       # learning rate
    per_device_train_batch_size=4,            # batch size for training
    per_device_eval_batch_size=4,             # batch size for evaluation
    weight_decay=0.01,                        # weight decay
    save_total_limit=3,                       # limit the total number of checkpoints
    num_train_epochs=3,                       # number of epochs
    predict_with_generate=True                # whether to predict using generation (for text tasks)
)



In [15]:
# Set up the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer
)

In [16]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.205982
2,No log,0.111889
3,1.381200,0.108769


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=600, training_loss=1.1660851844151814, metrics={'train_runtime': 14820.8957, 'train_samples_per_second': 0.162, 'train_steps_per_second': 0.04, 'total_flos': 731683749888000.0, 'train_loss': 1.1660851844151814, 'epoch': 3.0})

In [17]:
# Evaluate the model on validation set
results = trainer.evaluate()
print(results)

{'eval_loss': 0.10876873135566711, 'eval_runtime': 328.0626, 'eval_samples_per_second': 0.61, 'eval_steps_per_second': 0.152, 'epoch': 3.0}


In [18]:
# Saving model
directory = "Email_Subject_Generate"
trainer.save_model(directory)

# Saving model tokenizer
tokenizer.save_pretrained(directory)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('Email_Subject_Generate/tokenizer_config.json',
 'Email_Subject_Generate/special_tokens_map.json',
 'Email_Subject_Generate/vocab.json',
 'Email_Subject_Generate/merges.txt',
 'Email_Subject_Generate/added_tokens.json')

In [22]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import HfApi, HfFolder, Repository

model_name = "dasunFdo25/bart_email_subject_trained"

model = AutoModelForSeq2SeqLM.from_pretrained("Email_Subject_Generate")
tokenizer = AutoTokenizer.from_pretrained("Email_Subject_Generate")

model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dasunFdo25/bart_email_subject_trained/commit/fb4d944d3bda5ce815ebc86bd2336b1bb93d222d', commit_message='Upload tokenizer', commit_description='', oid='fb4d944d3bda5ce815ebc86bd2336b1bb93d222d', pr_url=None, pr_revision=None, pr_num=None)

In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Replace 'your_username' and 'your_model_name' with your Hugging Face account and model name
model_name = "dasunFdo25/bart_email_subject_trained"

# Load the model and tokenizer from the Hugging Face Hub
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Function to generate an email subject based on the content
def generate_subject(abstract):
    inputs = tokenizer(abstract, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    output_ids = model.generate(inputs["input_ids"], max_length=128, num_beams=4, early_stopping=True)
    subject = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return subject


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [2]:
# Example usage
abstract = "This email discusses the upcoming project details and the timeline for IRWA model project report submission."
subject = generate_subject(abstract)
print("Generated Subject:", subject)

Generated Subject: IRWA Model Project Report Submission
