In [None]:
!pip install jsonlines transformers
!pip install jsonlines
!pip install jsonlines transformers datasets
!pip install datasets
from transformers import GenerationConfig




In [3]:
!pip install transformers[torch] -U
!pip install accelerate -U




In [4]:
# Import required libraries
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments
import jsonlines

# Check if GPU is available
print("GPU Available: ", torch.cuda.is_available())
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Path to your JSONL files in Google Drive
train_file_path = '/data/hindi_train.jsonl'
test_file_path = '/data/hindi_test.jsonl'
val_file_path = '/data/hindi_val.jsonl'

# Reading the JSONL files
with jsonlines.open(train_file_path) as reader:
    train_data = [obj for obj in reader]

with jsonlines.open(test_file_path) as reader:
    test_data = [obj for obj in reader]

with jsonlines.open(val_file_path) as reader:
    val_data = [obj for obj in reader]

# Convert the lists to datasets
train_dataset = Dataset.from_dict({'text': [item['text'] for item in train_data], 'summary': [item['summary'] for item in train_data]})
test_dataset = Dataset.from_dict({'text': [item['text'] for item in test_data], 'summary': [item['summary'] for item in test_data]})
val_dataset = Dataset.from_dict({'text': [item['text'] for item in val_data], 'summary': [item['summary'] for item in val_data]})


Mounted at /content/drive
GPU Available:  False


In [4]:
print(train_dataset[0].keys())


dict_keys(['text', 'summary'])


In [5]:
small_train_dataset = train_dataset.select(range(7000))
small_test_dataset = test_dataset.select(range(1000))
small_val_dataset = val_dataset.select(range(1000))

In [6]:
# Load IndicBART model and tokenizer
model_name = "ai4bharat/IndicBART-XLSUM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/976M [00:00<?, ?B/s]

In [7]:
# Preprocess function
def preprocess_function(batch):
    inputs = batch['text']
    targets = batch['summary']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=150, truncation=True, padding="max_length")
    model_inputs['labels'] = labels['input_ids']
    return model_inputs


In [8]:
# Apply preprocessing
train_dataset = small_train_dataset.map(preprocess_function, batched=True, batch_size=500)
test_dataset = small_test_dataset.map(preprocess_function, batched=True, batch_size=500)
val_dataset = small_val_dataset.map(preprocess_function, batched=True, batch_size=500)


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
print(train_dataset[0])

{'text': "नेपाल में इस दौरे की भरपूर तैयारियां चल रही हैं. इसकी ख़ास वजह भी है क्योंकि 23 साल बाद कोई चीनी राष्ट्रपति नेपाल पहुंच रहा है. नेपाल के प्रधानमंत्री केपी शर्मा ओली के विदेश मामलों के सलाहकार डॉ. राजन भट्टाराई ने इस दौरे को ऐतिहासिक बताया है. उन्होंने बताया कि चीन के राष्ट्रपति के साथ उनका एक प्रतिनिधि दल भी होगा. दोनों देशों के बीच कई समझौते होने की बात भी कही जा रही है. चीन के राष्ट्रपति और नेपाल के प्रधानमंत्री के बीच आधिकारिक बैठक भी तय हुई है. समाप्त नेपाल जाने से पहले चीन के राष्ट्रपति भारत में प्रधानमंत्री नरेंद्र मोदी के साथ अनौपचारिक मुलाक़ात करेंगे. इससे पहले शी जिनपिंग पाकिस्तान के प्रधानमंत्री इमरान ख़ान के साथ भी बैठक कर चुके हैं. चीन का प्रभाव दक्षिण एशिया में लगातार बढ़ रहा है. वो चाहे नेपाल, श्रीलंका, पाकिस्तान या बांग्लादेश हो. हर जगह चीन की मौजूदगी बढ़ी है. ये सभी देश चीन की वन बेल्ट वन रोड परियोजना में शामिल हो गए हैं. दूसरी तरफ़ भारत इस परियोजना के पक्ष में नहीं है. नेपाल की राष्ट्रपति विद्या देवी भंडारी और चीन के राष्ट्रपति शी जिनपिंग नेपाल की तरफ़ बढ़ता 

In [11]:
# Remove columns other than input_ids and labels
train_dataset = train_dataset.remove_columns(['text', 'summary'])
test_dataset = test_dataset.remove_columns(['text', 'summary'])
val_dataset = val_dataset.remove_columns(['text', 'summary'])


In [11]:
print(train_dataset[0])

{'input_ids': [2, 29627, 15, 132, 24019, 19, 18589, 4422, 2658, 3363, 506, 69, 5, 5448, 51962, 3990, 83, 17, 3360, 1998, 941, 323, 779, 20771, 8938, 29627, 5326, 453, 17, 5, 29627, 12, 3636, 12, 1752, 3680, 2412, 284, 12, 11302, 16412, 12, 46635, 2632, 5, 32708, 33813, 3564, 1908, 45, 132, 24019, 29, 18777, 1073, 17, 5, 570, 1073, 65, 5121, 12, 8938, 12, 288, 3562, 68, 22761, 4935, 83, 1448, 5, 2019, 9311, 12, 1835, 1088, 39233, 725, 19, 873, 83, 15102, 315, 506, 17, 5, 5121, 12, 8938, 43, 29627, 12, 3636, 12, 1835, 34892, 2777, 83, 9061, 1117, 17, 5, 14417, 29627, 1325, 34, 710, 5121, 12, 8938, 579, 15, 3636, 4744, 1247, 12, 288, 5509, 2139, 371, 272, 21891, 19112, 10129, 212, 5093, 5, 2520, 710, 8, 1182, 9047, 37228, 2467, 12, 3636, 34574, 11305, 4650, 12, 288, 83, 2777, 123, 6548, 69, 5, 5121, 37, 8979, 6105, 32073, 15, 7144, 5087, 453, 17, 5, 1864, 15566, 29627, 6, 22926, 6, 2467, 76, 26337, 164, 5, 1424, 5224, 5121, 19, 52536, 41931, 17, 5, 660, 1322, 720, 5121, 19, 6831, 2086, 17

In [12]:
# Combine datasets into a DatasetDict
datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': val_dataset
})

In [13]:
for key in datasets:
        print(f"Dataset Split: {key}")
        print(datasets[key])
        print("\n")


Dataset Split: train
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 7000
})


Dataset Split: test
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})


Dataset Split: validation
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})




In [14]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Reduced batch size to avoid memory issues
    per_device_eval_batch_size=1,  # Reduced batch size to avoid memory issues
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    gradient_accumulation_steps=4,  # Accumulate gradients
    fp16=False,  # Enable mixed precision training
)



In [15]:
# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    tokenizer=tokenizer
)

# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.1688,0.649425


Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}


Epoch,Training Loss,Validation Loss
1,1.1688,0.649425
2,1.0385,0.615485
3,1.0192,0.606554


Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}


TrainOutput(global_step=5250, training_loss=1.6676777808779761, metrics={'train_runtime': 4835.2949, 'train_samples_per_second': 4.343, 'train_steps_per_second': 1.086, 'total_flos': 1.1377695522816e+16, 'train_loss': 1.6676777808779761, 'epoch': 3.0})

In [16]:
# Evaluate the model
evaluation_results = trainer.evaluate(eval_dataset=datasets['test'])
print(evaluation_results)


{'eval_loss': 0.5919754505157471, 'eval_runtime': 60.947, 'eval_samples_per_second': 16.408, 'eval_steps_per_second': 16.408, 'epoch': 3.0}


In [17]:
model_save_path = '/indicbart_results'


In [18]:
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Save the generation configuration
gen_config = GenerationConfig.from_model_config(model.config)
gen_config.save_pretrained(model_save_path)

print("Model, tokenizer, and generation config saved to Google Drive!")

Non-default generation parameters: {'forced_eos_token_id': 2}


Model, tokenizer, and generation config saved to Google Drive!
