In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Collecting torch
  Using cached torch-2.7.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting torchvision
  Using cached torchvision-0.22.1-cp311-cp311-win_amd64.whl.metadata (6.1 kB)
Collecting torchaudio
  Using cached torchaudio-2.7.1-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Using cached torch-2.7.1-cp311-cp311-win_amd64.whl (216.1 MB)
Using cached torchvision-0.22.1-cp311-cp311-win_amd64.whl (1.7 MB)
Using cached torchaudio-2.7.1-cp311-cp311-win_amd64.whl (2.5 MB)
Using cached sympy-1.14.0-py3-none-any.whl (6.3 MB)
Using cached networkx-3.5-py3-none-any.whl (2.0 MB)
Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Installing collected packages: mpmath, sympy, n



In [2]:
pip install datasets evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=7e5e98ef3a7d1caf9d4913c3638335f525e5d3ee88f77e483bdf7c8e3b

In [3]:
import pandas as pd
import numpy as np
import torch
import re, unicodedata
from datasets import Dataset
from transformers import (
  AutoTokenizer,
  AutoModelForSeq2SeqLM,
  Seq2SeqTrainingArguments,
  Seq2SeqTrainer,
  DataCollatorForSeq2Seq
)
from rouge_score import rouge_scorer


df_train = pd.read_csv('/kaggle/input/ilsum-hindi/hindi_ilsum_2024_train.csv')
df_val = pd.read_csv('/kaggle/input/ilsum-hindi/hindi_ilsum_2024_val.csv')


# Text preprocessing
def normalize_text(text):
  text = str(text).strip()
  text = unicodedata.normalize("NFKC", text)
  text = re.sub(r'(.)\1{2,}', r'\1\1', text)
  return text

def clean_mixed_hindi_english(text):
  text = normalize_text(text)
  text = re.sub(r"http\S+|www\S+", "[URL]", text)
  text = re.sub(r"@\w+", "[USER]", text)
  text = re.sub(r"[^\w\s\u0900-\u097F]", " ", text)
  text = re.sub(r"\s+", " ", text).strip()
  return text

df_train["Article"] = df_train["Article"].apply(clean_mixed_hindi_english)
df_train["Summary"] = df_train["Summary"].apply(clean_mixed_hindi_english)
df_val["Article"] = df_val["Article"].apply(clean_mixed_hindi_english)
df_val["Summary"] = df_val["Summary"].apply(clean_mixed_hindi_english)

train_dataset = Dataset.from_pandas(df_train[['Article', 'Summary']])
val_dataset = Dataset.from_pandas(df_val[['Article', 'Summary']])


# Initializing model
model_name = "ai4bharat/IndicBART"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


# Preprocess function for tokenization

def preprocess_function(examples):
  inputs = tokenizer(
    examples["Article"],
    max_length=512,
    truncation=True,
    padding="max_length"
  )

  labels = tokenizer(
    examples["Summary"],
    max_length=128,
    truncation=True,
    padding="max_length"
  )

  labels["input_ids"] = [
    [(token if token != tokenizer.pad_token_id else -100) for token in label]
    for label in labels["input_ids"]
  ]

  inputs["labels"] = labels["input_ids"]
  return inputs


# Apply preprocessing
tokenized_train = train_dataset.map(
  preprocess_function,
  batched=True,
  remove_columns=train_dataset.column_names
)
tokenized_val = val_dataset.map(
  preprocess_function,
  batched=True,
  remove_columns=val_dataset.column_names
)


# Data collator
data_collator = DataCollatorForSeq2Seq(
  tokenizer=tokenizer,
  model=model,
  padding=True
)


# Helper function for computing ROUGE metrics
def compute_metrics(pred):
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

  predictions = tokenizer.batch_decode(
      pred.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
  )

  # Replace -100 in the labels as we can't decode them
  labels = np.where(pred.label_ids != -100, pred.label_ids, tokenizer.pad_token_id)
  references = tokenizer.batch_decode(
      labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
  )

  # Compute ROUGE scores
  rouge_scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
  for pred, ref in zip(predictions, references):
    scores = scorer.score(ref, pred)
    rouge_scores['rouge1'] += scores['rouge1'].fmeasure
    rouge_scores['rouge2'] += scores['rouge2'].fmeasure
    rouge_scores['rougeL'] += scores['rougeL'].fmeasure

  # Average scores
  num_samples = len(predictions)
  for key in rouge_scores:
      rouge_scores[key] /= num_samples

  return rouge_scores



# Training arguments
training_args = Seq2SeqTrainingArguments(
  output_dir="/kaggle/working/",
  eval_strategy="epoch",
  learning_rate=3e-5,
  per_device_train_batch_size=8,
  per_device_eval_batch_size=8,
  weight_decay=0.01,
  save_total_limit=3,
  num_train_epochs=6,
  predict_with_generate=True,
  report_to="tensorboard",
  save_strategy="epoch",
  load_best_model_at_end=True,
  metric_for_best_model="rouge1",
)



# Initialize the trainer
trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_train,
  eval_dataset=tokenized_val,
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics
)



# Training
print("Starting training...")
trainer.train()
print("Training completed!")


# Save model and tokenizer
model.save_pretrained("/kaggle/working/indicbart-summarizer")
tokenizer.save_pretrained("/kaggle/working/indicbart-summarizer")



# Inference function for using the model
def generate_summary(article_text, model, tokenizer, max_length=128):
  inputs = tokenizer(
    article_text,
    max_length=1024,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
  )

  # Move to GPU if available
  if torch.cuda.is_available():
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    model = model.to("cuda")

  # Generate summary
  summary_ids = model.generate(
    inputs["input_ids"],
    num_beams=4,
    min_length=30,
    max_length=max_length,
    early_stopping=True,
    no_repeat_ngram_size=3
  )

  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
  return summary


# Example usage
sample_article = df_val.iloc[0]["Article"]
model_path = "/kaggle/working/indicbart-summarizer"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
summary = generate_summary(sample_article, model, tokenizer)
print(f"Generated Summary: {summary}")


2025-07-05 10:36:29.546593: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751711789.760141      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751711789.828241      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/976M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/976M [00:00<?, ?B/s]

Map:   0%|          | 0/10427 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Starting training...




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,2.2653,1.875048,0.501344,0.200758,0.499495
2,1.9248,1.724087,0.505506,0.205547,0.503884
3,1.7745,1.660347,0.505461,0.206373,0.503548
4,1.6562,1.587062,0.503277,0.208438,0.501479
5,1.617,1.569547,0.503678,0.207103,0.501714
6,1.5832,1.55278,0.503572,0.207904,0.501837


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Training completed!
Generated Summary: MeToo कैंपेन के जरिए कई महिलाएं वर्किंग प्लेस में अपने साथ हुई यौन उत्पीड़न की घटनाओं को सामने ला चुकी हैं जिनमें कई बड़े नाम शामिल हैं लेकिन वह हंसने के लिए लायक बिल्कुल नहीं था इसके आगे वह बताती हैं कि जब उन्होंने सैलरी के तौर पर 500 रुपये मांगे तो दुआ ने उनसे कहा था तुम्हारी औकात है क्या निष्ठा जैन कहती हैं कि उस दिन उनका जन्मदिन था लेकिन इस अनुभव की वजह से उनका पूरा दिन खराब हो गया अपनी पोस्ट में दूसरी घटना का जिक्र करते हुए निष्ठा जैन ने लिखा मुझे दूसरे ऑफिस में वीडियो एडिटर की नौकरी मिल गई विनोद दुआ के दोस्त वहां काम करते थे इसलिए उन्हें इस


In [1]:
# Inference function for using the model
import pandas as pd
import torch
from transformers import (
  AutoTokenizer,
  AutoModelForSeq2SeqLM,
  Seq2SeqTrainingArguments,
  Seq2SeqTrainer,
  DataCollatorForSeq2Seq
)

df_val = pd.read_csv("hindi_ilsum_2024_val.csv")
def generate_summary(article_text, model, tokenizer, max_length=128):
  inputs = tokenizer(
    article_text,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
  )

  # Move to GPU if available
  if torch.cuda.is_available():
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    model = model.to("cuda")

  # Generate summary
  summary_ids = model.generate(
    inputs["input_ids"],
    num_beams=4,
    min_length=30,
    max_length=max_length,
    early_stopping=True,
    no_repeat_ngram_size=3
  )

  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
  return summary


# Example usage
sample_article = df_val.iloc[0]["Article"]
model_path = "indicbart_summarizer"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
summary = generate_summary(sample_article, model, tokenizer)
print(f"Generated Summary: {summary}")

RuntimeError: Failed to import transformers.trainer_seq2seq because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
Traceback (most recent call last):
  File "C:\Users\adars\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.