In [1]:
import tqdm as notebook_tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch
import pandas as pd
import numpy as np
import re

  from .autonotebook import tqdm as notebook_tqdm
2025-03-24 16:48:54.041524: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-24 16:48:54.139024: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-24 16:48:54.193414: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-24 16:48:54.206759: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-24 16:48:54.2

In [2]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

In [3]:
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)            # Normalize spaces
    return text.strip()

In [4]:
df = pd.read_csv("hingconvoupdated.csv")

# Ensure text is in string format
df["Conversation"] = df["Conversation"].astype(str).apply(preprocess_text)

# Convert dataset into a list of text sequences
hinglish_texts = df["Conversation"].tolist()

dataset = Dataset.from_dict({"text": hinglish_texts})

# Tokenize function
def tokenize_function(examples):
    return gpt2_tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

print(tokenized_dataset[0])

Map: 100%|██████████| 9999/9999 [00:01<00:00, 6783.30 examples/s]

{'text': 'Hey Radhika! Kaisi ho?', 'input_ids': [10814, 5325, 71, 9232, 0, 11611, 23267, 8169, 30, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 




In [9]:
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
# Define Data Collator for Language Modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=gpt2_tokenizer,
    mlm=False  # We do not use masked language modeling for GPT-2
)

training_args = TrainingArguments(
    output_dir="./gpt2_hinglish_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,        # Increased epochs for better convergence
    save_total_limit=2,
    report_to="none",
    fp16=True
)

trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Using same dataset for evaluation (change if needed)
    tokenizer=gpt2_tokenizer,
    data_collator=data_collator
)

# 🚀 Start fine-tuning GPT-2
trainer.train()

gpt2_model.save_pretrained("gpt2_hinglish_model")
gpt2_tokenizer.save_pretrained("gpt2_hinglish_model")



  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.2858,2.74576
2,2.7337,2.463664
3,2.5325,2.303988
4,2.4126,2.216388
5,2.3396,2.182438


('gpt2_hinglish_model/tokenizer_config.json',
 'gpt2_hinglish_model/special_tokens_map.json',
 'gpt2_hinglish_model/vocab.json',
 'gpt2_hinglish_model/merges.txt',
 'gpt2_hinglish_model/added_tokens.json')

## Predict first word

In [11]:
def clean_generated_text(text):
    text = re.sub(r'^[^\w]+|[^\w]+$', '', text)
    return text.strip()

In [30]:
def predict_first_complete_word(input_text, max_new_tokens=10, seed=42):
    # Set seed for reproducibility
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Tokenize input
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate
    output = gpt2_model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        temperature=0.7,
        num_return_sequences=1
    )

    # Decode full text
    generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
    print("The text which has been generated from the gpt2 tokens :->  ", generated_text)

    # Get only the new continuation
    continuation = generated_text[len(input_text):]

    # If input doesn't end with space, GPT-2 likely continued a word
    if input_text and not input_text.endswith(' ') and continuation:
        # Attach to last fragment
        prefix = input_text.split()[-1]
        combined = prefix + continuation

        # Find where the prefix ends (first word)
        first_match = re.match(r'^(\S+)', combined)
        if first_match:
            remaining = combined[len(first_match.group(1)):].lstrip()
        else:
            remaining = combined
    else:
        remaining = continuation.lstrip()

    # Now extract the next complete word
    next_match = re.match(r'^(\w+)', remaining)
    next_word = next_match.group(1) if next_match else ""

    return clean_generated_text(next_word)


In [40]:
input_text = input("Enter a Hinglish phrase: ")
first_word = predict_first_complete_word(input_text)
print("the input text used for generation    :->", input_text)
print(f"First complete word: {first_word}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The text which has been generated from the gpt2 tokens :->   haan, maine bhi yeh project pe a
the input text used for generation    :-> haan
First complete word: maine
