In [1]:
import tqdm as notebook_tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch
import pandas as pd
import numpy as np
import re

  from .autonotebook import tqdm as notebook_tqdm
2025-03-24 21:27:28.366470: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-24 21:27:28.374604: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-24 21:27:28.383965: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-24 21:27:28.386828: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-24 21:27:28.3

In [2]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
def tokenize_function(examples):
    return gpt2_tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    
data_collator = DataCollatorForLanguageModeling(
    tokenizer=gpt2_tokenizer,
    mlm=False  
)

training_args = TrainingArguments(
    output_dir="./gpt2_hinglish_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    save_total_limit=2,
    report_to="none",
    fp16=True
)




In [5]:
chunk_size = 10000
chunk_iterator = pd.read_csv("hingconvo.csv", chunksize=chunk_size)

In [6]:
for chunk_idx, chunk in enumerate(chunk_iterator):
    print(f"\n🚀 Starting training on chunk {chunk_idx + 1}")

    chunk["Conversation"] = chunk["Conversation"].astype(str).apply(preprocess_text)
    chunk_texts = chunk["Conversation"].tolist()

    chunk_dataset = Dataset.from_dict({"text": chunk_texts})
    tokenized_chunk = chunk_dataset.map(tokenize_function, batched=True)
    tokenized_chunk.set_format("torch", columns=["input_ids", "attention_mask"])

    trainer = Trainer(
        model=gpt2_model,
        args=training_args,
        train_dataset=tokenized_chunk,
        eval_dataset=tokenized_chunk,
        tokenizer=gpt2_tokenizer,
        data_collator=data_collator
    )

    trainer.train()

    gpt2_model.save_pretrained(f"temp/gpt2_hinglish_model_chunk_{chunk_idx + 1}")
    gpt2_tokenizer.save_pretrained(f"temp/gpt2_hinglish_model_chunk_{chunk_idx + 1}")

print("\n✅ Finished training on all chunks. Saving final model...")
gpt2_model.save_pretrained("gpt2_hinglish_model_final")
gpt2_tokenizer.save_pretrained("gpt2_hinglish_model_final")



🚀 Starting training on chunk 1


Map: 100%|██████████| 10000/10000 [00:01<00:00, 6336.02 examples/s]
  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.2949,2.763762
2,2.7624,2.529295
3,2.6038,2.454994



🚀 Starting training on chunk 2


Map: 100%|██████████| 10000/10000 [00:01<00:00, 5464.58 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.7552,2.473769
2,2.5174,2.315716
3,2.3989,2.255392



🚀 Starting training on chunk 3


Map: 100%|██████████| 10000/10000 [00:01<00:00, 5093.74 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.6075,2.364974
2,2.417,2.230248
3,2.3132,2.181392



🚀 Starting training on chunk 4


Map: 100%|██████████| 4884/4884 [00:01<00:00, 4650.92 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.5196,2.284542
2,2.352,2.162186
3,2.2484,2.118873



✅ Finished training on all chunks. Saving final model...


('gpt2_hinglish_model_final/tokenizer_config.json',
 'gpt2_hinglish_model_final/special_tokens_map.json',
 'gpt2_hinglish_model_final/vocab.json',
 'gpt2_hinglish_model_final/merges.txt',
 'gpt2_hinglish_model_final/added_tokens.json')

## Predict first word

In [7]:
def clean_generated_text(text):
    text = re.sub(r'^[^\w]+|[^\w]+$', '', text)
    return text.strip()

In [None]:
def predict_first_complete_word(input_text, max_new_tokens=10, seed=42):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    input_ids = gpt2_tokenizer.encode(input_text, return_tensors="pt").to(device)

    output = gpt2_model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        temperature=0.7,
        num_return_sequences=1
    )

    generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
    print("The text which has been generated from the gpt2 tokens :->  ", generated_text)

    continuation = generated_text[len(input_text):]

    if input_text and not input_text.endswith(' ') and continuation:
        prefix = input_text.split()[-1]
        combined = prefix + continuation

        first_match = re.match(r'^(\S+)', combined)
        if first_match:
            remaining = combined[len(first_match.group(1)):].lstrip()
        else:
            remaining = combined
    else:
        remaining = continuation.lstrip()

    next_match = re.match(r'^(\w+)', remaining)
    next_word = next_match.group(1) if next_match else ""

    return clean_generated_text(next_word)


In [9]:
input_text = input("Enter a Hinglish phrase: ")
first_word = predict_first_complete_word(input_text)
print("the input text used for generation    :->", input_text)
print(f"First complete word: {first_word}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The text which has been generated from the gpt2 tokens :->   maine bhi bahut khushi ho rahi
the input text used for generation    :-> maine
First complete word: bhi
