In [1]:
import tqdm as notebook_tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch
import pandas as pd
import numpy as np
import re


  from .autonotebook import tqdm as notebook_tqdm
2025-03-01 19:22:32.235027: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-01 19:22:32.328862: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-01 19:22:32.382193: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-01 19:22:32.394732: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-01 19:22:32.4

In [2]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token  

gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

In [3]:
def preprocess_text(text):
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'([,.!?;:])', r' \1 ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [4]:
df = pd.read_csv("hingconvoupdated.csv")

# Ensure text is in string format
df["Conversation"] = df["Conversation"].astype(str).apply(preprocess_text)

# Convert dataset into a list of text sequences
hinglish_texts = df["Conversation"].tolist()

def tokenize_function(examples):
    return gpt2_tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

dataset = Dataset.from_dict({"text": hinglish_texts})

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 9999/9999 [00:01<00:00, 5975.45 examples/s]


In [5]:
# Add a padding token for GPT-2
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

# Define Data Collator for Language Modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=gpt2_tokenizer,
    mlm=False  # We do not use masked language modeling for GPT-2
)

training_args = TrainingArguments(
    output_dir="./gpt2_hinglish_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,        # Increased epochs for better convergence
    save_total_limit=2,
    report_to="none",
    fp16=True
)

trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Using same dataset for evaluation (change if needed)
    tokenizer=gpt2_tokenizer,
    data_collator=data_collator
)

# 🚀 Start fine-tuning GPT-2
trainer.train()

gpt2_model.save_pretrained("gpt2_hinglish_model")
gpt2_tokenizer.save_pretrained("gpt2_hinglish_model")



  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.3051,2.748601


KeyboardInterrupt: 

## Predict first word

In [21]:
def clean_generated_text(text):
    text = re.sub(r'^[^\w]+|[^\w]+$', '', text)
    return text.strip()

In [53]:
def predict_first_complete_word(input_text, max_new_tokens=10, seed=42):
    # Optionally set a fixed seed for reproducibility
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Tokenize input and prepare attention mask
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors="pt")
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
    device = gpt2_model.device
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Generate additional tokens
    output = gpt2_model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        temperature=0.7,
        num_return_sequences=1
    )
    

    # Decode generated text without stripping the leading space
    generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
    print("this is a sample generation", generated_text)
    # Remove the input text from the beginning of the generated string
    continuation = generated_text[len(input_text):]

    # If the generated continuation starts with a space, it indicates a new word.
    # Otherwise, it’s a continuation of the current (incomplete) word.
    if continuation and continuation[0] != " ":
        # Extract the last word fragment from the input (if any)
        last_fragment = input_text.split()[-1] if input_text.strip() != "" else ""
        # Concatenate the last fragment with the generated continuation
        combined = last_fragment + continuation
        # Extract the complete word (until the first whitespace)
        match = re.match(r'^(\S+)', combined)
        complete_word = match.group(1) if match else combined
    else:
        # If continuation starts with a space, extract the first word normally
        match = re.match(r'^\s*(\S+)', continuation)
        complete_word = match.group(1) if match else continuation.strip()

    # Clean any stray punctuation at the boundaries
    complete_word = re.sub(r'^[^\w]+|[^\w]+$', '', complete_word)
    return complete_word


In [None]:
input_text = input("Enter a Hinglish phrase: ")
first_word = predict_first_complete_word(input_text)
print(f"First complete word: {first_word}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


this is a sample generation haan , you're crazy ! But I really enjoy watching
First complete word: 


: 

## predict nest token

In [None]:
def predict_next_gpt2_finetuned(input_text, top_n=3):
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors="pt")

    # Create an attention mask
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

    # Move tensors to the same device as the model
    device = gpt2_model.device
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    output = gpt2_model.generate(
        input_ids, 
        attention_mask=attention_mask,  # Fix: Include attention mask
        max_length=len(input_ids[0]) + 1, 
        num_return_sequences=1, 
        do_sample=True, 
        top_k=50
    )

    predicted_ids = output[:, -1]  # Get last token
    predicted_words = [gpt2_tokenizer.decode([idx]).strip() for idx in predicted_ids]

    return predicted_words[:top_n]



In [15]:
input_text = input("Enter a Hinglish phrase: ")
predicted_words = predict_next_gpt2_finetuned(input_text)
print(f"Predicted next words using Fine-Tuned GPT-2: {predicted_words}")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicted next words using Fine-Tuned GPT-2: ['h']


# something old

In [3]:
from tokenizers import ByteLevelBPETokenizer

# Train a new tokenizer
hinglish_tokenizer = ByteLevelBPETokenizer()
hinglish_tokenizer.train(files=["hingconvoupdated.csv"], vocab_size=52000, min_frequency=2)

# Save tokenizer
hinglish_tokenizer.save_model("hinglish_tokenizer")

gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.add_special_tokens({"pad_token": "[PAD]"})  # Add padding token

data_collator = DataCollatorForLanguageModeling(
    tokenizer=gpt2_tokenizer,
    mlm=False  # We do not use masked language modeling for GPT-2
)






In [4]:
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

In [5]:
# prep dataset for finetuning
df = pd.read_csv("hingconvoupdated.csv")

# Ensure text is in string format
df["Conversation"] = df["Conversation"].astype(str)

# Convert dataset into a list of text sequences
hinglish_texts = df["Conversation"].tolist()

# Define a function to tokenize text
def tokenize_function(examples):
    return gpt2_tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# Convert data into Hugging Face Dataset format
dataset = Dataset.from_dict({"text": hinglish_texts})
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])


Map: 100%|██████████| 9999/9999 [00:01<00:00, 6072.15 examples/s]


In [7]:
training_args = TrainingArguments(
    output_dir="./gpt2_hinglish_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,  
    save_total_limit=2,
    report_to="none",
    fp16=True
)

trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Using same dataset for evaluation
    tokenizer=gpt2_tokenizer,
    data_collator=data_collator
)

trainer.train()

# 🚀 Step 9: Save the Fine-Tuned GPT-2 Model
gpt2_model.save_pretrained("gpt2_hinglish_model")
gpt2_tokenizer.save_pretrained("gpt2_hinglish_model")


  trainer = Trainer(


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import re

def predict_next_gpt2(input_text, top_k=40, top_p=0.85, temperature=0.6):
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors="pt")

    output = gpt2_model.generate(
        input_ids,
        max_length=len(input_ids[0]) + 1,
        num_return_sequences=1,
        do_sample=True,
        top_k=top_k,  
        top_p=top_p,  
        temperature=temperature,  
    )

    predicted_ids = output[:, -1]  # Get last token
    predicted_words = [gpt2_tokenizer.decode([idx]).strip() for idx in predicted_ids]

    # 🔹 Fix: Remove leading commas, periods, and extra spaces
    cleaned_words = [re.sub(r"^[^\w]+", "", word) for word in predicted_words]

    return cleaned_words[:3]  # Return top 3 predicted words

# 🚀 Step 11: Test Fine-Tuned GPT-2 on Hinglish Input
input_text = input("Enter a Hinglish phrase: ")
predicted_words = predict_next_gpt2(input_text)
print(f"Predicted next words using Fine-Tuned GPT-2: {predicted_words}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicted next words using Fine-Tuned GPT-2: [',']


In [2]:
import tqdm as notebook_tqdm
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch
import pandas as pd
import re

# Tokenizer Training (Crucial for Hinglish)
from tokenizers import ByteLevelBPETokenizer

# 1. Train a new tokenizer with a larger vocab size and enough occurrences
hinglish_tokenizer = ByteLevelBPETokenizer()
hinglish_tokenizer.train(files=["hingconvoupdated.csv"], vocab_size=52000, min_frequency=5)  # Increased min_frequency

# 2. Save the tokenizer
hinglish_tokenizer.save_model("hinglish_tokenizer")

# Load the trained tokenizer using GPT2TokenizerFast
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("hinglish_tokenizer")

# Add special tokens if they're not already present. Be cautious about re-adding.
if gpt2_tokenizer.pad_token is None:
    gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]' })


# Data Preparation
df = pd.read_csv("hingconvoupdated.csv")
df["Conversation"] = df["Conversation"].astype(str).apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x)) #Clean non-ASCII chars






In [3]:
# Model Loading and Configuration. Use 'facebook/bart-base' for smaller size
# gpt2_model = GPT2LMHeadModel.from_pretrained("facebook/bart-base")   
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")   
gpt2_model.resize_token_embeddings(len(gpt2_tokenizer)) # Resize embeddings to match tokenizer

Embedding(6661, 768)

In [None]:

def tokenize_function(examples):
    return gpt2_tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

dataset = Dataset.from_dict({"text": df["Conversation"].tolist()})
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


# Training Arguments: Adjust based on your resources
training_args = TrainingArguments(
    output_dir="./gpt2_hinglish_finetuned",
    evaluation_strategy = "steps",
    eval_steps = 500, # Evaluation every 500 steps
    logging_steps = 500,
    save_steps = 500,
    per_device_train_batch_size=16, # Reduce batch size if facing CUDA out of memory
    per_device_eval_batch_size=16,
    num_train_epochs=3,  # Start with fewer epochs
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
    weight_decay=0.01,       # Add weight decay for regularization
    warmup_ratio=0.06,
    fp16=True,               # Enable mixed precision if your GPU supports it      
    save_total_limit=2,

)

# Trainer
trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False),
)

trainer.train()
trainer.save_model()

# Prediction
def predict_next_gpt2(input_text, top_k=5, temperature=0.7): # reduced top_k
  input_ids = gpt2_tokenizer.encode(input_text, return_tensors="pt").to(trainer.model.device)
  output = trainer.model.generate(input_ids, max_length=len(input_ids[0]) + 30,  num_return_sequences=1, top_k=top_k, temperature=temperature) # Generate more tokens
  predicted_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
  return predicted_text


input_text = "kya haal hai?"
print("Input:", input_text)
generated_text = predict_next_gpt2(input_text)
print("Generated:", generated_text)

Map: 100%|██████████| 9999/9999 [00:00<00:00, 31140.97 examples/s]
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input: kya haal hai?
Generated: kya haal hai? I can toh, mujhe bhi, I'm. Tum bhi hai, I'm a toh hai. I'm, I'm you. I'm to


In [8]:
def predict_next_gpt2(input_text, top_k=5, temperature=0.7): # reduced top_k
  input_ids = gpt2_tokenizer.encode(input_text, return_tensors="pt").to(trainer.model.device)
  output = trainer.model.generate(input_ids, max_length=len(input_ids[0]) + 30,  num_return_sequences=1, top_k=top_k, temperature=temperature) # Generate more tokens
  predicted_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
  return predicted_text


input_text = "theek"
print("Input:", input_text)
generated_text = predict_next_gpt2(input_text)
print("Generated:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: theek
Generated: theek, toh. I can toh bhi toh bhi, I can bhi bahut hai. I can you, mujhe bhi na. I can, I can,


In [None]:
def predict_next_3_words(input_text, max_new_tokens=7, top_k=5, temperature=0.7):
    # Tokenize the input text and move the tensor to the model's device.
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors="pt").to(trainer.model.device)
    
    # Generate new tokens.
    output = trainer.model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        do_sample=True,
        top_k=top_k,
        temperature=temperature,
        pad_token_id=gpt2_tokenizer.eos_token_id  # Set pad token to avoid warnings.
    )
    
    # Decode the full generated text.
    full_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Remove the original input text from the generated output if it's present.
    if full_text.startswith(input_text):
        generated_part = full_text[len(input_text):].strip()
    else:
        generated_part = full_text.strip()
    
    # Split the generated text into words and extract the first three.
    words = generated_part.split()
    next_three_words = " ".join(words[:3])
    
    return next_three_words

# Example usage:
input_text = "mai"
print("Input:", input_text)
print("Next 3 words:", predict_next_3_words(input_text))


Input: mai
Next 3 words: , I can


: 