In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import torch

# Fine-tuning GPT-2 on the chat transcript

In [3]:
# Base checkpoint to fine-tune; the tokenizer is loaded from the same name.
model_name = "gpt2"  # or "gpt2-medium" for 355M
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer has no dedicated pad token - confirm).
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a language-modelling dataset from a plain-text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size``. Defaults to
            128, the value that used to be hard-coded here, so existing
            two-argument calls behave exactly as before.

    Returns:
        The ``TextDataset`` instance built from ``file_path``.
    """
    # NOTE(review): TextDataset is deprecated upstream in favour of the
    # `datasets` library; kept here to avoid adding a new dependency.
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

In [7]:
# Tokenize chat.txt into training blocks using the load_dataset helper.
train_dataset = load_dataset("chat.txt", tokenizer)
# mlm=False: build causal (next-token) language-modelling batches instead of masked-LM ones.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (276626 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
# Load the pretrained causal-LM weights for the checkpoint named by model_name.
model = AutoModelForCausalLM.from_pretrained(model_name)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [11]:
# Trainer configuration for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./output",  # checkpoints are written here
    overwrite_output_dir=True,  # allow reusing ./output from an earlier run
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_steps=500,  # checkpoint interval, in training steps
    save_total_limit=1,  # cap on how many checkpoints are kept on disk
    logging_steps=100,  # loss logging interval, in training steps
    fp16=False,  # No GPU
)

In [13]:
# Wire the model, the training configuration, the batch collator and the
# tokenized chat dataset together. No evaluation dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [15]:
# Run fine-tuning, then write the model and the tokenizer into the same
# directory so both can later be loaded from "./gpt2-friend-model".
trainer.train()
trainer.save_model("./gpt2-friend-model")
tokenizer.save_pretrained("./gpt2-friend-model")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.9162
200,1.6356
300,1.5635
400,1.4577
500,1.5146
600,1.4032
700,1.3909
800,1.3802
900,1.3733
1000,1.3035


('./gpt2-friend-model\\tokenizer_config.json',
 './gpt2-friend-model\\special_tokens_map.json',
 './gpt2-friend-model\\vocab.json',
 './gpt2-friend-model\\merges.txt',
 './gpt2-friend-model\\added_tokens.json',
 './gpt2-friend-model\\tokenizer.json')

In [41]:
import re
from transformers import pipeline

# Load the fine-tuned model and its tokenizer from the directory they were saved to.
generator = pipeline("text-generation", model="./gpt2-friend-model", tokenizer="./gpt2-friend-model")
prompt = "Me: ki korchis re? \nHer:"

# Sample a continuation of the prompt. The padding id is read from the loaded
# tokenizer instead of hard-coding 50256, so this cell keeps working if the
# base checkpoint (and therefore the vocabulary) is changed.
response = generator(
    prompt,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    pad_token_id=generator.tokenizer.eos_token_id,
)

# The pipeline returns one dict per generated sequence; the text includes the prompt.
generated_text = response[0]['generated_text']

def clean_output(text):
    """Remove WhatsApp export artefacts from a block of text.

    A line is dropped when it either starts with an export timestamp header
    (two-digit/two-digit/four-digit date, a comma, a two-digit:two-digit time
    and " - "), or contains the "<Media omitted>" placeholder in any letter
    case. The case-insensitive match mirrors the filter used when the
    training transcript is prepared.

    Args:
        text: The text to clean; may span several lines.

    Returns:
        The surviving lines joined with newline characters.
    """
    kept = []
    for line in text.splitlines():
        if re.match(r'^\d{2}/\d{2}/\d{4}, \d{2}:\d{2} - ', line):
            continue
        if '<media omitted>' in line.lower():
            continue
        kept.append(line)
    return "\n".join(kept)

# Show the generated conversation with timestamp and media-placeholder lines removed.
print(clean_output(generated_text))


Device set to use cpu


Me: ki korchis re? 
Her: korchis


In [80]:
# from transformers import pipeline

# generator = pipeline('text-generation', model="./gpt2-friend-model", tokenizer="./gpt2-friend-model")

# prompt = "A funny and slightly sarcastic Bengali girl is chatting with her best friend using English letters.\nFriend: AU Sir ki poralo re?\nHer:"
# response = generator(prompt, max_new_tokens=50, do_sample=True, temperature=0.7)
# print(response[0]['generated_text'])


# LSTM from Scratch

In [13]:
# transforming chat.txt dataset


import re

# === CONFIGURATION ===
input_file = "chat.txt"       # your original chat file
output_file = "new_chat.txt"      # the output file name

# === NAME MAPPING ===
# Sender names as they appear in the export -> short labels used in the
# transcript. Senders that are not listed keep their original name.
speaker_map = {
    "Sneha Dutta AEC CSE": "Her",
    "Debkumar Baksi": "Me"
}

# === PATTERNS TO MATCH WHATSAPP LINES ===
# "<date>, <time> - <sender>: <text>"; group 2 is the sender, group 3 the text.
message_pattern = re.compile(r'^(\d{2}/\d{2}/\d{4}), \d{2}:\d{2} - ([^:]+): (.*)')
# Any line that opens with the timestamp header. A line that matches this but
# not message_pattern has no "<sender>: " part and is treated as a system notice.
timestamp_pattern = re.compile(r'^\d{2}/\d{2}/\d{4}, \d{2}:\d{2} - ')


def _format_message(speaker, parts, speaker_map):
    """Render one message as "<label>: <text>", joining its lines with spaces."""
    label = speaker_map.get(speaker, speaker)
    return f"{label}: {' '.join(parts)}"


def _parse_chat(lines, speaker_map):
    """Collapse raw export lines into one "<label>: <text>" string per message.

    Args:
        lines: Iterable of raw lines (an open file works).
        speaker_map: Mapping from sender name to transcript label.

    Returns:
        List of formatted messages in their original order.
    """
    messages = []
    speaker = None
    parts = []

    for raw_line in lines:
        line = raw_line.strip()
        # Blank lines carry no text; appending them used to leave stray
        # spaces inside (and at the end of) the joined message.
        if not line or "<media omitted>" in line.lower():
            continue

        match = message_pattern.match(line)
        if match:
            # A new message starts: emit the one collected so far.
            if speaker is not None:
                messages.append(_format_message(speaker, parts, speaker_map))
            speaker = match.group(2)
            parts = [match.group(3)]
        elif timestamp_pattern.match(line):
            # Timestamped line without a sender: close the open message and
            # drop the notice instead of gluing it onto that message.
            if speaker is not None:
                messages.append(_format_message(speaker, parts, speaker_map))
            speaker = None
            parts = []
        elif speaker is not None:
            # Continuation of a multi-line message.
            parts.append(line)
        # Lines seen while no message is open are ignored.

    # Emit the last message.
    if speaker is not None:
        messages.append(_format_message(speaker, parts, speaker_map))
    return messages


# === PROCESSING ===
with open(input_file, "r", encoding="utf-8") as f:
    processed_lines = _parse_chat(f, speaker_map)

# === SAVE TO FILE ===
with open(output_file, "w", encoding="utf-8") as f:
    f.write("\n".join(processed_lines))

print(f"Chat converted and saved to '{output_file}'")


Chat converted and saved to 'new_chat.txt'


## Character-level LSTM: vocabulary, dataset, model and training

In [23]:
# Read the cleaned transcript in one go.
with open("new_chat.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Vocabulary: every distinct character of the transcript, in sorted order,
# numbered from 0. Both lookup directions are built from the same ordering.
chars = sorted(set(text))
idx2char = dict(enumerate(chars))
char2idx = {ch: idx for idx, ch in idx2char.items()}
vocab_size = len(chars)

# Report the vocabulary size.
print(f"Unique characters: {vocab_size}")


Unique characters: 216


In [19]:
import torch
from torch.utils.data import Dataset, DataLoader

class ChatDataset(Dataset):
    """Sliding-window next-character dataset over a text.

    Item ``i`` is the pair ``(x, y)`` where ``x`` encodes
    ``text[i : i + seq_len]`` and ``y`` encodes the same window shifted one
    character to the right, both as 1-D ``torch.long`` tensors.

    Args:
        text: The training text.
        seq_len: Window length in characters.
        vocab: Optional character -> index mapping. When omitted, the
            module-level ``char2idx`` is used, as before.

    Raises:
        KeyError: If ``text`` contains a character missing from the mapping
            (raised at construction time).
    """

    def __init__(self, text, seq_len=100, vocab=None):
        mapping = char2idx if vocab is None else vocab
        self.data = text
        self.seq_len = seq_len
        # Encode the whole text once; items are then cheap slices instead of
        # a per-character Python loop on every access.
        self.encoded = torch.tensor([mapping[ch] for ch in text], dtype=torch.long)

    def __len__(self):
        # Clamp at zero: a negative value would make len() raise when the
        # text is shorter than seq_len.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        end = idx + self.seq_len
        # Both tensors are views into self.encoded; do not modify them in place.
        x = self.encoded[idx:end]
        y = self.encoded[idx + 1:end + 1]
        return x, y


In [17]:
import torch.nn as nn

class CharLSTM(nn.Module):
    """Character-level language model: embedding -> LSTM -> linear layer.

    Args:
        vocab_size: Number of distinct token ids; also the width of the output.
        hidden_size: Width shared by the embedding and the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, hidden_size=256, num_layers=2):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token ids.

        ``hidden`` is passed straight to the LSTM and the LSTM's updated state
        is returned, so a caller can feed it back in on the next call.
        """
        embedded = self.embed(x)
        features, next_hidden = self.lstm(embedded, hidden)
        return self.fc(features), next_hidden


In [25]:
import time
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm  # progress bar

# Hyperparameters for the character-level LSTM run.
seq_len = 50
batch_size = 32
hidden_size = 64
num_layers = 1
epochs = 5
lr = 0.003

# Single place that decides where training runs; this notebook trains on CPU.
device = torch.device("cpu")

dataset = ChatDataset(text, seq_len)
# Pinned host memory only helps host-to-GPU copies, so request it only when
# training on CUDA (it was previously enabled unconditionally on a CPU run).
loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=(device.type == "cuda"),
)

model = CharLSTM(vocab_size, hidden_size, num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = torch.nn.CrossEntropyLoss()


print("🔁 Starting training...\n")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    start_time = time.time()

    pbar = tqdm(loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
    for x, y in pbar:
        x, y = x.to(device), y.to(device)

        # Each batch starts from a fresh hidden state; the returned state is discarded.
        out, _ = model(x)
        # Flatten (batch, position) so every character position is one classification target.
        loss = loss_fn(out.reshape(-1, vocab_size), y.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for both the running total and the progress bar.
        step_loss = loss.item()
        total_loss += step_loss
        pbar.set_postfix(loss=step_loss)

    epoch_time = time.time() - start_time
    print(f"✅ Epoch {epoch+1} complete - Loss: {total_loss / len(loader):.4f} - Time: {epoch_time:.2f}s")

print("\n✅ Training finished.")


🔁 Starting training...



                                                                                                                       

KeyboardInterrupt: 

## save model after training

In [70]:
# Save model and mappings
# Everything needed to rebuild the network is stored in one file: the weights,
# both character lookup tables, and the constructor arguments
# (vocab_size, hidden_size, num_layers).
torch.save({
    'model_state_dict': model.state_dict(),
    'char2idx': char2idx,
    'idx2char': idx2char,
    'vocab_size': vocab_size,
    'hidden_size': hidden_size,
    'num_layers': num_layers
}, "chat_lstm_model.pth")

print("✅ Model saved as chat_lstm_model.pth")


✅ Model saved as chat_lstm_model.pth


## load model

In [27]:
# Load checkpoint
# map_location="cpu" makes the load independent of the device the weights were
# saved from, so the file also opens on a machine without that device.
# NOTE(review): torch.load unpickles the file - only load checkpoints you
# trust (consider weights_only=True on PyTorch versions that support it).
checkpoint = torch.load("chat_lstm_model.pth", map_location="cpu")

# Rebuild the mappings and model
char2idx = checkpoint['char2idx']
idx2char = checkpoint['idx2char']
vocab_size = checkpoint['vocab_size']
hidden_size = checkpoint['hidden_size']
num_layers = checkpoint['num_layers']

# Rebuild and load the model
model = CharLSTM(vocab_size, hidden_size, num_layers)
model.load_state_dict(checkpoint['model_state_dict'])
# Switch to inference mode before generating text.
model.eval()

print("✅ Model loaded successfully")


✅ Model loaded successfully


In [29]:
# def generate_text(prompt, length=300):
#     model.eval()
#     input_idx = torch.tensor([char2idx.get(ch, 0) for ch in prompt], dtype=torch.long).unsqueeze(0)
#     generated = list(prompt)
#     hidden = None

#     for _ in range(length):
#         with torch.no_grad():
#             output, hidden = model(input_idx[:, -1:], hidden)
#         next_id = torch.argmax(output[0, -1]).item()
#         next_char = idx2char[next_id]
#         generated.append(next_char)
#         input_idx = torch.cat([input_idx, torch.tensor([[next_id]])], dim=1)

#     return ''.join(generated)



import torch.nn.functional as F

def sample_next(output, temperature=1.0):
    """Choose the next token id from a model output.

    Only ``output[0, -1]`` is used: the scores at the last position of the
    first batch row.

    Args:
        output: Tensor of scores indexed as ``[batch, position, token]``.
        temperature: Divides the scores before the softmax. ``0`` selects the
            highest-scoring token deterministically (dividing by zero used to
            produce NaN probabilities and a sampling error).

    Returns:
        The chosen token id as a Python ``int``.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    logits = output[0, -1]
    if temperature == 0:
        # Greedy decoding: the limit of sampling as temperature approaches zero.
        return torch.argmax(logits).item()

    probs = F.softmax(logits / temperature, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()

def generate_text(prompt, length=300, temperature=1.0):
    """Continue ``prompt`` with ``length`` sampled characters.

    The first model call receives the whole prompt, so the recurrent state
    reflects every prompt character. Each later call receives only the
    character just sampled, together with the state carried over from the
    previous call. (Previously only the last prompt character was ever fed
    to the model, so the rest of the prompt had no effect.)

    Args:
        prompt: Non-empty seed text. Characters missing from ``char2idx``
            are mapped to index 0.
        length: Number of characters to generate.
        temperature: Passed to ``sample_next``.

    Returns:
        The prompt followed by the generated characters.

    Raises:
        ValueError: If ``prompt`` is empty.
    """
    if not prompt:
        raise ValueError("prompt must contain at least one character")

    model.eval()
    # Shape (1, len(prompt)): a batch holding the encoded prompt.
    step_input = torch.tensor([[char2idx.get(ch, 0) for ch in prompt]], dtype=torch.long)
    generated = list(prompt)
    hidden = None

    with torch.no_grad():
        for _ in range(length):
            output, hidden = model(step_input, hidden)
            next_id = sample_next(output, temperature)
            generated.append(idx2char[next_id])
            # The carried state already summarises the history, so only the
            # newest character is fed next (no ever-growing input tensor).
            step_input = torch.tensor([[next_id]], dtype=torch.long)

    return ''.join(generated)

In [39]:
# Seed the LSTM with one "Me" turn and an open "Her" turn, then print the continuation.
# NOTE(review): the cleaned transcript is written as "<label>: <text>" with a
# space after the colon; this prompt has no space after "Me:" - confirm intended.
prompt = "Me:ki korchhis re?\nHer:"
print(generate_text(prompt))

Me:ki korchhis re?
Her: 🙂🙂🙂🙂byoum
Her: amdr
Me: Mondr kore ni
Me: upch ker 6e n
Her: kal lab lag6er
Herr: saruu
Her: Kame jbo onek diye
Me: Oito hoye6lin?
Her: Amke
Her: Miss
Me: Ami aubha
Her: poc korechhis?
Me: Ei nijekh
Me: 🤣🤣🤣🤣...j to join oble
Her: au kisr
Her: Bhai to jbi kalke e krbe softa ta
Me: indic e to bollo
M
