In [1]:
%cd /content/drive/MyDrive/final_attempt_on_bot

/content/drive/MyDrive/final_attempt_on_bot


In [2]:
!pip install -q pandas numpy torch transformers peft bitsandbytes sentence-transformers faiss-cpu datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import re
import pandas as pd
import numpy as np
import torch
import pickle
# import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from sentence_transformers import SentenceTransformer
import faiss
from datasets import Dataset

In [4]:
# WhatsApp data cleaning with Hinglish support
def clean_whatsapp_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Remove API keys and sensitive info
    api_patterns = [
        r'sk-[A-Za-z0-9]{20,}',
        r'AIza[A-Za-z0-9_-]{35}',
        r'hf_[A-Za-z0-9]{20,}',
        r'mongodb\+srv://[^\s]+',
        r'[A-Za-z0-9]{32,}',
        r'\b\d{10,}\b'  # Remove long numbers (phone numbers)
    ]

    for pattern in api_patterns:
        content = re.sub(pattern, '[REDACTED]', content)

    # Remove unwanted users (Hinglish names)
    unwanted_users = ["Papa", "Wife Jio", "Mummy", "Office Group"]
    lines = content.split('\n')

    filtered_lines = []
    for line in lines:
        if not any(user in line for user in unwanted_users):
            filtered_lines.append(line)

    return '\n'.join(filtered_lines)

In [5]:
# Shorten Indian/Hinglish names
def shorten_user_names(text):
    name_mapping = {
        "Nipun Dhiman SRM Adhiyaman 9th Floor": "Nipun",
        "Achintya Gupta": "Achintya",
        "Priyanka Sharma Didi": "Priyanka Didi",
        "Rahul Bhaiya": "Rahul",
        "Mummy Papa Group": "Family Group"
    }

    for long_name, short_name in name_mapping.items():
        text = text.replace(long_name, short_name)

    return text

In [6]:
# Parse WhatsApp data with Hinglish support
def parse_whatsapp_data(cleaned_text):
    # Enhanced pattern for Hinglish chats (supports 12/24 hour format)
    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4},? \d{1,2}:\d{2}\s?[ap]m?) - ([^:]+): (.+)'

    messages = []
    for match in re.finditer(pattern, cleaned_text):
        timestamp, sender, message = match.groups()
        messages.append({
            'timestamp': pd.to_datetime(timestamp, dayfirst=True),
            'sender': sender.strip(),
            'message': message.strip()
        })

    return pd.DataFrame(messages)

In [7]:
# Filter for your messages only (Hinglish name support)
def extract_your_messages(df, your_name="ACHINTYA GUPTA"):
    your_messages = df[df['sender'].str.contains(your_name, case=False)]['message'].tolist()
    return [msg for msg in your_messages if len(msg.split()) > 3]  # Filter short messages


In [8]:
# Create conversation pairs for Hinglish chats
def create_conversation_pairs(df, your_name="ACHINTYA GUPTA"):
    conversations = []
    for i in range(len(df) - 1):
        current_msg = df.iloc[i]
        next_msg = df.iloc[i + 1]

        # Create input-output pairs where others message you, you respond
        if (your_name not in current_msg['sender']) and (your_name in next_msg['sender']):
            conversations.append({
                'input': current_msg['message'],
                'output': next_msg['message']
            })

    return conversations

In [9]:
# Dataset preparation for fine-tuning
def prepare_dataset(conversations):
    dataset = Dataset.from_pandas(pd.DataFrame(conversations))
    return dataset.map(lambda x: {
        'text': f"Input: {x['input']}\nOutput: {x['output']}"
    })

In [10]:
# Enhanced RAG class for Hinglish support
class SimpleRAG:
    def __init__(self, documents):
        self.documents = documents
        self.embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # Multilingual model
        self.index = None
        self.build_index()

    def build_index(self):
        embeddings = self.embedder.encode(self.documents)
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(embeddings.astype('float32'))

    def retrieve(self, query, k=3):
        query_embedding = self.embedder.encode([query])
        distances, indices = self.index.search(query_embedding.astype('float32'), k)
        return [self.documents[i] for i in indices[0]]

    def save(self, file_path):
        # Save only the documents and index (embedder will be reinitialized)
        with open(file_path, 'wb') as f:
            pickle.dump({
                'documents': self.documents,
                'index': self.index
            }, f)

    @classmethod
    def load(cls, file_path):
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        rag = cls(data['documents'])
        rag.index = data['index']
        return rag

    def generate_response(self, query, model, tokenizer):
        context = self.retrieve(query)
        prompt = f"Context: {' '.join(context)}\nQuestion: {query}\nAnswer:"

        inputs = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_length=200,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.split("Answer:")[-1].strip()


In [11]:
def train_model():
    # Process data
    raw_data = clean_whatsapp_data("/content/drive/MyDrive/final_attempt_on_bot/mergedTextNew.txt")
    processed_data = shorten_user_names(raw_data)
    df = parse_whatsapp_data(processed_data)

    # Prepare training data
    your_messages = extract_your_messages(df)
    conversations = create_conversation_pairs(df)
    dataset = prepare_dataset(conversations)

    # Save RAG system
    rag_system = SimpleRAG(your_messages)
    rag_system.save('rag_system.pkl')

    # Initialize model with proper quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    tokenizer.pad_token = tokenizer.eos_token  # Set pad token

    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/DialoGPT-medium",
        quantization_config=bnb_config,
        device_map="auto"
    )

    # Correct LoRA configuration for DialoGPT
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,
        lora_alpha=16,
        target_modules=["attn.c_attn"],
        lora_dropout=0.05,
        bias="none"
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Tokenize dataset with padding
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            max_length=256,
            padding="max_length",  # Pad to max_length
            return_tensors="pt"
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Use proper data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./achintya-chatbot",
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        save_steps=500,
        eval_steps=500,
        save_total_limit=3,
        report_to="none"
    )

    # Trainer with proper data collator
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator
    )

    # Train
    trainer.train()
    model.save_pretrained("./achintya-chatbot")
    tokenizer.save_pretrained("./achintya-chatbot")

if __name__ == "__main__":
    train_model()

Map:   0%|          | 0/21970 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 786,432 || all params: 355,609,600 || trainable%: 0.2212


Map:   0%|          | 0/21970 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,8.1743
200,5.2411
300,4.753
400,4.5735
500,4.3829
600,4.2168
700,4.0733
800,4.0326
900,4.015
1000,3.8807
