In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!rm -rf /content/drive/MyDrive/AchintyaCloneBot/server/outputModels
!rm -rf /content/drive/MyDrive/AchintyaCloneBot/server/wandb

In [None]:
!pip install transformers accelerate bitsandbytes datasets torch trl peft
!pip install -q sentencepiece tokenizers
!pip install fastapi uvicorn streamlit
!pip install -q wandb

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.18.2-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  D

In [None]:
import os
import re
import json
import zipfile
import pandas as pd
from datetime import datetime
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from collections import defaultdict
import logging

import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer, DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from trl import SFTTrainer
import wandb

In [None]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
%cd /content/drive/MyDrive/AchintyaCloneBot/server

/content/drive/MyDrive/AchintyaCloneBot/server


In [None]:
@dataclass
class ChatMessage:
    author: str
    message: str
    timestamp: datetime

class DataCleaner:
    @staticmethod
    def clean_message(message: str) -> str:
        """Clean and normalize message text while preserving personal style."""
        if not message:
            return ""

        message = message.strip()
        message = ' '.join(message.split())

        # Remove URLs but keep other elements
        message = re.sub(r'http\S+|www.\S+', '[URL]', message)

        return message

    @staticmethod
    def validate_message(message: str) -> bool:
        """Validate if a message should be included in the dataset."""
        if not message or len(message.strip()) < 2:
            return False

        # Skip media and system messages
        skip_patterns = [
            '<media omitted>', 'media omitted', '[redacted]', 'message deleted',
            'image omitted', 'video omitted', 'audio omitted', 'document omitted',
            'this message was deleted', 'sticker omitted', 'gif omitted'
        ]

        # Skip very long messages (over 500 chars)
        if len(message) > 500:
            return False

        return not any(pattern.lower() in message.lower() for pattern in skip_patterns)

class ChatParser:
    def __init__(self):
        self.date_patterns = [
            # Standard WhatsApp formats
            r"(\d{1,2}/\d{1,2}/\d{2,4}),\s*(\d{1,2}:\d{2}(?::\d{2})?\s*[aApP][mM])\s*-\s*([^:]+):\s*(.+)",
            r"(\d{1,2}/\d{1,2}/\d{2,4}),\s*(\d{1,2}:\d{2}(?::\d{2})?)\s*-\s*([^:]+):\s*(.+)",
            r"\[(\d{1,2}/\d{1,2}/\d{2,4}),\s*(\d{1,2}:\d{2}(?::\d{2})?\s*[aApP][mM])\]\s*([^:]+):\s*(.+)",
        ]

    def parse_line(self, line: str) -> Optional[ChatMessage]:
        """Parse a single line of chat."""
        line = line.strip()
        if not line:
            return None

        for pattern in self.date_patterns:
            match = re.search(pattern, line, re.IGNORECASE)
            if match:
                date, time, author, message = match.groups()

                # Try to parse timestamp
                timestamp = None
                for date_format in ["%d/%m/%y", "%m/%d/%y", "%d/%m/%Y", "%m/%d/%Y"]:
                    try:
                        if "am" in time.lower() or "pm" in time.lower():
                            time_format = "%I:%M:%S %p" if time.count(':') == 2 else "%I:%M %p"
                        else:
                            time_format = "%H:%M:%S" if time.count(':') == 2 else "%H:%M"

                        timestamp = datetime.strptime(f"{date} {time}", f"{date_format} {time_format}")
                        break
                    except ValueError:
                        continue

                if timestamp is None:
                    timestamp = datetime.now()  # Fallback

                return ChatMessage(
                    author=author.strip(),
                    message=message.strip(),
                    timestamp=timestamp
                )
        return None


In [None]:
def extract_and_process_chats(zip_folder_path: str, your_name: str = "ACHINTYA GUPTA") -> pd.DataFrame:
    """Extract and process all WhatsApp chat files from zip folders."""

    all_messages = []
    chat_parser = ChatParser()

    # Process all zip files in the folder
    for filename in os.listdir(zip_folder_path):
        if filename.endswith('.zip'):
            zip_path = os.path.join(zip_folder_path, filename)

            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Extract to temporary folder
                temp_folder = f"/tmp/{filename[:-4]}"
                zip_ref.extractall(temp_folder)

                # Find .txt files
                for root, dirs, files in os.walk(temp_folder):
                    for file in files:
                        if file.endswith('.txt'):
                            txt_path = os.path.join(root, file)

                            try:
                                with open(txt_path, 'r', encoding='utf-8') as f:
                                    for line in f:
                                        message = chat_parser.parse_line(line)
                                        if message and DataCleaner.validate_message(message.message):
                                            cleaned_message = DataCleaner.clean_message(message.message)
                                            if cleaned_message:
                                                all_messages.append({
                                                    'author': message.author,
                                                    'message': cleaned_message,
                                                    'timestamp': message.timestamp,
                                                    'source_file': filename
                                                })
                            except Exception as e:
                                logger.warning(f"Error processing {txt_path}: {e}")

    df = pd.DataFrame(all_messages)
    df = df.sort_values('timestamp').reset_index(drop=True)

    logger.info(f"Processed {len(df)} messages from {len(df['source_file'].unique())} chat files")
    logger.info(f"Messages from {your_name}: {len(df[df['author'] == your_name])}")

    return df

In [None]:
def create_conversational_dataset(df: pd.DataFrame, your_name: str = "ACHINTYA GUPTA",
                                context_length: int = 3) -> List[Dict]:
    """Create conversational dataset for fine-tuning."""

    conversations = []
    current_context = []

    for _, row in df.iterrows():
        author = row['author']
        message = row['message']

        # Add message to context
        current_context.append(f"{author}: {message}")

        # If this is your message and we have context, create a training example
        if author == your_name and len(current_context) > 1:
            # Get context (excluding your current message)
            context = current_context[:-1]

            # Limit context length
            if len(context) > context_length:
                context = context[-context_length:]

            conversation_data = {
                "input": "\n".join(context),
                "output": message,
                "instruction": f"You are {your_name}. Respond naturally to this conversation:"
            }
            conversations.append(conversation_data)

        # Maintain sliding window
        if len(current_context) > context_length * 2:
            current_context = current_context[-context_length:]

    logger.info(f"Created {len(conversations)} training examples")
    return conversations

In [None]:
def setup_model_and_tokenizer():
    """Setup BitNet model and tokenizer with proper configuration."""

    # Use available BitNet model or fallback to similar efficient model
    model_names = [
        "1bitLLM/bitnet_b1_58-large",
        "microsoft/DialoGPT-medium",  # Fallback
        "microsoft/DialoGPT-small"    # Smaller fallback
    ]

    model_name = None
    tokenizer = None
    model = None

    for name in model_names:
        try:
            tokenizer = AutoTokenizer.from_pretrained(name)

            # Configure for efficient training
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            )

            model = AutoModelForCausalLM.from_pretrained(
                name,
                quantization_config=bnb_config,
                device_map="auto",
                trust_remote_code=True,
            )

            model_name = name
            logger.info(f"Successfully loaded model: {model_name}")
            break

        except Exception as e:
            logger.warning(f"Failed to load {name}: {e}")
            continue

    if model is None:
        raise ValueError("Could not load any compatible model")

    # Setup tokenizer
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer, model_name

def setup_lora_config(model):
    """Setup LoRA configuration for efficient fine-tuning."""

    # Find target modules based on model architecture
    target_modules = []

    # Check what modules are available in the model
    for name, module in model.named_modules():
        if any(key in name for key in ["attn", "attention"]):
            if any(proj in name for proj in ["q_proj", "k_proj", "v_proj", "out_proj"]):
                target_modules.append(name.split('.')[-1])

    # Fallback for DialoGPT and similar models
    if not target_modules:
        # Common attention layer names in different architectures
        possible_targets = [
            "c_attn", "c_proj",  # GPT-2 style
            "attn.c_attn", "attn.c_proj",
            "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.out_proj",
            "attention.query", "attention.key", "attention.value", "attention.dense"
        ]

        for name, _ in model.named_modules():
            for target in possible_targets:
                if target in name:
                    base_name = target.split('.')[-1]
                    if base_name not in target_modules:
                        target_modules.append(base_name)

    # Default fallback
    if not target_modules:
        target_modules = ["c_attn", "c_proj"]  # GPT-2/DialoGPT default

    print(f"Using LoRA target modules: {target_modules}")

    lora_config = LoraConfig(
        r=4,
        lora_alpha=8,
        target_modules=target_modules,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    return lora_config

def format_training_data(conversations: List[Dict], tokenizer) -> Dataset:
    """Format conversations for training."""

    def format_conversation(example):
        instruction = example["instruction"]
        input_text = example["input"]
        output_text = example["output"]

        # Create chat format
        text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output_text}{tokenizer.eos_token}"

        return {"text": text}

    # Convert to dataset
    dataset = Dataset.from_list(conversations)
    dataset = dataset.map(format_conversation, remove_columns=dataset.column_names)

    return dataset

def train_model(model, tokenizer, dataset, output_dir: str):

    def tokenize_dataset(dataset, tokenizer):
      """Tokenize the dataset manually for training."""
      def tokenize(example):
          return tokenizer(
              example["text"],
              truncation=True,
              padding="max_length",
              max_length=512,
          )
      return dataset.map(tokenize, batched=True)
    """Train the model using SFTTrainer."""

    # Prepare model for training
    model = prepare_model_for_kbit_training(model)

    # Apply LoRA
    lora_config = setup_lora_config(model)
    model = get_peft_model(model, lora_config)

    tokenized_dataset = tokenize_dataset(dataset, tokenizer)

    # Training arguments optimized for Colab free tier
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        num_train_epochs=1,
        learning_rate=5e-5,
        fp16=True,
        logging_steps=100,
        save_steps=100,
        save_total_limit=1,
        remove_unused_columns=False,
        dataloader_pin_memory=False,
        gradient_checkpointing=True,
        warmup_steps=10,
        optim="paged_adamw_8bit",
    )

    # Initialize trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=tokenized_dataset,
        # tokenizer=tokenizer,
        args=training_args,
        # max_seq_length=512,
        # dataset_text_field="text",
        # packing=False,
    )

    # Train the model
    trainer.train()

    # Save the model
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

    return trainer

In [None]:
def main_training_pipeline():
    """Main training pipeline for Google Colab."""

    # Configuration
    YOUR_NAME = "ACHINTYA GUPTA"
    ZIP_FOLDER_PATH = "/content/drive/MyDrive/AchintyaCloneBot/server/data"  # Update this path
    OUTPUT_DIR = "/content/drive/MyDrive/AchintyaCloneBot/server/outputModels"

    print("🚀 Starting WhatsApp BitNet Training Pipeline...")

    # Step 1: Process WhatsApp data
    print("📱 Processing WhatsApp chat data...")
    df = extract_and_process_chats(ZIP_FOLDER_PATH, YOUR_NAME)

    # Step 2: Create conversational dataset
    print("💬 Creating conversational dataset...")
    conversations = create_conversational_dataset(df, YOUR_NAME)

    if len(conversations) < 10:
        raise ValueError("Not enough training data. Need at least 10 conversations.")

    # Step 3: Setup model and tokenizer
    print("🤖 Setting up BitNet model...")
    model, tokenizer, model_name = setup_model_and_tokenizer()

    # Step 4: Format training data
    print("📊 Formatting training data...")
    dataset = format_training_data(conversations, tokenizer)

    # Step 5: Train the model
    print("🏋️ Training the model...")
    trainer = train_model(model, tokenizer, dataset, OUTPUT_DIR)

    print(f"✅ Training completed! Model saved to {OUTPUT_DIR}")
    print("📊 Training statistics:")
    print(f"- Total conversations: {len(conversations)}")
    print(f"- Model used: {model_name}")
    print(f"- Output directory: {OUTPUT_DIR}")

# Run the training pipeline
if __name__ == "__main__":
    main_training_pipeline()

🚀 Starting WhatsApp BitNet Training Pipeline...
📱 Processing WhatsApp chat data...
💬 Creating conversational dataset...
🤖 Setting up BitNet model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

📊 Formatting training data...


Map:   0%|          | 0/21833 [00:00<?, ? examples/s]

🏋️ Training the model...
Using LoRA target modules: ['c_attn', 'c_proj']


Map:   0%|          | 0/21833 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/21833 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m69homiesatwork[0m ([33m69homiesatwork-srm-institute-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.5345
200,1.0868
300,0.827
400,0.6765
500,0.572
600,0.533
700,0.5015
800,0.4592
900,0.464
1000,0.4389


✅ Training completed! Model saved to /content/drive/MyDrive/AchintyaCloneBot/server/outputModels
📊 Training statistics:
- Total conversations: 21833
- Model used: microsoft/DialoGPT-medium
- Output directory: /content/drive/MyDrive/AchintyaCloneBot/server/outputModels
