In [13]:
import pandas as pd
import re
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset


In [None]:
CONFIG = {
    "max_samples": 10000,      
    "test_size": 0.1,
    "max_length": 96,        
    "batch_size": 16,         
    "epochs": 2,            
    "base_model": "t5-small"
}
print("Configuration set:")
print(CONFIG)

Configuration set:
{'max_samples': 10000, 'test_size': 0.1, 'max_length': 96, 'batch_size': 16, 'epochs': 2, 'base_model': 't5-small'}


In [15]:
print("\nLoading and processing data...")
df = pd.read_csv('data.csv')
tweet_id_map = {row['tweet_id']: row for _, row in tqdm(df.iterrows(), total=len(df), desc="Creating ID map")}


Loading and processing data...


Creating ID map:   0%|          | 0/2811774 [00:00<?, ?it/s]

In [16]:

pairs = []
valid_ids = set()

print("\nGenerating valid conversation pairs...")
with tqdm(total=CONFIG["max_samples"], desc="Pairs generated") as pbar:
    for _, row in df.iterrows():
        if row['inbound']:  # Only process customer messages
            response_id = row['response_tweet_id']
            if pd.notna(response_id):
                response_ids = str(response_id).split(',')
                for rid in response_ids:
                    rid = rid.strip()
                    if rid.isdigit():
                        rid = int(rid)
                        if rid in tweet_id_map:
                            bot_row = tweet_id_map[rid]
                            if not bot_row['inbound']:
                                pairs.append((row['text'], bot_row['text']))
                                valid_ids.add(rid)
                                pbar.update(1)
                                if len(pairs) >= CONFIG["max_samples"]:
                                    break
                    if len(pairs) >= CONFIG["max_samples"]:
                        break
        if len(pairs) >= CONFIG["max_samples"]:
            break

# Ensure we have at least some data
if not pairs:
    raise ValueError("No valid conversation pairs found! Check data format.")


Generating valid conversation pairs...


Pairs generated:   0%|          | 0/10000 [00:00<?, ?it/s]

In [17]:
def clean_text(text):
    text = re.sub(r'@\w+|http\S+', '', text)
    return text.strip()[:200]  # Limit to 200 characters

print("\nCleaning text...")
inputs = [clean_text(p[0]) for p in tqdm(pairs)]
targets = [clean_text(p[1]) for p in tqdm(pairs)]



Cleaning text...


  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [18]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=CONFIG["test_size"], random_state=42
)
print(f"\nTraining samples: {len(train_inputs)}")
print(f"Validation samples: {len(val_inputs)}")



Training samples: 9000
Validation samples: 1000


In [19]:
class ChatDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_len):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self): return len(self.inputs)

    def __getitem__(self, idx):
        input_enc = self.tokenizer(
            self.inputs[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target_enc = self.tokenizer(
            self.targets[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': input_enc['input_ids'].squeeze(),
            'attention_mask': input_enc['attention_mask'].squeeze(),
            'labels': target_enc['input_ids'].squeeze()
        }

In [20]:
!pip install sentencepiece --quiet

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nInitializing model on {device}...")

tokenizer = T5Tokenizer.from_pretrained(CONFIG["base_model"])
model = T5ForConditionalGeneration.from_pretrained(CONFIG["base_model"]).to(device)

train_dataset = ChatDataset(train_inputs, train_targets, tokenizer, CONFIG["max_length"])
val_dataset = ChatDataset(val_inputs, val_targets, tokenizer, CONFIG["max_length"])


Initializing model on cuda...


In [22]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=CONFIG["epochs"],
    per_device_train_batch_size=CONFIG["batch_size"],
    per_device_eval_batch_size=CONFIG["batch_size"],
    evaluation_strategy='epoch',
    fp16=True,
    logging_steps=50,
    save_strategy='epoch',
    report_to='none',
    optim="adafactor",  # Uses less memory
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)



In [23]:
print("\nStarting training...")
train_result = trainer.train()
print("\nTraining completed!")




Starting training...


Epoch,Training Loss,Validation Loss
1,0.9843,0.882171
2,0.9277,0.854607



Training completed!


In [24]:
print("\nSaving model...")
model.save_pretrained('chatbot_model')
tokenizer.save_pretrained('chatbot_model')
print("Model saved to 'chatbot_model' directory")


Saving model...
Model saved to 'chatbot_model' directory


In [25]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import gradio as gr
import re

tokenizer = T5Tokenizer.from_pretrained('chatbot_model')
model = T5ForConditionalGeneration.from_pretrained('chatbot_model')

def clean_input(text):
    return re.sub(r'@\w+|http\S+', '', text).strip()

def respond(message, history):
    message = clean_input(message)
    inputs = tokenizer.encode(
        message,
        return_tensors='pt',
        max_length=128,
        truncation=True
    )
    outputs = model.generate(
        inputs,
        max_length=160,
        num_beams=5,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

gr.ChatInterface(respond).launch()

  self.chatbot = Chatbot(


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


