# Fine-tuning conversational models (Blenderbot, GPT-2) on the CounselChat dataset


In [None]:
!pip install datasets

In [None]:
from datasets import Dataset
import pandas as pd
import re

# Load the CSV dataset
# `encoding` is an option of `pd.read_csv`; `Dataset.from_pandas` does not
# accept it, so it has to be passed to the reader.
counsel_chat_frame = pd.read_csv('/content/drive/MyDrive/Project_1/counsel_chat.csv', encoding='latin1')
dataset = Dataset.from_pandas(counsel_chat_frame)

# Preprocess the dataset
# Everything except word characters, whitespace and ' " , ! ? is stripped.
_NON_TEXT_PATTERN = re.compile(r'[^\w\s\'",!?]')


def preprocess_example(example):
    """Strip non-text symbols from the `user` and `therapist` fields.

    Args:
        example: Mapping with `user` and `therapist` entries. It is modified
            in place.

    Returns:
        The same mapping, with both fields replaced by cleaned strings.
        Missing values (None or NaN) become the empty string so later steps
        always receive text; other non-string values are converted with
        `str()` before cleaning.
    """
    for field in ('user', 'therapist'):
        value = example[field]
        # `value != value` is only true for NaN.
        if value is None or value != value:
            example[field] = ''
        else:
            example[field] = _NON_TEXT_PATTERN.sub('', str(value))

    return example

# Run `preprocess_example` over the dataset and rebind `dataset` to the result.
dataset = dataset.map(preprocess_example)

In [None]:
# `datasets` does not export a `train_test_split` function; splitting is a
# method on the Dataset object and returns the two parts under the keys
# 'train' and 'test'. A fixed seed keeps the split reproducible across runs.
split_datasets = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = split_datasets['train']
val_dataset = split_datasets['test']

In [None]:
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration, Trainer, TrainingArguments

# The import line at the top of this cell does not include the collator, so
# bring it into scope here.
from transformers import DataCollatorForSeq2Seq

# Load the tokenizer and model
tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
model = BlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')

# Truncate to the limit declared in the model config so no sequence is longer
# than the model supports.
max_length = model.config.max_position_embeddings


def tokenize_example(batch):
    """Convert a batch of raw text rows into model inputs.

    Args:
        batch: Mapping with `user` and `therapist` lists of strings.

    Returns:
        The tokenized `user` texts (encoder inputs) with an added `labels`
        entry holding the token ids of the matching `therapist` texts.
    """
    model_inputs = tokenizer(batch['user'], max_length=max_length, truncation=True)
    replies = tokenizer(batch['therapist'], max_length=max_length, truncation=True)
    model_inputs['labels'] = replies['input_ids']
    return model_inputs


# The raw `user`/`therapist` text columns have to be turned into token ids
# and labels before training; the text columns themselves are dropped.
tokenized_train = train_dataset.map(
    tokenize_example, batched=True, remove_columns=train_dataset.column_names
)
tokenized_val = val_dataset.map(
    tokenize_example, batched=True, remove_columns=val_dataset.column_names
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

# Define the data collator (pads inputs and labels per batch)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()


In [None]:
# Save the tokenizer next to the model weights so the directory can be
# loaded on its own.
trainer.save_model('fine-tuned-model')
tokenizer.save_pretrained('fine-tuned-model')

In [None]:
from transformers import BlenderbotForConditionalGeneration

# Load the fine-tuned model
model = BlenderbotForConditionalGeneration.from_pretrained('fine-tuned-model')

# Generate responses for some example inputs
inputs = ["user: Hello", "user: How are you?"]
# The loop variable is deliberately not called `input`: that would shadow the
# builtin for every later cell in the notebook.
for prompt in inputs:
    # Calling the tokenizer (instead of `encode`) also yields the attention
    # mask, which is forwarded to `generate` together with the ids.
    encoded = tokenizer(prompt, return_tensors='pt')
    output_ids = model.generate(**encoded)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print("Bot:", response)

In [None]:
pip install --upgrade torch transformers

In [None]:
pip install transformers[torch]

In [None]:
pip install accelerate -U

In [None]:
import pandas as pd
import torch
from transformers import BlenderbotTokenizer, BlenderbotForCausalLM, AdamW

# The encoder-decoder model class is needed below; import it here so this
# cell does not depend on an earlier cell having been run.
from transformers import BlenderbotForConditionalGeneration

# Load your dataset into a pandas DataFrame
df = pd.read_csv("/content/drive/MyDrive/Project_1/counsel_chat.csv", encoding='ISO-8859-1')

# Prepare the conversations from your dataset as [user, therapist] pairs.
# Rows missing either turn are skipped because the tokenizer only accepts text.
complete_rows = df[["user", "therapist"]].dropna().astype(str)
conversations = [
    [user_turn, therapist_turn]
    for user_turn, therapist_turn in zip(complete_rows["user"], complete_rows["therapist"])
]

# Load the tokenizer and model.
# The user turn is fed to the encoder and the therapist turn is the target,
# so the encoder-decoder class is used: given `labels`, it returns a loss.
tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
model = BlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill")

# Move the model to the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the conversations.
# The two turns are tokenized separately: BlenderbotTokenizer ignores the
# second element of a (text, text_pair) input, so passing the pairs directly
# would silently drop every therapist reply.
# The length cap never exceeds the limit declared in the model config.
max_length = min(512, model.config.max_position_embeddings)
tokenizer_kwargs = dict(
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=max_length,
    add_special_tokens=True,
)
tokenized_conversations = tokenizer([user_turn for user_turn, _ in conversations], **tokenizer_kwargs)
tokenized_replies = tokenizer([reply for _, reply in conversations], **tokenizer_kwargs)

# Padding positions in the targets are set to -100, the index the
# cross-entropy loss ignores, so padding does not contribute to the loss.
labels = tokenized_replies["input_ids"].clone()
labels[tokenized_replies["attention_mask"] == 0] = -100
tokenized_conversations["labels"] = labels


# Convert the tokenized conversations into a torch dataset
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of equally long, pre-tokenized tensors."""

    def __init__(self, encodings):
        """Store `encodings`, a mapping from field name to a tensor with one row per example."""
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return row `idx` of every field; tensors stay on their current device."""
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples (rows of `input_ids`)."""
        return len(self.encodings["input_ids"])


dataset = MyDataset(tokenized_conversations)

# Define the training arguments
num_train_epochs = 3
per_device_train_batch_size = 1
learning_rate = 1e-4

# Fixed seed so the shuffling order is the same on every run
torch.manual_seed(42)
train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=per_device_train_batch_size, shuffle=True
)

# torch's own AdamW; the optimizer class exported by transformers is deprecated.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
model.train()
for epoch in range(num_train_epochs):
    for batch in train_loader:
        # Every row was padded to a common length at tokenization time, so the
        # batch only needs to be moved to the model's device.
        batch = {key: val.to(device) for key, val in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


In [None]:
pip install transformers[torch,accelerate]

In [None]:
max_sequence_length = 512  # Maximum number of characters (not tokens) kept per conversation

# Build the column if it does not exist yet, so this cell also works when it
# runs before the cell that normally creates it.
if 'concatenated' not in df.columns:
    df['concatenated'] = df['user'].fillna('') + ' ' + df['therapist'].fillna('')

# `.str` slicing leaves missing values untouched, whereas slicing inside a
# lambda raises on NaN.
df['concatenated'] = df['concatenated'].str[:max_sequence_length]

In [None]:
# The base class is spelled out in full: when the notebook runs top to bottom,
# the bare name `Dataset` at this point is the Hugging Face `datasets.Dataset`
# imported in an earlier cell, not the torch dataset base class.
import torch


class MyDataset(torch.utils.data.Dataset):
    """Torch dataset that tokenizes a whole text column up front.

    Args:
        tokenizer: Callable that takes a list of strings plus the keyword
            arguments `truncation`, `padding`, `max_length` and
            `return_tensors`, and returns a mapping containing `input_ids`
            (and optionally `attention_mask`).
        text_column: Object with a `tolist()` method yielding the texts,
            e.g. a pandas Series.
        max_length: Optional length passed to the tokenizer. The default
            `None` leaves the choice to the tokenizer.
    """

    def __init__(self, tokenizer, text_column, max_length=None):
        self.examples = tokenizer(
            text_column.tolist(),
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples["input_ids"])

    def __getitem__(self, idx):
        """Return the `input_ids` of example `idx`, plus its `attention_mask` when available."""
        item = {"input_ids": self.examples["input_ids"][idx]}
        # Every row is padded, so the mask is passed along to let the training
        # code tell real tokens from padding.
        if "attention_mask" in self.examples:
            item["attention_mask"] = self.examples["attention_mask"][idx]
        return item

# Create the custom dataset
# MyDataset tokenizes the entire `concatenated` column inside its constructor.
# `tokenizer` and `df` are expected to exist already; they are not defined in this cell.
train_dataset = MyDataset(tokenizer, df['concatenated'])


In [None]:
# Token count of every entry in the `concatenated` column, as produced by
# `tokenizer.encode`.
sequence_lengths = [len(tokenizer.encode(text)) for text in df['concatenated']]
print("Updated sequence lengths:", sequence_lengths)


Updated sequence lengths: [108, 123, 124, 111, 116, 109, 105, 107, 116, 116, 111, 104, 117, 109, 116, 112, 119, 118, 127, 105, 98, 116, 117, 113, 132, 116, 127, 142, 114, 119, 112, 133, 121, 119, 121, 137, 107, 121, 120, 122, 119, 133, 120, 96, 109, 119, 112, 109, 137, 102, 121, 105, 111, 111, 115, 116, 132, 119, 123, 106, 117, 127, 115, 121, 140, 114, 125, 123, 126, 109, 112, 115, 121, 108, 104, 112, 116, 115, 121, 111, 113, 123, 68, 117, 119, 122, 109, 108, 125, 146, 120, 88, 132, 141, 118, 111, 122, 118, 123, 115, 106, 142, 142, 114, 101, 150, 142, 109, 116, 108, 112, 132, 113, 103, 124, 115, 45, 114, 131, 115, 111, 111, 85, 142, 112, 114, 119, 110, 118, 122, 118, 111, 112, 124, 104, 119, 123, 109, 112, 99, 129, 114, 124, 117, 131, 118, 120, 130, 123, 127, 114, 108, 127, 109, 101, 121, 110, 112, 122, 92, 110, 115, 100, 128, 118, 119, 106, 134, 131, 128, 129, 126, 109, 112, 110, 116, 121, 117, 122, 125, 102, 122, 118, 111, 120, 118, 112, 121, 111, 120, 111, 118, 115, 120, 116, 107, 1

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
import pandas as pd

# Load and preprocess your dataset
# Specify a different encoding if needed
dataset_path = '/content/drive/MyDrive/Project_1/processed_counsel_chat.csv'
dataset_encoding = 'ISO-8859-1'
df = pd.read_csv(dataset_path, encoding=dataset_encoding)

# Join the two turns with a space so the last word of the user turn and the
# first word of the reply do not fuse. Missing turns count as empty text
# instead of turning the whole entry into NaN.
df['concatenated'] = df['user'].fillna('').astype(str) + ' ' + df['therapist'].fillna('').astype(str)

# Write back with the same encoding the file is read with; otherwise every
# re-run of this cell would re-decode non-ASCII characters incorrectly.
df.to_csv(dataset_path, index=False, encoding=dataset_encoding)

# Tokenize the dataset
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Define a custom dataset class
class MyDataset(Dataset):
    """Torch dataset that tokenizes a whole text column up front.

    Args:
        tokenizer: Callable that takes a list of strings plus the keyword
            arguments `truncation`, `padding`, `max_length` and
            `return_tensors`, and returns a mapping containing `input_ids`
            (and optionally `attention_mask`).
        text_column: Object with a `tolist()` method yielding the texts,
            e.g. a pandas Series.
        max_length: Optional length passed to the tokenizer. The default
            `None` leaves the choice to the tokenizer.
    """

    def __init__(self, tokenizer, text_column, max_length=None):
        self.examples = tokenizer(
            text_column.tolist(),
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples["input_ids"])

    def __getitem__(self, idx):
        """Return the `input_ids` of example `idx`, plus its `attention_mask` when available."""
        item = {"input_ids": self.examples["input_ids"][idx]}
        # Every row is padded, so the mask is passed along to let the training
        # code tell real tokens from padding.
        if "attention_mask" in self.examples:
            item["attention_mask"] = self.examples["attention_mask"][idx]
        return item

# Needed for device selection and the optimizer below.
import torch

# Create the custom dataset (entries without text cannot be tokenized)
train_dataset = MyDataset(tokenizer, df['concatenated'].dropna())

# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
# The tokenizer was extended with a '[PAD]' token, so its vocabulary is larger
# than the checkpoint's embedding table; without resizing, the id of the new
# token has no embedding row.
model.resize_token_embeddings(len(tokenizer))

# Train on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer (torch's own AdamW; the class exported by transformers is deprecated)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        inputs = batch["input_ids"].to(device)

        # Use the tokenizer's mask when the dataset provides it; otherwise
        # derive it from the pad token id.
        if "attention_mask" in batch:
            attention_mask = batch["attention_mask"].to(device)
        else:
            attention_mask = (inputs != tokenizer.pad_token_id).long()

        # -100 is the index the loss ignores, so padding positions do not
        # contribute to the loss.
        labels = inputs.masked_fill(attention_mask == 0, -100)

        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the fine-tuned model together with the extended tokenizer; the resized
# embeddings only match a tokenizer that contains the added pad token.
model.save_pretrained("./your_fine_tuned_model")
tokenizer.save_pretrained("./your_fine_tuned_model")
