<a href="https://colab.research.google.com/github/Freedisch/ml-formativeChatbot/blob/main/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BertForSequenceClassification, AdamW

# Step 1: Preprocess Data
Preprocessing involves cleaning, tokenizing, and preparing input tensors for BERT.

In [7]:
import pandas as pd

file_path = 'Covid.csv'
df = pd.read_csv(file_path)

print(df.head())

import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# Preprocess data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['question'] = df['question'].apply(clean_text)
df['answer'] = df['answer'].apply(clean_text)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 512

df['question_tokens'] = df['question'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True, padding='max_length'))
df['answer_tokens'] = df['answer'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True, padding='max_length'))



                                            question  \
0                       What is a novel coronavirus?   
1  Why is the disease being called coronavirus di...   
2  Why might someone blame or avoid individuals a...   
3  How can people help stop stigma related to COV...   
4                   What is the source of the virus?   

                                              answer  
0  A novel coronavirus is a new coronavirus that ...  
1  On February 11, 2020 the World Health Organiza...  
2  People in the U.S. may be worried or anxious a...  
3  People can fight stigma and help, not hurt, ot...  
4  Coronaviruses are a large family of viruses. S...  


# Step 2: Prepare Input Tensors for BERT
Convert tokens to input tensors.

In [3]:
class ChatbotDataset(Dataset):
    def __init__(self, questions, answers):
        self.questions = questions
        self.answers = answers

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]
        question_attention_mask = [1] * len(question)
        answer_attention_mask = [1] * len(answer)
        return torch.tensor(question), torch.tensor(question_attention_mask), torch.tensor(answer), torch.tensor(answer_attention_mask)

questions = df['question_tokens'].tolist()
answers = df['answer_tokens'].tolist()

dataset = ChatbotDataset(questions, answers)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


# Step 3: Fine-tune the BERT Model
Fine-tune BERT for a question-answering task.

In [5]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df))
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):
    for batch in dataloader:
        questions, question_attention_masks, answers, answer_attention_masks = batch
        optimizer.zero_grad()


        assert questions.size(0) == answers.size(0), "Batch size mismatch between questions and answers"

        print(f"Questions shape: {questions.shape}, Answers shape: {answers.shape}")

        outputs = model(input_ids=questions, attention_mask=question_attention_masks, labels=answers)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch}, Loss {1.303003}')

model.save_pretrained('finetuned_bert_model')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Questions shape: torch.Size([2, 512]), Answers shape: torch.Size([2, 512])
Epoch 0, Loss 1.303003
Questions shape: torch.Size([2, 512]), Answers shape: torch.Size([2, 512])
Epoch 0, Loss 1.303003
Questions shape: torch.Size([2, 512]), Answers shape: torch.Size([2, 512])
Epoch 0, Loss 1.303003
Questions shape: torch.Size([2, 512]), Answers shape: torch.Size([2, 512])
Epoch 0, Loss 1.303003
Questions shape: torch.Size([2, 512]), Answers shape: torch.Size([2, 512])
Epoch 0, Loss 1.303003
Questions shape: torch.Size([2, 512]), Answers shape: torch.Size([2, 512])
Epoch 0, Loss 1.303003
Questions shape: torch.Size([2, 512]), Answers shape: torch.Size([2, 512])
Epoch 0, Loss 1.303003
Questions shape: torch.Size([2, 512]), Answers shape: torch.Size([2, 512])
Epoch 0, Loss 1.303003
Questions shape: torch.Size([2, 512]), Answers shape: torch.Size([2, 512])
Epoch 0, Loss 1.303003
Questions shape: torch.Size([2, 512]), Answers shape: torch.Size([2, 512])
Epoch 0, Loss 1.303003
Questions shape: tor

# Step 4: Build an Interface for the Model
Create a simple interface to interact with the model.

In [6]:
def get_response(question):
    inputs = tokenizer(question, return_tensors='pt', max_length=max_len, truncation=True, padding='max_length')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    answer_idx = torch.argmax(outputs.logits, dim=1).item()
    return df['answer'].iloc[answer_idx]

question = "What are the recommendation to be protected against COVID-19?"
response = get_response(question)
print(response)


