<a href="https://colab.research.google.com/github/AgamjotKC/Intel_Unnati_Program/blob/main/ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Installing the necessary libraries
!pip install datasets transformers accelerate matplotlib -U

# Importing necessary libraries
import inspect
import json
import time

import matplotlib.pyplot as plt
# NOTE(review): `load_metric` has been deprecated in favour of the separate
# `evaluate` package - confirm the installed `datasets` release still exports it.
from datasets import Dataset, load_metric
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, pipeline

# Starting the timer to measure how long the whole process takes
start_time = time.time()

# Downloading the Alpaca dataset from GitHub
!wget https://github.com/gururise/AlpacaDataCleaned/raw/main/alpaca_data_cleaned.json

# Loading the dataset from the downloaded JSON file
with open("alpaca_data_cleaned.json", "r") as f:
    data = json.load(f)

# Initializing lists to store the input and target texts
input_texts = []
target_texts = []

# Every record must carry all three fields to be usable as a training pair
REQUIRED_FIELDS = ("instruction", "input", "output")

# Extracting input and target texts from the dataset
for entry in data:
    if not all(field in entry for field in REQUIRED_FIELDS):
        print(f"Skipping entry due to missing fields: {entry}")
        continue
    # strip() removes the dangling separator space that would otherwise be
    # left on the prompt whenever the record's "input" field is empty.
    input_texts.append((entry["instruction"] + " " + entry["input"]).strip())
    target_texts.append(entry["output"])

# Printing the total number of examples extracted
print(f"Total examples extracted: {len(input_texts)}")

# Creating a Hugging Face Dataset from the extracted texts
dataset = Dataset.from_dict({"input_text": input_texts, "target_text": target_texts})

# Load the pretrained GPT-2 tokenizer and reuse its end-of-sequence token
# as the padding token, so that padded batches can be produced.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of prompt/response pairs for causal-LM fine-tuning.

    Each prompt is joined with its response (prompt, newline, response, EOS)
    and tokenized as ONE sequence. The labels are that same sequence, because
    a causal LM head compares position ``i`` of its output with position
    ``i + 1`` of ``labels``; tokenizing prompt and response separately would
    score unrelated positions against each other.

    Args:
        examples: Batch mapping with parallel ``"input_text"`` and
            ``"target_text"`` lists of strings.
        max_length: Length every sequence is padded/truncated to. The default
            of 256 keeps the 128-token budget the prompt and the response
            each had when they were tokenized separately.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each a list of ``max_length``-long integer lists. Padding positions
        in ``"labels"`` are set to -100 so they are ignored by the loss.
    """
    eos = tokenizer.eos_token
    sequences = [
        prompt + "\n" + response + eos
        for prompt, response in zip(examples["input_text"], examples["target_text"])
    ]
    encoded = tokenizer(
        sequences,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Mask by attention_mask rather than by token id: the pad token is the
    # EOS token here, and the genuine EOS that ends each response must still
    # contribute to the loss.
    labels = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
    }

# Apply the tokenization function batch-wise and drop the raw text columns,
# leaving only the model-ready fields.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["input_text", "target_text"],
)

# Report how many examples came out of tokenization
print(f"Total tokenized examples: {len(tokenized_datasets)}")

# Ordered 80/20 split: the first 80% of rows train, the remainder validates
total_examples = len(tokenized_datasets)
train_size = int(total_examples * 0.8)
train_indices = range(train_size)
val_indices = range(train_size, total_examples)
train_dataset = tokenized_datasets.select(train_indices)
val_dataset = tokenized_datasets.select(val_indices)

# Report the size of each split
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")

# Pie chart of how the examples are divided between the two splits
split_sizes = {'Training': len(train_dataset), 'Validation': len(val_dataset)}
labels = list(split_sizes)
sizes = list(split_sizes.values())
colors = ['skyblue', 'lightgreen']
explode = (0.1, 0)  # pull the first (training) slice out of the pie

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.title("Dataset Distribution")
plt.show()

# Show the first ten token ids of inputs and labels for three training rows
print("Sample training data (input -> output):")
for sample_index in range(3):
    sample = train_dataset[sample_index]
    print(f"Input: {sample['input_ids'][:10]}... -> Output: {sample['labels'][:10]}...")

# Defining training arguments.
# `evaluation_strategy` is the deprecated spelling of `eval_strategy`. Because
# the install step pulls the latest transformers, pick whichever keyword the
# installed TrainingArguments actually accepts instead of hard-coding one.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
    warmup_steps=500,
    save_total_limit=3,
    remove_unused_columns=False,
    **{_eval_strategy_kwarg: "epoch"},  # evaluate once at the end of every epoch
)

# Initializing the GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older ones only know `tokenizer`.
# Choose the keyword the installed Trainer declares.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Initializing the Trainer with the model, training arguments, datasets, and tokenizer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    **{_tokenizer_kwarg: tokenizer},
)

# Training the model
trainer.train()

# Fetch the BLEU metric object (it is loaded here but not applied in this section)
metric = load_metric("bleu", trust_remote_code=True)

# Wrap the fine-tuned model and its tokenizer in a text-generation pipeline
chatbot = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Stop the clock and report the wall-clock time elapsed since `start_time`
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken for the process: {total_time:.2f} seconds")

# Interactive loop to use the chatbot.
# Reads one line per turn; typing "exit" (any case, surrounding whitespace
# ignored), closing stdin, or interrupting the prompt ends the session.
print("Hello! I'm your friendly AI chatbot. How can I assist you today?")
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # EOFError is an Exception subclass: left to a blanket handler inside
        # `while True` it would be reported forever once stdin is closed.
        print("\nChatbot: Goodbye! Have a great day!")
        break

    if user_input.lower() == "exit":
        print("Chatbot: Goodbye! Have a great day!")
        break

    if not user_input:
        # Nothing to respond to; ask again instead of prompting the model
        # with an empty string.
        continue

    try:
        # Generate a response from the chatbot.
        # max_new_tokens bounds only the generated continuation; max_length
        # also counts the prompt, so a long prompt left no room for a reply.
        bot_response = chatbot(user_input, max_new_tokens=50, do_sample=True)[0]['generated_text']
    except Exception as e:
        # Keep the session alive if a single generation fails.
        print(f"An error occurred: {e}")
        continue

    print("Chatbot:", bot_response)
