In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used by
# later cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#clean the data
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages this cell relies on: the stopword lists,
# WordNet (used by the lemmatizer) and the punkt_tab tokenizer data.
for _nltk_resource in ('stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Module-level helpers shared by clean_text().
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load the CSV file
def load_data(csv_file):
    """Read the CSV file at ``csv_file`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(csv_file)
    return frame

# Clean the conversation data
def clean_data(df):
    """Return a cleaned copy of the conversation data.

    The input frame is not modified. Processing steps, in order:

    1. keep only the 'instruction' and 'response' columns;
    2. drop exact duplicate (instruction, response) pairs;
    3. drop rows where either column is missing;
    4. normalise both columns with ``clean_text``;
    5. drop rows where either column is empty after cleaning.

    Args:
        df: DataFrame containing at least 'instruction' and 'response' columns.

    Returns:
        A new DataFrame with the two cleaned text columns.

    Raises:
        KeyError: if either required column is absent from ``df``.
    """
    text_columns = ['instruction', 'response']

    # Work on an explicit copy: assigning columns on a slice of the caller's
    # frame can trigger pandas' SettingWithCopyWarning and makes it unclear
    # which object is being modified.
    cleaned = df[text_columns].copy()

    cleaned = cleaned.drop_duplicates(subset=text_columns)
    cleaned = cleaned.dropna(subset=text_columns)

    for column in text_columns:
        # astype(str) guards against non-string cells (e.g. a purely numeric
        # reply), which would otherwise fail inside clean_text on .lower().
        cleaned[column] = cleaned[column].astype(str).apply(clean_text)

    # Text made only of punctuation/stopwords cleans down to ''. Such rows
    # carry no training signal, and an empty CSV field is read back as NaN by
    # pandas, so remove them here.
    has_text = (cleaned['instruction'] != '') & (cleaned['response'] != '')
    cleaned = cleaned[has_text]

    return cleaned

# Text cleaning function to normalize the input and response
def clean_text(text):
    """Normalise a single piece of text for the cleaned dataset.

    The text is lowercased, non-alphanumeric characters are replaced by
    spaces, the result is tokenised with NLTK, English stopwords are removed
    and the remaining tokens are lemmatised.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing remains).
    """
    # Lowercase the text
    text = text.lower()

    # Replace special characters with a space rather than deleting them.
    # Deleting glues the surrounding fragments together ("i've" -> "ive",
    # "you're" -> "youre", "order-number" -> "ordernumber"); those fused
    # tokens are not in the stopword set and end up in the cleaned output.
    # With a space, the pieces are separate tokens that the stopword filter
    # below can evaluate individually.
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Tokenize the text (split into words)
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and lemmatize the remaining tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Rejoin the tokens into a cleaned sentence
    return ' '.join(tokens)

# Save cleaned data back to CSV
def save_cleaned_data(df, output_file):
    """Write ``df`` to ``output_file`` as CSV without the index column."""
    write_options = {"index": False}
    df.to_csv(output_file, **write_options)

# Main function to clean and save the data
def clean_and_save(csv_file, output_file):
    """Load ``csv_file``, clean its contents and write them to ``output_file``."""
    save_cleaned_data(clean_data(load_data(csv_file)), output_file)

# Input and output locations on the mounted Google Drive.
# Both are absolute Colab paths, so the Drive mount cell must have run first.
input_file = "/content/drive/MyDrive/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"  # Your original CSV file with data
output_file = "/content/drive/MyDrive/cleaned_conversation_log.csv"  # File to save the cleaned data

# Run the cleaning process: load input_file, clean it, write output_file.
clean_and_save(input_file, output_file)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
import pandas as pd

# Load the cleaned dataset written by the cleaning cell
file_path = '/content/drive/MyDrive/cleaned_conversation_log.csv'
all_data = pd.read_csv(file_path)

# `data` is the frame the training cell splits with train_test_split, so it
# has to be defined here for a fresh "Run All" to work.
# dropna() removes rows whose text was read back as NaN (empty CSV fields);
# the sample size is capped at the number of usable rows so sample() cannot
# raise when fewer than 10000 rows are available.
usable_rows = all_data[['instruction', 'response']].dropna()
data = usable_rows.sample(min(10000, len(usable_rows)), random_state=42)


In [None]:
# Preview the first rows of the cleaned dataset.
all_data.head()

Unnamed: 0,instruction,response
0,question cancelling order order number,ive understood question regarding canceling or...
1,question cancelling oorder order number,ive informed question canceling order order nu...
2,need help cancelling puchase order number,sense youre seeking assistance canceling purch...
3,need cancel purchase order number,understood need assistance canceling purchase ...
4,afford order cancel purchase order number,im sensitive fact youre facing financial diffi...


In [None]:
from torch.utils.data import Dataset

class CustomerSupportDataset(Dataset):
    """Torch dataset yielding tokenised (instruction -> response) pairs.

    Args:
        data: pandas DataFrame with 'instruction' and 'response' columns;
            rows are accessed positionally via ``.iloc``.
        tokenizer: callable tokenizer that accepts ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returns an object with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: length every input and target sequence is padded or
            truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenise ``text`` to exactly ``max_length`` tokens as 'pt' tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        row = self.data.iloc[idx]  # fetch the row once instead of once per column
        input_text = f"Customer support: {row['instruction']}"
        target_text = row['response']

        # Tokenize the input and target texts
        inputs = self._encode(input_text)
        targets = self._encode(target_text)

        # Padded target positions must not contribute to the loss. The loss
        # used by Hugging Face seq2seq models ignores label value -100, so
        # replace every padding position (attention_mask == 0) with -100.
        # Without this, most of the max_length-long label sequence is padding
        # and the model is trained (and scored) on predicting pad tokens.
        padding_positions = targets.attention_mask.flatten() == 0
        labels = targets.input_ids.flatten().masked_fill(padding_positions, -100)

        return {
            "input_ids": inputs.input_ids.flatten(),
            "attention_mask": inputs.attention_mask.flatten(),
            "labels": labels,
        }


In [None]:

from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Load the pretrained Flan-T5 model and its tokenizer
model_name = "google/flan-t5-small"  # Replace with "google/flan-t5-large" if more resources are available
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Hold out 20% of the rows for evaluation; random_state fixes the split.
# NOTE: `data` is not defined in this cell, so it must already exist in the kernel.
train_data, eval_data = train_test_split(data, test_size=0.2, random_state=42)

# Wrap both splits in the dataset class that tokenises each row on access
train_dataset = CustomerSupportDataset(train_data, tokenizer)
eval_dataset = CustomerSupportDataset(eval_data, tokenizer)  # Create evaluation dataset

# Training arguments
# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Trainer output (checkpoints) is written to this Drive directory.
    output_dir="/content/drive/MyDrive/customer_support_flan_t5",
    # Run evaluation on eval_dataset at the end of every epoch.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
    logging_dir="./logs"
)

# Initialize the Trainer with the model, arguments and both datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Fine-tune the model (updates `model` in place)
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.4975,0.398881
2,0.442,0.361263
3,0.4285,0.352471


TrainOutput(global_step=6000, training_loss=0.9371358083089193, metrics={'train_runtime': 2656.0636, 'train_samples_per_second': 9.036, 'train_steps_per_second': 2.259, 'total_flos': 4461372112896000.0, 'train_loss': 0.9371358083089193, 'epoch': 3.0})

In [None]:
# Save the fine-tuned weights together with the tokenizer files.
# Two targets are written:
#   * the local "customer_support_flan_t5" folder, which the zip cell archives;
#   * the Drive folder that the inference cells pass to from_pretrained().
# Saving only locally leaves the Drive folder without a loadable model.
for save_dir in ("customer_support_flan_t5", "/content/drive/MyDrive/customer_support_flan_t5"):
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

('customer_support_flan_t5/tokenizer_config.json',
 'customer_support_flan_t5/special_tokens_map.json',
 'customer_support_flan_t5/spiece.model',
 'customer_support_flan_t5/added_tokens.json')

In [None]:
 # Recursively archive the local customer_support_flan_t5/ directory into one zip file.
 !zip -r customer_support_flan_t5.zip customer_support_flan_t5/

  adding: customer_support_flan_t5/ (stored 0%)
  adding: customer_support_flan_t5/tokenizer_config.json (deflated 94%)
  adding: customer_support_flan_t5/model.safetensors (deflated 7%)
  adding: customer_support_flan_t5/checkpoint-6000/ (stored 0%)
  adding: customer_support_flan_t5/checkpoint-6000/optimizer.pt (deflated 24%)
  adding: customer_support_flan_t5/checkpoint-6000/rng_state.pth (deflated 25%)
  adding: customer_support_flan_t5/checkpoint-6000/model.safetensors (deflated 7%)
  adding: customer_support_flan_t5/checkpoint-6000/scheduler.pt (deflated 56%)
  adding: customer_support_flan_t5/checkpoint-6000/trainer_state.json (deflated 73%)
  adding: customer_support_flan_t5/checkpoint-6000/training_args.bin (deflated 51%)
  adding: customer_support_flan_t5/checkpoint-6000/config.json (deflated 62%)
  adding: customer_support_flan_t5/checkpoint-6000/generation_config.json (deflated 30%)
  adding: customer_support_flan_t5/special_tokens_map.json (deflated 85%)
  adding: customer

In [None]:
# Extract the model archive into /content.
!unzip "/content/customer_support_flan_t5.zip" -d "/content"

Archive:  /content/customer_support_flan_t5.zip
   creating: /content/customer_support_flan_t5/
  inflating: /content/customer_support_flan_t5/tokenizer_config.json  
  inflating: /content/customer_support_flan_t5/model.safetensors  
   creating: /content/customer_support_flan_t5/checkpoint-6000/
  inflating: /content/customer_support_flan_t5/checkpoint-6000/optimizer.pt  
  inflating: /content/customer_support_flan_t5/checkpoint-6000/rng_state.pth  
  inflating: /content/customer_support_flan_t5/checkpoint-6000/model.safetensors  
  inflating: /content/customer_support_flan_t5/checkpoint-6000/scheduler.pt  
  inflating: /content/customer_support_flan_t5/checkpoint-6000/trainer_state.json  
  inflating: /content/customer_support_flan_t5/checkpoint-6000/training_args.bin  
  inflating: /content/customer_support_flan_t5/checkpoint-6000/config.json  
  inflating: /content/customer_support_flan_t5/checkpoint-6000/generation_config.json  
  inflating: /content/customer_support_flan_t5/speci

In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.7.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.0 (from gradio)
  Downloading gradio_client-1.5.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [None]:
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the fine-tuned model and tokenizer.
# This rebinds the module-level `tokenizer` and `model` names.
# NOTE(review): confirm this Drive directory contains the files written by
# save_pretrained() (model weights and tokenizer files) before running.
model_name_or_path = "/content/drive/MyDrive/customer_support_flan_t5"  # Directory where the fine-tuned model was saved
tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)
model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)

# Define the response generation function
def generate_response(user_input):
    """Generate a support reply for one user message.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        user_input: the raw text typed by the user.

    Returns:
        The decoded model output as a string, with special tokens removed.
    """
    # Same "Customer support: " prefix that the training examples were built with.
    input_text = f"Customer support: {user_input}"

    # Truncate to 512 tokens, the max_length the training dataset used, so
    # very long messages stay within the length the model was fine-tuned on.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)

    # Pass the attention mask explicitly alongside the ids.
    # `early_stopping` is omitted: it only affects beam search, and this call
    # does not request beams, so the flag had no effect.
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=150,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
    )

    # Decode the single returned sequence
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Define the Gradio interface.
# No `theme` argument is passed: the previous value "compact" does not appear
# to be a theme the installed Gradio release can resolve, and the recorded
# output shows a failed lookup ("Sorry, we can't find the page you are
# looking for."). Omitting it uses Gradio's default theme.
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=7, label="User Input"),  # Multi-line box for the user's query
    outputs=gr.Textbox(label="Bot Response"),        # Text box showing the generated reply
    title="Customer Support Chatbot",
    description="Enter your query, and the chatbot will respond accordingly.",
)

# Launch the interface
iface.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

Sorry, we can't find the page you are looking for.


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a899c990ee894a3e5a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Quick smoke test: run one query through the loaded model without Gradio.
test_input = "How can I cancel my order?"
# Build the prompt with the "Customer support: " prefix and tokenise it.
inputs = tokenizer(f"Customer support: {test_input}", return_tensors="pt")
# max_length=50 caps the generated sequence, so longer replies are cut off.
outputs = model.generate(inputs.input_ids, max_length=50)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Test response:", response)

Test response: I'm here to assist you in canceling your order. To cancel your order, you can visit our website and navigate to the "Order Settings" or "Order Settings" section. You will find a list of all the


In [None]:
import gradio as gr
import csv
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load your fine-tuned model.
# This rebinds the module-level `model_name`, `tokenizer` and `model` names.
model_name = "/content/drive/MyDrive/customer_support_flan_t5"  # Replace with your model's path or Hugging Face model ID
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Function to generate responses from the model
def chat(input_text):
    """Generate a reply to ``input_text`` with the fine-tuned model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        input_text: the raw text typed by the user.

    Returns:
        The decoded model output as a string, with special tokens removed.
    """
    # Build the prompt with the same "Customer support: " prefix that the
    # training examples (CustomerSupportDataset) use. The earlier
    # "Customer: ... Response:" format was never seen during fine-tuning.
    prompt = f"Customer support: {input_text}"

    # Tokenize the prompt, truncating to the 512-token length used in training
    inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)

    # Generate a response with beam search
    outputs = model.generate(inputs, max_length=200, num_beams=4, no_repeat_ngram_size=2, early_stopping=True)

    # Decode the best sequence
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Function to save conversation to CSV
def save_conversation(input_text, response, log_path='conversation_log.csv'):
    """Append one (input, response) exchange to a CSV log file.

    A header row is written first when the file is empty or newly created.

    Args:
        input_text: the user's message.
        response: the reply that was generated for it.
        log_path: CSV file to append to; defaults to 'conversation_log.csv'
            in the current working directory.
    """
    # Explicit UTF-8 so non-ASCII user text is written the same way regardless
    # of the platform's default encoding; newline='' lets the csv module
    # control line endings.
    with open(log_path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file has no content yet.
        if file.tell() == 0:
            writer.writerow(["Input", "Response"])  # Header
        writer.writerow([input_text, response])  # Write the conversation row

# Define the Gradio interface.
# `fn=chat` captures the function object that exists when this cell runs;
# redefining `chat` in a later cell does not change what this interface calls.
interface = gr.Interface(fn=chat,
                         inputs="text",
                         outputs="text",
                         title="Customer Support Chat",
                         description="Chat with the fine-tuned model. It generates responses based on your input.")

# Launch the interface
interface.launch()



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d48a50963f22c13db4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Function to save conversation to CSV
def save_conversation(input_text, response, log_path='conversation_log.csv'):
    """Append one (input, response) exchange to a CSV log file.

    A header row is written first when the file is empty or newly created.

    Args:
        input_text: the user's message.
        response: the reply that was generated for it.
        log_path: CSV file to append to; defaults to 'conversation_log.csv'
            in the current working directory.
    """
    # Explicit UTF-8 so non-ASCII user text is written the same way regardless
    # of the platform's default encoding; newline='' lets the csv module
    # control line endings.
    with open(log_path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file has no content yet.
        if file.tell() == 0:
            writer.writerow(["Input", "Response"])  # Header
        writer.writerow([input_text, response])  # Write the conversation row

# Modify the chat function to save each conversation
def chat(input_text):
    """Generate a reply to ``input_text`` and append the exchange to the CSV log.

    Uses the module-level ``tokenizer`` and ``model`` and calls
    ``save_conversation`` for every request.

    NOTE: a ``gr.Interface`` built before this definition keeps calling the
    function object it was given at construction time; rebuild and relaunch
    the interface after running this cell for the logging to take effect.

    Args:
        input_text: the raw text typed by the user.

    Returns:
        The decoded model output as a string, with special tokens removed.
    """
    # Build the prompt with the same "Customer support: " prefix that the
    # training examples (CustomerSupportDataset) use. The earlier
    # "Customer: ... Response:" format was never seen during fine-tuning.
    prompt = f"Customer support: {input_text}"
    inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(inputs, max_length=200, num_beams=4, no_repeat_ngram_size=2, early_stopping=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Save conversation to file (the user's original text, not the prompt)
    save_conversation(input_text, response)

    return response