In [1]:
!pip install torch torchvision torchaudio



In [2]:
!pip install transformers datasets scikit-learn pandas matplotlib



In [3]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [5]:
import json

# Location of the raw intent dataset on the mounted Drive.
dataset_path = "/content/drive/MyDrive/Chatbot_Dataset/data_full.json"

# JSON text is UTF-8; state the encoding instead of relying on the locale default.
with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# The file holds six splits: train / val / test plus their out-of-scope
# counterparts oos_train / oos_val / oos_test.
print(dataset.keys())
print("Train samples:", len(dataset["train"]))
print("Val samples:", len(dataset["val"]))
print("Test samples:", len(dataset["test"]))


dict_keys(['oos_val', 'val', 'train', 'oos_test', 'test', 'oos_train'])
Train samples: 15000
Val samples: 3000
Test samples: 4500


In [7]:
import json, os, pandas as pd
from google.colab import drive
# Mount Drive again so this cell can run on its own; when Drive is already
# mounted the call only reports that and carries on.
drive.mount('/content/drive')

# <-- change this path if your folder name differs
dataset_path = "/content/drive/MyDrive/Chatbot_Dataset/data_full.json"

# JSON text is UTF-8; state the encoding instead of relying on the locale default.
with open(dataset_path, "r", encoding="utf-8") as f:
    raw = json.load(f)

# Try to infer the item schema automatically (list of dicts or list of lists)
def split_to_df(split_key, include_oos=False, data=None):
    """Convert one split of the raw intent dataset into a DataFrame.

    Each item of the split may be either a dict (text under one of the keys
    ``text`` / ``utterance`` / ``sentence`` / ``query``, label under
    ``intent`` / ``label``) or a ``[text, label]`` sequence.

    Args:
        split_key: Name of the split to read, e.g. ``"train"``.
        include_oos: When True, the items stored under ``f"oos_{split_key}"``
            (if that key exists) are appended with the fixed intent ``"oos"``.
        data: Mapping of split name -> list of items. Defaults to the
            module-level ``raw`` dict when omitted.

    Returns:
        A DataFrame with exactly the columns ``text`` and ``intent``. The
        columns are present even when the split is empty, so downstream
        column access keeps working.

    Raises:
        KeyError: If ``split_key`` is not present in the data.
    """
    source = raw if data is None else data

    def _text_of(item):
        # Dict items: take the first non-empty value among the known text keys.
        if isinstance(item, dict):
            return (
                item.get("text")
                or item.get("utterance")
                or item.get("sentence")
                or item.get("query")
            )
        return item[0]

    def _label_of(item):
        if isinstance(item, dict):
            return item.get("intent") or item.get("label")
        return item[1]

    rows = [
        {"text": _text_of(item), "intent": _label_of(item)}
        for item in source[split_key]
    ]
    if include_oos:
        # Out-of-scope items are always labelled "oos", whatever label they carry.
        rows.extend(
            {"text": _text_of(item), "intent": "oos"}
            for item in source.get(f"oos_{split_key}", [])
        )
    # Build the frame once; explicit columns keep the schema stable for empty input.
    return pd.DataFrame(rows, columns=["text", "intent"])

# Toggle: when True, out-of-scope utterances are added under the extra "oos" intent
USE_OOS = True

train_df, val_df, test_df = (
    split_to_df(split_name, include_oos=USE_OOS)
    for split_name in ("train", "val", "test")
)

print(train_df.head(), train_df.intent.nunique(), train_df.shape)

# Persist every split as CSV under ./data (the layout the training cell reads)
os.makedirs("data", exist_ok=True)
csv_targets = {
    "data/train.csv": train_df,
    "data/val.csv": val_df,
    "data/test.csv": test_df,
}
for csv_path, frame in csv_targets.items():
    frame.to_csv(csv_path, index=False)

# Label ids follow the alphabetical order of the training intents, so the
# mapping is the same on every run
labels = sorted(train_df["intent"].unique())
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

import json, pathlib
pathlib.Path("data").mkdir(parents=True, exist_ok=True)
with open("data/label_map.json","w") as label_file:
    json.dump(label2id, label_file, indent=2)

# Cell output: number of intents and the first ten names
len(labels), list(labels)[:10]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                                text     intent
0  what expression would i use to say i love you ...  translate
1  can you tell me how to say 'i do not speak muc...  translate
2  what is the equivalent of, 'life is good' in f...  translate
3  tell me how to say, 'it is a beautiful morning...  translate
4  if i were mongolian, how would i say that i am...  translate 151 (15100, 2)


(151,
 ['accept_reservations',
  'account_blocked',
  'alarm',
  'application_status',
  'apr',
  'are_you_a_bot',
  'balance',
  'bill_balance',
  'bill_due',
  'book_flight'])

In [19]:
import json, numpy as np, pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Read the three CSV splits with the Hugging Face "csv" loader
csv_splits = {"train": "data/train.csv", "validation": "data/val.csv", "test": "data/test.csv"}
dataset = load_dataset("csv", data_files=csv_splits)

# Restore the intent-name -> id mapping from disk and derive its inverse
with open("data/label_map.json") as label_file:
    label2id = json.load(label_file)
id2label = {idx: name for name, idx in label2id.items()}
num_labels = len(label2id)

# Pretrained checkpoint name; the tokenizer is loaded from it here
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    """Tokenize a batch of utterances and attach integer class ids.

    Reads the ``"text"`` and ``"intent"`` columns of ``batch``. Texts are
    truncated and padded to a fixed length of 128 tokens, and every intent
    name is looked up in the module-level ``label2id`` mapping (a name that
    is missing from the mapping raises ``KeyError``).

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding the ids.
    """
    encoding = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    intent_ids = [label2id[intent_name] for intent_name in batch["intent"]]
    encoding["labels"] = intent_ids
    return encoding

# Tokenize every split in batches; the raw text/intent columns are dropped so
# only the tokenizer output and the integer labels remain
encoded = dataset.map(
    preprocess,
    batched=True,
    remove_columns=["text","intent"],
)

# Sequence-classification model sized to the label map, with both label
# mappings passed along to the model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# Metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        Dict with ``"accuracy"`` and the weighted-average F1 under
        ``"f1_weighted"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1_weighted"] = f1_score(gold, predicted, average="weighted")
    return metrics

# Training args
# NOTE(review): `report_to` is not set, and the recorded run stopped at an
# interactive wandb login prompt. Pass report_to="none" if W&B logging is not
# wanted, so the notebook can run unattended — confirm the intent.
args = TrainingArguments(
    output_dir="results",
    eval_strategy="epoch",            # evaluate at the end of every epoch
    save_strategy="epoch",            # checkpoint on the same schedule as evaluation
    logging_strategy="steps",
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,      # reload the best checkpoint when training ends
    metric_for_best_model="f1_weighted",
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class` replaces it.
    processing_class=tokenizer
)

trainer.train()

Map:   0%|          | 0/3100 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcristianoadrian6984[0m ([33mamitesh1[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,2.361,2.039554,0.815161,0.792499
2,0.9507,0.875631,0.904839,0.894095
3,0.5596,0.649611,0.91871,0.90938


TrainOutput(global_step=2832, training_loss=1.8472670188731393, metrics={'train_runtime': 729.4423, 'train_samples_per_second': 62.102, 'train_steps_per_second': 3.882, 'total_flos': 1504179608601600.0, 'train_loss': 1.8472670188731393, 'epoch': 3.0})

In [20]:
model.save_pretrained("/content/chatbot_model")
tokenizer.save_pretrained("/content/chatbot_model")

# Backup to Google Drive
!cp -r /content/chatbot_model /content/drive/MyDrive/


In [21]:
from transformers import pipeline

# Build a text-classification pipeline from the locally saved model and tokenizer
model_dir = "/content/chatbot_model"
inference_pipeline = pipeline("text-classification", model=model_dir, tokenizer=model_dir)

# A few hand-written utterances for a quick smoke test
texts = [
    "book me a flight to Delhi",
    "cancel my train ticket",
    "what’s the weather in Mumbai",
    "reset my account password"
]

# Classify one utterance at a time and print it next to the pipeline result
for utterance in texts:
    prediction = inference_pipeline(utterance)
    print(utterance, " --> ", prediction)


Device set to use cuda:0


book me a flight to Delhi  -->  [{'label': 'book_flight', 'score': 0.8328499794006348}]
cancel my train ticket  -->  [{'label': 'cancel', 'score': 0.12397707253694534}]
what’s the weather in Mumbai  -->  [{'label': 'weather', 'score': 0.7945302724838257}]
reset my account password  -->  [{'label': 'freeze_account', 'score': 0.14758086204528809}]


In [23]:
# Persist the model, then the tokenizer, directly into a Google Drive folder
output_dir = "/content/drive/MyDrive/multi_intent_model"

for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print("Model saved at:", output_dir)


Model saved at: /content/drive/MyDrive/multi_intent_model
