In [None]:
import torch

# Report which accelerator the runtime provides. The device name is queried
# only when CUDA is present: torch.cuda.get_device_name(0) raises on a
# CPU-only runtime, which would abort the notebook in its very first cell.
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)
if cuda_available:
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("GPU Name: none (running on CPU)")


CUDA Available: True
GPU Name: Tesla T4


In [None]:
!pip install transformers datasets scikit-learn




In [None]:
import pandas as pd

In [None]:
# Read the labelled dataset into `df`, the frame every later cell works on.
DATASET_CSV = "/content/dataset_1.csv"
df = pd.read_csv(DATASET_CSV)

In [None]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# ----------------------------
# LOAD DATA
# (`df` is read from the CSV in the previous cell)
# ----------------------------


# ----------------------------
# INTENT MODEL
# ----------------------------

# Give every distinct intent an integer id, numbered in order of first
# appearance in the frame, and keep both lookup directions.
intent_labels = df['intent'].unique().tolist()
intent_label2id = dict(zip(intent_labels, range(len(intent_labels))))
intent_id2label = dict(enumerate(intent_labels))

# Store the integer id next to the original intent string.
df['intent_label'] = df['intent'].map(intent_label2id)

# Hold out 20% of the rows for validation. The split is stratified on the
# intent id so both partitions keep the same class proportions, consistent
# with the stratified split used for the sentiment model.
# Stratification needs at least two rows for every intent.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['intent_label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['intent_label'].tolist()
)

# DistilBERT tokenizer; the sentiment cell further down reuses this object.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Same tokenization options for both partitions.
tokenize_kwargs = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

# Combine the tokenizer output with the integer labels, then wrap each
# partition in a `datasets.Dataset` for the Trainer.
train_columns = {**train_encodings, "labels": train_labels}
train_dataset = Dataset.from_dict(train_columns)

val_columns = {**val_encodings, "labels": val_labels}
val_dataset = Dataset.from_dict(val_columns)

# DistilBERT classifier with one output per intent. The id<->label maps are
# stored in the model config so the checkpoint saved to "intent_model" reports
# intent names rather than generic LABEL_<n> placeholders, matching how the
# sentiment model is configured.
intent_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(intent_labels),
    id2label=intent_id2label,
    label2id=intent_label2id
)



# Hyper-parameters for the intent run. Checkpointing is switched off
# (save_strategy="no"); the final weights are saved explicitly after training.
intent_training_options = dict(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    logging_steps=100,
    save_strategy="no",
)
training_args = TrainingArguments(**intent_training_options)

def compute_metrics(eval_pred):
    """Score a batch of predictions for the Trainer.

    Args:
        eval_pred: a (logits, labels) pair; the predicted class of each row
            is the argmax over the last axis of the logits.

    Returns:
        dict with "accuracy" and the class-weighted "f1".
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_ids)
    weighted_f1 = f1_score(references, predicted_ids, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Fine-tune the intent classifier.
intent_trainer = Trainer(
    model=intent_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

intent_trainer.train()

# The training arguments configure no evaluation schedule, so the validation
# split is never scored during training. Evaluate it once here so accuracy and
# F1 on the held-out rows are reported.
intent_eval_results = intent_trainer.evaluate()
print(intent_eval_results)

# Persist the fine-tuned intent weights and the tokenizer side by side so the
# directory can be loaded as a single checkpoint.
INTENT_MODEL_DIR = "intent_model"
intent_model.save_pretrained(INTENT_MODEL_DIR)
tokenizer.save_pretrained(INTENT_MODEL_DIR)





In [None]:
# ----------------------------
# SENTIMENT MODEL (IMPROVED VERSION)
# ----------------------------

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from transformers import (
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

# ----------------------------
# LABEL MAPPING
# ----------------------------

# Number the distinct sentiment values in order of first appearance and keep
# both lookup directions.
sentiment_labels = df['sentiment'].unique().tolist()
sent_label2id = dict(zip(sentiment_labels, range(len(sentiment_labels))))
sent_id2label = dict(enumerate(sentiment_labels))

# Integer target column used for training.
df['sent_label'] = df['sentiment'].map(sent_label2id)

# ----------------------------
# 3-WAY SPLIT (Train / Val / Test)
# ----------------------------

# Step 1: 70% train, 30% held out, stratified on the sentiment id.
train_texts_s, temp_texts_s, train_labels_s, temp_labels_s = train_test_split(
    df['text'], df['sent_label'],
    test_size=0.3, random_state=42, stratify=df['sent_label'],
)

# Step 2: cut the held-out 30% in half to get validation and test partitions.
val_texts_s, test_texts_s, val_labels_s, test_labels_s = train_test_split(
    temp_texts_s, temp_labels_s,
    test_size=0.5, random_state=42, stratify=temp_labels_s,
)

# Report how many rows ended up in each partition.
for caption, partition in (
    ("Sentiment Train size:", train_texts_s),
    ("Sentiment Validation size:", val_texts_s),
    ("Sentiment Test size:", test_texts_s),
):
    print(caption, len(partition))

# ----------------------------
# TOKENIZATION
# ----------------------------

# Tokenize the three partitions in train -> val -> test order, using the
# tokenizer created in the intent cell.
train_enc_s, val_enc_s, test_enc_s = (
    tokenizer(list(split_texts), truncation=True, padding=True)
    for split_texts in (train_texts_s, val_texts_s, test_texts_s)
)

# Pair each partition's encodings with its labels and wrap the result in a
# `datasets.Dataset` for the Trainer.
train_dataset_s, val_dataset_s, test_dataset_s = (
    Dataset.from_dict({**encodings, "labels": list(split_labels)})
    for encodings, split_labels in (
        (train_enc_s, train_labels_s),
        (val_enc_s, val_labels_s),
        (test_enc_s, test_labels_s),
    )
)

# ----------------------------
# LOAD MODEL
# ----------------------------

# DistilBERT classifier for sentiment; the label maps go into the config
# together with the number of classes.
sentiment_head_config = dict(
    num_labels=len(sentiment_labels),
    id2label=sent_id2label,
    label2id=sent_label2id,
)
sentiment_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    **sentiment_head_config
)

# ----------------------------
# TRAINING ARGUMENTS
# ----------------------------

# Training configuration for the sentiment run. It is bound to
# `training_args_s`, the name the sentiment Trainer below reads, so the cell
# works on a fresh kernel and no longer overwrites the intent run's
# `training_args`.
# Evaluation and checkpointing both happen once per epoch so that
# load_best_model_at_end can restore the checkpoint with the best "f1".
training_args_s = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",
    logging_steps=50
)


# ----------------------------
# METRICS
# ----------------------------

def compute_metrics(eval_pred):
    """Return accuracy and class-weighted F1 for a (logits, labels) pair.

    The predicted class of each row is the argmax over the last logits axis.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return dict(
        accuracy=accuracy_score(labels, predicted),
        f1=f1_score(labels, predicted, average="weighted"),
    )

# ----------------------------
# TRAINER
# ----------------------------

# Early stopping with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

sent_trainer = Trainer(
    model=sentiment_model,
    args=training_args_s,
    train_dataset=train_dataset_s,
    eval_dataset=val_dataset_s,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# ----------------------------
# TRAIN
# ----------------------------

sent_trainer.train()

# ----------------------------
# FINAL TEST EVALUATION (IMPORTANT)
# ----------------------------

# Score the test partition, which took no part in training or validation,
# and print the resulting metrics.
print("\nEvaluating Sentiment Model on UNSEEN Test Set:")
test_results_s = sent_trainer.evaluate(eval_dataset=test_dataset_s)
print(test_results_s)

# ----------------------------
# SAVE MODEL
# ----------------------------

# Write the sentiment weights and the tokenizer into one directory so it can
# be loaded as a single checkpoint.
SENTIMENT_MODEL_DIR = "sentiment_model"
sentiment_model.save_pretrained(SENTIMENT_MODEL_DIR)
tokenizer.save_pretrained(SENTIMENT_MODEL_DIR)


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Step,Training Loss
100,0.234138
200,0.006104
300,0.002758
400,0.00171
500,0.001211
600,0.000934
700,0.000757
800,0.000655
900,0.000594
1000,0.000565


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('sentiment_model/tokenizer_config.json', 'sentiment_model/tokenizer.json')

In [None]:
# Bundle each saved model directory into a zip archive (recursively).
!zip -r intent_model.zip intent_model
!zip -r sentiment_model.zip sentiment_model


  adding: intent_model/ (stored 0%)
  adding: intent_model/tokenizer_config.json (deflated 42%)
  adding: intent_model/config.json (deflated 54%)
  adding: intent_model/model.safetensors (deflated 8%)
  adding: intent_model/tokenizer.json (deflated 71%)
  adding: sentiment_model/ (stored 0%)
  adding: sentiment_model/tokenizer_config.json (deflated 42%)
  adding: sentiment_model/config.json (deflated 52%)
  adding: sentiment_model/model.safetensors (deflated 8%)
  adding: sentiment_model/tokenizer.json (deflated 71%)


In [None]:
# Preview the first rows only: enough to confirm the added `intent_label` and
# `sent_label` columns without displaying the whole frame.
df.head()

Unnamed: 0,text,intent,sentiment,clean_text,clean_text_st,vader_sentiment,label,intent_label,sent_label
0,Kindly assist me. I am experiencing packet los...,Technical,Neutral,kindly assist me i am experiencing packet loss...,kindly assist experiencing packet loss issue f...,Neutral,0,0,0
1,I am satisfied with the service. I was charged...,Billing,Positive,i am satisfied with the service i was charged ...,satisfied service charged twice g service indi...,Positive,1,1,1
2,I am unhappy with the service. I am experienci...,Technical,Negative,i am unhappy with the service i am experiencin...,unhappy service experiencing connection drops ...,Negative,0,0,2
3,This is unacceptable. I was overcharged $40 fo...,Billing,Negative,this is unacceptable i was overcharged for my ...,unacceptable overcharged sim card india center,Negative,1,1,2
4,This is unacceptable. I want to report long wa...,Complaint,Negative,this is unacceptable i want to report long wai...,unacceptable want report long waiting time reg...,Neutral,2,2,2
...,...,...,...,...,...,...,...,...,...
19995,Could you help me? I am experiencing packet lo...,Technical,Neutral,could you help me i am experiencing packet los...,help experiencing packet loss issue internatio...,Neutral,0,0,0
19996,Glad this was resolved. I am experiencing unst...,Technical,Positive,glad this was resolved i am experiencing unsta...,glad resolved experiencing unstable connection...,Neutral,0,0,1
19997,I am very disappointed. I requested a refund o...,Refund,Negative,i am very disappointed i requested a refund of...,disappointed requested refund router purchased...,Negative,4,4,2
19998,Thank you for your assistance. I was given wro...,Billing,Positive,thank you for your assistance i was given wron...,thank assistance given wrong invoice g service...,Negative,1,1,1


In [None]:
# List the files written to the intent model directory.
!ls intent_model


config.json  model.safetensors	tokenizer_config.json  tokenizer.json
