# INTENT CLASSIFICATION USING ROBERTA
### https://huggingface.co/transformers/model_doc/roberta.html
### by/ AHMED ESSAM Abd Elgwad
### [https://www.linkedin.com/in/ahmed-essam-045161204/](https://www.linkedin.com/in/ahmedessamabdelatif/)

## Importing the libraries

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers.modeling_utils import PreTrainedModel
import torch
from torch.utils.data import Dataset

# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

from transformers import (
    RobertaTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    PreTrainedModel  
)



ModuleNotFoundError: No module named 'wordcloud'

## Reading the jason file (data)

In [None]:



# Step 1: Load and Preprocess JSON Data
def load_and_preprocess_data(json_file_path):
    with open(json_file_path, 'r') as file:
        data = json.load(file)
    
    rows = []
    for intent in data['intents']:
        texts = intent['text']  # List of text inputs
        for text in texts:
            rows.append({
                'Intent': intent['intent'],
                'Text': text
            })
    
    # Create DataFrame
    df = pd.DataFrame(rows)
    
    # Encode intents to numeric labels
    intent_labels = df['Intent'].unique()
    label2id = {label: idx for idx, label in enumerate(intent_labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    df['Label'] = df['Intent'].map(label2id)
    
    return df, label2id, id2label


     Intent      Text
0  Greeting        Hi
1  Greeting        Hi
2  Greeting        Hi
3  Greeting  Hi there
4  Greeting  Hi there


In [None]:
# Load Data and Split into Train/Test
json_file_path = 'D:/archive/Intent.json'  
df, label2id, id2label = load_and_preprocess_data(json_file_path)


## EDA on the intent and text features 

In [2]:
df.shape

(1918, 2)

In [None]:


# Clean Text for Analysis
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return tokens

# EDA Functions
def intent_distribution(df):
    intent_counts = df['Intent'].value_counts()
    print("Intent Distribution:")
    print(intent_counts)
    
    # Plot bar chart
    plt.figure(figsize=(10, 6))
    sns.barplot(x=intent_counts.index, y=intent_counts.values, palette='viridis')
    plt.title('Distribution of Intents')
    plt.xlabel('Intent')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()



In [None]:
def most_frequent_words(df, top_n=10):
    all_tokens = []
    for text in df['Text']:
        tokens = clean_text(text)
        all_tokens.extend(tokens)
    
    word_freq = Counter(all_tokens)
    common_words = word_freq.most_common(top_n)
    print(f"Top {top_n} Most Frequent Words:")
    for word, freq in common_words:
        print(f"{word}: {freq}")
    
    # Plot bar chart for frequent words
    words, freqs = zip(*common_words)
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(words), y=list(freqs), palette='magma')
    plt.title(f'Top {top_n} Most Frequent Words in Text')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:

def word_cloud(df):
    # Combine all text into one string
    all_text = ' '.join(df['Text'].apply(lambda x: ' '.join(clean_text(x))))
    
    # Generate word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white', 
                          max_words=100, min_font_size=10).generate(all_text)
    
    # Plot word cloud
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Text Column')
    plt.show()


In [None]:

def text_length_distribution(df):
    # Compute length of each text (in words)
    df['Text_Length'] = df['Text'].apply(lambda x: len(word_tokenize(x)))
    
    # Plot histogram
    plt.figure(figsize=(10, 6))
    sns.histplot(df['Text_Length'], bins=20, kde=True, color='blue')
    plt.title('Distribution of Text Lengths (in Words)')
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()



In [None]:
def bigram_frequency(df, top_n=10):
    # Generate bigrams (2-word pairs)
    all_bigrams = []
    for text in df['Text']:
        tokens = clean_text(text)
        bigrams = list(nltk.bigrams(tokens))
        all_bigrams.extend(bigrams)
    
    # Count bigram frequencies
    bigram_freq = Counter(all_bigrams)
    common_bigrams = bigram_freq.most_common(top_n)
    print(f"Top {top_n} Most Frequent Bigrams:")
    for bigram, freq in common_bigrams:
        print(f"{' '.join(bigram)}: {freq}")
    
    # Plot bar chart for bigrams
    bigrams, freqs = zip(*common_bigrams)
    bigram_labels = [' '.join(bigram) for bigram in bigrams]
    plt.figure(figsize=(10, 6))
    sns.barplot(x=bigram_labels, y=list(freqs), palette='coolwarm')
    plt.title(f'Top {top_n} Most Frequent Bigrams in Text')
    plt.xlabel('Bigrams')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()



In [None]:
        # Perform EDA
print("=== NLP EDA ===")

intent_distribution(df)          # Intent distribution and bar plot


In [None]:
    # Perform EDA
print("=== NLP EDA ===")

most_frequent_words(df, top_n=10)  # Most frequent words and bar plot


In [None]:
print("=== NLP EDA ===")
    
word_cloud(df)                   # Word cloud


In [None]:
print("=== NLP EDA ===")
    
text_length_distribution(df)     # Text length distribution


In [None]:
print("=== NLP EDA ===")
    
bigram_frequency(df, top_n=10)   # Bigram frequency and bar plot

## Initialize the model 

In [None]:

print("Trainer is working.")


class IntentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenize text
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


  from .autonotebook import tqdm as notebook_tqdm


Trainer is working.


In [None]:


# Split into training and testing sets (80-20 split)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Label'])



In [None]:



# Reset indices
train_texts, train_labels = train_df['Text'].values, train_df['Label'].values
test_texts, test_labels = test_df['Text'].values, test_df['Label'].values

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Create Datasets
train_dataset = IntentDataset(train_texts, train_labels, tokenizer)
test_dataset = IntentDataset(test_texts, test_labels, tokenizer)






Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
id2label

{0: 'Greeting',
 1: 'GreetingResponse',
 2: 'CourtesyGreeting',
 3: 'CourtesyGreetingResponse',
 4: 'CurrentHumanQuery',
 5: 'NameQuery',
 6: 'RealNameQuery',
 7: 'TimeQuery',
 8: 'Thanks',
 9: 'NotTalking2U',
 10: 'UnderstandQuery',
 11: 'Shutup',
 12: 'Swearing',
 13: 'GoodBye',
 14: 'CourtesyGoodBye',
 15: 'WhoAmI',
 16: 'Clever',
 17: 'Gossip',
 18: 'Jokes',
 19: 'PodBayDoor',
 20: 'PodBayDoorResponse',
 21: 'SelfAware'}

In [9]:
label2id

{'Greeting': 0,
 'GreetingResponse': 1,
 'CourtesyGreeting': 2,
 'CourtesyGreetingResponse': 3,
 'CurrentHumanQuery': 4,
 'NameQuery': 5,
 'RealNameQuery': 6,
 'TimeQuery': 7,
 'Thanks': 8,
 'NotTalking2U': 9,
 'UnderstandQuery': 10,
 'Shutup': 11,
 'Swearing': 12,
 'GoodBye': 13,
 'CourtesyGoodBye': 14,
 'WhoAmI': 15,
 'Clever': 16,
 'Gossip': 17,
 'Jokes': 18,
 'PodBayDoor': 19,
 'PodBayDoorResponse': 20,
 'SelfAware': 21}

## Training the model

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=100,              # Number of epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    warmup_steps=500,                # Warmup steps
    weight_decay=0.01,               # Weight decay
    logging_dir='./logs',            # Directory for logs
    logging_steps=10,                # Log every 10 steps
    eval_strategy='epoch',     # Evaluate at the end of each epoch
    save_strategy='epoch',           # Save model at the end of each epoch
    load_best_model_at_end=True,     # Load the best model based on metric
    metric_for_best_model='accuracy' # Metric to optimize
)

# Step 7: Define Compute Metrics Function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    report = classification_report(labels, preds, target_names=label2id.keys(), output_dict=True)
    return {
        'accuracy': accuracy,
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1': report['weighted avg']['f1-score']
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the Model
print("Starting training...")
trainer.train()
print("Training completed.")

# Evaluate the Model
print("Evaluating model...")
eval_results = trainer.evaluate()
print("Evaluation results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")

# Save the Model and Tokenizer
model.save_pretrained('./intent_classifier_model')
tokenizer.save_pretrained('./intent_classifier_model')
print("Model and tokenizer saved to './intent_classifier_model'")



Starting training...




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0997,3.102257,0.034483,0.001189,0.034483,0.002299
2,3.0933,3.100822,0.034483,0.001189,0.034483,0.002299
3,3.0948,3.09862,0.034483,0.001189,0.034483,0.002299
4,3.1161,3.09393,0.034483,0.001189,0.034483,0.002299
5,3.0845,3.083192,0.034483,0.001189,0.034483,0.002299
6,3.0137,3.044394,0.068966,0.01,0.068966,0.016446
7,2.9764,2.78488,0.482759,0.399687,0.482759,0.409195
8,2.6095,2.445746,0.586207,0.479885,0.586207,0.5
9,2.345,2.068279,0.62069,0.508621,0.62069,0.532184
10,1.8602,1.738625,0.724138,0.623563,0.724138,0.655172


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Training completed.
Evaluating model...




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation results:
eval_loss: 0.5974
eval_accuracy: 0.8966
eval_precision: 0.8621
eval_recall: 0.8966
eval_f1: 0.8736
eval_runtime: 4.8491
eval_samples_per_second: 5.9800
eval_steps_per_second: 0.8250
epoch: 100.0000
Model and tokenizer saved to './intent_classifier_model'


## Testing the model

In [None]:
# Make Predictions
def predict_intent(text, model, tokenizer, id2label, max_length=128):
    model.eval()
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask']
        )
        logits = outputs.logits
        predicted_id = torch.argmax(logits, dim=1).item()
    
    return id2label[predicted_id]



Sample text: 'Hello there'
Predicted intent: Greeting


In [12]:
# Example prediction
sample_text = "Hello there"
predicted_intent = predict_intent(sample_text, model, tokenizer, id2label)
print(f"Sample text: '{sample_text}'")
print(f"Predicted intent: {predicted_intent}")

Sample text: 'Hello there'
Predicted intent: Greeting


In [13]:
# Example prediction
sample_text = "That's helpful"
predicted_intent = predict_intent(sample_text, model, tokenizer, id2label)
print(f"Sample text: '{sample_text}'")
print(f"Predicted intent: {predicted_intent}")

Sample text: 'That's helpful'
Predicted intent: Thanks
