In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the dataset
df = pd.read_csv("Bitext_Sample_Customer_Service_Training_Dataset.csv")

# Display the first few rows to understand its structure
df.head()

Unnamed: 0,flags,utterance,category,intent
0,BM,I have problems with canceling an order,ORDER,cancel_order
1,BIM,how can I find information about canceling ord...,ORDER,cancel_order
2,B,I need help with canceling the last order,ORDER,cancel_order
3,BIP,could you help me cancelling the last order I ...,ORDER,cancel_order
4,B,problem with cancelling an order I made,ORDER,cancel_order


In [3]:
#Checking for missing values
df.isnull().sum()

flags        0
utterance    0
category     0
intent       0
dtype: int64

In [4]:
#Basic text cleaning
df['cleaned_utterance'] = df['utterance'].str.lower().str.replace('[^\w\s]', ' ').str.strip()

#Tokenize the text
df['tokenized_utterance'] = df['cleaned_utterance'].str.split()

#First few rows after cleaning and tokenization
df[['utterance', 'cleaned_utterance', 'tokenized_utterance']].head()

  df['cleaned_utterance'] = df['utterance'].str.lower().str.replace('[^\w\s]', ' ').str.strip()


Unnamed: 0,utterance,cleaned_utterance,tokenized_utterance
0,I have problems with canceling an order,i have problems with canceling an order,"[i, have, problems, with, canceling, an, order]"
1,how can I find information about canceling ord...,how can i find information about canceling orders,"[how, can, i, find, information, about, cancel..."
2,I need help with canceling the last order,i need help with canceling the last order,"[i, need, help, with, canceling, the, last, or..."
3,could you help me cancelling the last order I ...,could you help me cancelling the last order i ...,"[could, you, help, me, cancelling, the, last, ..."
4,problem with cancelling an order I made,problem with cancelling an order i made,"[problem, with, cancelling, an, order, i, made]"


In [5]:
nltk.download('stopwords')

#Stopwords removal
stop_words = set(stopwords.words('english'))
df['cleaned_utterance'] = df['cleaned_utterance'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

#TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_utterance']).toarray()

#Encode the intent labels
encoder = LabelEncoder()
y = encoder.fit_transform(df['intent'])

#Splitting the data into training (80%), validation (10%), and testing (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

#Output the shape of the datasets
(X_train.shape, X_val.shape, X_test.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


((6540, 641), (817, 641), (818, 641))

In [6]:
#Initialize the model
model = RandomForestClassifier()

#Train the model
model.fit(X_train, y_train)

#Validate the model
y_val_pred = model.predict(X_val)
print("Validation Set Metrics:")
print(classification_report(y_val, y_val_pred))

#Accuracy score for validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")

Validation Set Metrics:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        31
           1       1.00      0.97      0.98        30
           2       0.97      1.00      0.98        30
           3       1.00      1.00      1.00        30
           4       1.00      1.00      1.00        32
           5       1.00      1.00      1.00        30
           6       1.00      1.00      1.00        30
           7       1.00      1.00      1.00        30
           8       1.00      0.97      0.98        30
           9       1.00      1.00      1.00        30
          10       0.91      0.97      0.94        30
          11       1.00      1.00      1.00        30
          12       1.00      1.00      1.00        30
          13       1.00      1.00      1.00        30
          14       1.00      1.00      1.00        30
          15       1.00      1.00      1.00        32
          16       1.00      1.00      1.00        30
   

In [7]:
y_test_pred = model.predict(X_test)
print("Test Set Metrics:")
print(classification_report(y_test, y_test_pred))

Test Set Metrics:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        30
           1       1.00      1.00      1.00        31
           2       0.97      1.00      0.98        29
           3       0.97      1.00      0.98        30
           4       1.00      1.00      1.00        33
           5       1.00      1.00      1.00        29
           6       0.97      0.97      0.97        30
           7       1.00      1.00      1.00        30
           8       0.97      0.97      0.97        30
           9       1.00      1.00      1.00        29
          10       1.00      0.93      0.97        30
          11       1.00      0.97      0.98        30
          12       1.00      1.00      1.00        30
          13       1.00      1.00      1.00        30
          14       0.97      1.00      0.98        29
          15       1.00      1.00      1.00        33
          16       1.00      0.97      0.98        29
         

In [8]:
# Accuracy score for test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9890
