In [1]:
import pandas as pd

In [2]:
# Load the Bitext customer-support sample: one row per instruction/response
# pair, labelled with a category and an intent.
dataset = pd.read_csv(
    filepath_or_buffer="Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"
)

In [3]:
# Preview the first three rows to eyeball the column contents.
dataset.head(n=3)

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...


In [4]:
# Summarise the frame: row count, per-column non-null counts, dtypes and memory use.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   flags        26872 non-null  object
 1   instruction  26872 non-null  object
 2   category     26872 non-null  object
 3   intent       26872 non-null  object
 4   response     26872 non-null  object
dtypes: object(5)
memory usage: 1.0+ MB


In [5]:
# Missing-value count for each column (summing the boolean mask down the rows).
dataset.isnull().sum(axis=0)

flags          0
instruction    0
category       0
intent         0
response       0
dtype: int64

In [6]:
# Number of rows that exactly repeat an earlier row (first occurrence is not counted).
dataset.duplicated(keep="first").sum()

np.int64(0)

In [6]:
# Remove the `flags` column; none of the modelling cells shown in this notebook read it.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# makes this cell idempotent: re-running it after `flags` is already gone no
# longer raises KeyError.
dataset = dataset.drop(columns=["flags"], errors="ignore")

In [7]:
# Confirm which columns remain after dropping `flags`.
dataset.columns

Index(['instruction', 'category', 'intent', 'response'], dtype='object')

In [8]:
import re

def clean_text(text):
    """Normalise a piece of free text for vectorisation.

    Steps, in order: lowercase, remove URLs (any run starting with ``http``),
    delete every character that is not an ASCII letter, digit or whitespace,
    then trim leading/trailing whitespace.

    Parameters
    ----------
    text : str or None or float
        The raw text. Missing values (``None`` or a float NaN, which is what
        pandas yields for empty CSV cells) are treated as empty text; any
        other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input.
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN; guard so .apply() on a column
        # with missing cells does not die with AttributeError on .lower().
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    # Input is already lowercased, so a-z covers every ASCII letter that can remain.
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return text.strip()

# Normalise both free-text columns element by element with clean_text.
dataset["instruction"] = dataset["instruction"].map(clean_text)
dataset["response"] = dataset["response"].map(clean_text)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [10]:
# Features are the cleaned customer instructions; the target is the intent label.
X, y = dataset["instruction"], dataset["intent"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Unigram + bigram TF-IDF features, capped at the 5000 most frequent terms.
# The vocabulary is learned from the training split only and then reused on the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic regression with an iteration cap of 300.
clf = LogisticRegression(max_iter=300)
clf.fit(X_train_tfidf, y_train)

# Score on the held-out split: overall accuracy (as a percentage) and per-class metrics.
y_pred = clf.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 99.31162790697674
                          precision    recall  f1-score   support

            cancel_order       0.99      0.98      0.99       187
            change_order       0.98      0.98      0.98       187
 change_shipping_address       1.00      1.00      1.00       216
  check_cancellation_fee       1.00      1.00      1.00       199
           check_invoice       0.96      0.99      0.98       192
   check_payment_methods       1.00      1.00      1.00       206
     check_refund_policy       1.00      1.00      1.00       200
               complaint       1.00      1.00      1.00       203
contact_customer_service       1.00      0.99      0.99       208
     contact_human_agent       0.99      1.00      0.99       201
          create_account       1.00      0.99      0.99       217
          delete_account       0.99      0.99      0.99       178
        delivery_options       1.00      1.00      1.00       218
         delivery_period   

In [11]:
# Distinct category labels, in order of first appearance.
dataset.loc[:, "category"].unique()


array(['ORDER', 'SHIPPING', 'CANCEL', 'INVOICE', 'PAYMENT', 'REFUND',
       'FEEDBACK', 'CONTACT', 'ACCOUNT', 'DELIVERY', 'SUBSCRIPTION'],
      dtype=object)

In [12]:
# Distinct intent labels, in order of first appearance.
dataset.loc[:, "intent"].unique()


array(['cancel_order', 'change_order', 'change_shipping_address',
       'check_cancellation_fee', 'check_invoice', 'check_payment_methods',
       'check_refund_policy', 'complaint', 'contact_customer_service',
       'contact_human_agent', 'create_account', 'delete_account',
       'delivery_options', 'delivery_period', 'edit_account',
       'get_invoice', 'get_refund', 'newsletter_subscription',
       'payment_issue', 'place_order', 'recover_password',
       'registration_problems', 'review', 'set_up_shipping_address',
       'switch_account', 'track_order', 'track_refund'], dtype=object)

In [14]:
def train_text_classifier(texts, labels, report_name):
    """Train and evaluate a TF-IDF + logistic-regression text classifier.

    The category and intent models use exactly the same recipe (80/20 split
    with seed 42, unigram+bigram TF-IDF capped at 5000 features, logistic
    regression with max_iter=300), so the recipe lives here once instead of
    being copy-pasted per label column.

    Parameters
    ----------
    texts : sequence of str
        Input utterances (a DataFrame column in this notebook).
    labels : sequence
        Target label for each utterance, aligned with ``texts``.
    report_name : str
        Prefix for the printed accuracy line, e.g. ``"Category"``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, vectorizer, X_train_tfidf,
        X_test_tfidf, clf, y_pred)`` so callers can keep every intermediate
        they previously had as a notebook variable.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Fit the vocabulary on the training split only, then reuse it for the test split.
    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    clf = LogisticRegression(max_iter=300)
    clf.fit(X_train_tfidf, y_train)

    y_pred = clf.predict(X_test_tfidf)
    print(f"{report_name} Accuracy:", accuracy_score(y_test, y_pred)*100)
    print(classification_report(y_test, y_pred))

    return (
        X_train, X_test, y_train, y_test,
        vectorizer, X_train_tfidf, X_test_tfidf,
        clf, y_pred,
    )


# Category Model

X_cat = dataset["instruction"]
y_cat = dataset["category"]

(
    X_train_cat, X_test_cat, y_train_cat, y_test_cat,
    vectorizer_cat, X_train_cat_tfidf, X_test_cat_tfidf,
    clf_cat, y_pred_cat,
) = train_text_classifier(X_cat, y_cat, "Category")

#  Intent Model

X_intent = dataset["instruction"]
y_intent = dataset["intent"]

(
    X_train_intent, X_test_intent, y_train_intent, y_test_intent,
    vectorizer_intent, X_train_intent_tfidf, X_test_intent_tfidf,
    clf_intent, y_pred_intent,
) = train_text_classifier(X_intent, y_intent, "Intent")


Category Accuracy: 99.70232558139534
              precision    recall  f1-score   support

     ACCOUNT       0.99      1.00      0.99      1160
      CANCEL       1.00      1.00      1.00       199
     CONTACT       1.00      1.00      1.00       409
    DELIVERY       0.99      1.00      1.00       389
    FEEDBACK       1.00      1.00      1.00       427
     INVOICE       1.00      1.00      1.00       407
       ORDER       1.00      1.00      1.00       763
     PAYMENT       1.00      0.99      0.99       410
      REFUND       1.00      1.00      1.00       601
    SHIPPING       1.00      1.00      1.00       444
SUBSCRIPTION       1.00      0.99      0.99       166

    accuracy                           1.00      5375
   macro avg       1.00      1.00      1.00      5375
weighted avg       1.00      1.00      1.00      5375

Intent Accuracy: 99.31162790697674
                          precision    recall  f1-score   support

            cancel_order       0.99      0.98   

In [46]:
# Check intent predictions from the trained model on a few sample utterances.
sample = ["i am facing issue in payment transaction", "how to delete my account", "i want to order item"]

# Apply the same clean_text preprocessing the training instructions went through.
# Without it, raw input is tokenised differently from the training data
# (e.g. clean_text turns "can't" into "cant", while the raw string would be
# split at the apostrophe), which silently degrades predictions.
pred_intent = clf.predict(vectorizer.transform([clean_text(utterance) for utterance in sample]))

# Display results
for text, intent in zip(sample, pred_intent):
    print(f"Text: {text} -> Predicted Intent: {intent}")

Text: i am facing issue in payment transaction -> Predicted Intent: payment_issue
Text: how to delete my account -> Predicted Intent: delete_account
Text: i want to order item -> Predicted Intent: place_order


In [35]:
import joblib

# Persist the fitted TF-IDF vectorizer and the intent classifier in a single
# file so the two are always loaded as a matching pair.
joblib.dump(
    value={"vectorizer": vectorizer_intent, "model": clf_intent},
    filename="intent_model.pkl",
)

print("Intent model saved as intent_model.pkl")


Intent model saved as intent_model.pkl


In [40]:
# Load model and vectorizer.
# NOTE: joblib.load is pickle-based and can execute arbitrary code; only load
# artifacts produced by this notebook or another trusted source.
data = joblib.load("intent_model.pkl")
vectorizer_intent = data["vectorizer"]
clf_intent = data["model"]

# Example predictions with the saved model and vectorizer
sample_texts = ["i want to change my account", "i forget my password so tell me how reset my password", "i wan to cancel subscription"]

# Apply the same clean_text preprocessing used on the training instructions,
# then convert the text into TF-IDF vectors. Skipping the cleaning step would
# tokenise raw input differently from the data the vectorizer was fitted on.
sample_tfidf = vectorizer_intent.transform([clean_text(utterance) for utterance in sample_texts])

# Predict intents for all examples
predicted_intents = clf_intent.predict(sample_tfidf)

# Display results
for text, intent in zip(sample_texts, predicted_intents):
    print(f"Text: {text} -> Predicted Intent: {intent}")


Text: i want to change my account -> Predicted Intent: edit_account
Text: i forget my password so tell me how reset my password -> Predicted Intent: recover_password
Text: i wan to cancel subscription -> Predicted Intent: newsletter_subscription


In [None]:
#Ended