In [18]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [2]:
data = pd.read_csv('../.data/customer-support-chats.csv')

In [3]:
data.head()

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [4]:
df = data.copy()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   flags        26872 non-null  object
 1   instruction  26872 non-null  object
 2   category     26872 non-null  object
 3   intent       26872 non-null  object
 4   response     26872 non-null  object
dtypes: object(5)
memory usage: 1.0+ MB


In [6]:
cleaned_data = df.drop(columns=['flags', 'category', 'intent'])

In [7]:
cleaned_data = cleaned_data.rename(columns={'instruction': 'question', 'response': 'answer'})
cleaned_data.head()

Unnamed: 0,question,answer
0,question about cancelling order {{Order Number}},I've understood you have a question regarding ...
1,i have a question about cancelling oorder {{Or...,I've been informed that you have a question ab...
2,i need help cancelling puchase {{Order Number}},I can sense that you're seeking assistance wit...
3,I need to cancel purchase {{Order Number}},I understood that you need assistance with can...
4,"I cannot afford this order, cancel purchase {{...",I'm sensitive to the fact that you're facing f...


In [8]:
basic_qa = {
    "question": ['hi', 'hello', 'how are you?', 'how are you doing?', 'who are you?'], 
    "answer": ['hello', 'hi', 'great!', 'I am doing great! how are you?', 'i am arup, a customer service agent']
}

In [9]:
cleaned_data = pd.concat([cleaned_data, pd.DataFrame(basic_qa)], ignore_index=True)

In [10]:
df = cleaned_data.copy()

In [11]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'['+string.punctuation+']', '', text)
    text = [word for word in text.split(' ') if word not in stop_words]
    text = [stemmer.stem(word) for word in text]
    return text

In [12]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)), 
    ('tfidf', TfidfTransformer()), 
    ('classifier', KNeighborsClassifier())
])

In [13]:
pipeline.fit(df['question'], df['answer'])

In [14]:
pipeline.predict(['hi'])[0]

'I am doing great! how are you?'

In [15]:
pipeline.predict(['hello'])[0]

'I am doing great! how are you?'

In [16]:
pipeline.predict(['How are you?'])[0]

'I am doing great! how are you?'

In [17]:
pipeline.predict(['Can you help me ordering {{ORDER NUMBER}}?'])[0]

"Always good to connect! I'm attuned to the fact that you would like to know the whereabouts of your order with the order number {{Order Number}}. To assist you, could you please provide me with the name or tracking number associated with your order? This information will help me locate the exact status and current location of your order. Feel free to share any additional details or questions you may have, and I'll be more than happy to help you further!"

In [20]:
joblib.dump(pipeline, '../model/pipeline.joblib')

['../model/pipeline.joblib']