In [None]:
# NOTE(review): the leading `!` hands this line to a shell. The activation
# presumably only affects that short-lived shell and not the running kernel —
# confirm whether this cell is needed or the kernel should be started from the venv.
!source venv/scripts/activate

In [1]:
# Smoke test: confirm the kernel executes cells and shows stdout.
_greeting = "Hello World"
print(_greeting)

Hello World


# Model for classifying user messages

In [1]:
import pandas as pd

In [2]:
# Load the labelled messages (columns: message, label) for the message classifier.
training_data_path = "synthetic_chatbot_training_data.json"
df = pd.read_json(training_data_path)

In [3]:
# Preview the first five rows of the training data.
df.head(n=5)

Unnamed: 0,message,label
0,How do I check the shipping status of my order?,Store Policy Enquiry
1,Can you check if my order #P123EF has shipped?,Order Status
2,Review for 'To Kill a Mockingbird': A timeless...,Review Submission
3,Review for 'The Catcher in the Rye': I found i...,Review Submission
4,Review for 'To Kill a Mockingbird': A must-rea...,Review Submission


In [4]:
# Class balance check: how many messages carry each label.
df.loc[:, 'label'].value_counts()

label
Store Policy Enquiry    25
Order Status            25
Review Submission       25
Book Search             25
Name: count, dtype: int64

In [5]:
# Inputs are the raw message texts; targets are their labels.
X, y = df['message'], df['label']
len(X), len(y)

(100, 100)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=22,
)

In [7]:
# Sizes of each split, in the order (X_train, X_test, y_train, y_test).
tuple(map(len, (X_train, X_test, y_train, y_test)))

(80, 20, 80, 20)

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, NuSVC
from sklearn.metrics import accuracy_score, classification_report

In [33]:
def _tfidf_pipeline(classifier):
    """Build a two-step pipeline: a fresh TfidfVectorizer ('tfidf') feeding `classifier` ('clf')."""
    return Pipeline(
        [
            ('tfidf', TfidfVectorizer()),
            ('clf', classifier),
        ]
    )


# Four candidate classifiers, each behind its own default TF-IDF vectorizer.
pipeMNB = _tfidf_pipeline(MultinomialNB())
pipeCNB = _tfidf_pipeline(ComplementNB())
pipeSVC = _tfidf_pipeline(LinearSVC())
pipeNuSVC = _tfidf_pipeline(NuSVC(nu=0.3, kernel='linear'))

In [34]:
def _fit_and_score(pipeline):
    """Fit `pipeline` on the training split, print its test accuracy, and return the test predictions."""
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    print(f'{accuracy_score(y_test, predictions)}')
    return predictions


# Evaluate the candidates in the same order as they were defined; one accuracy
# line is printed per model.
predictMNB = _fit_and_score(pipeMNB)
predictCNB = _fit_and_score(pipeCNB)
predictSVC = _fit_and_score(pipeSVC)
predictNuSVC = _fit_and_score(pipeNuSVC)

0.85
0.9
0.95
0.95


In [35]:
# Spot-check the LinearSVC pipeline on one hand-written message.
# NOTE: this rebinds `predictSVC`, replacing the test-set predictions that were
# stored under the same name when the pipelines were evaluated.
predictSVC = pipeSVC.predict(["What is the procedure to return the book?"])

In [36]:
# Display the label predicted for the hand-written message.
predictSVC

array(['Store Policy Enquiry'], dtype=object)

In [39]:
import pickle

# Persist the fitted LinearSVC pipeline (vectorizer + classifier) to disk.
with open('pipeSVC.pkl', mode='wb') as model_file:
    pickle.dump(pipeSVC, model_file)

In [40]:
# Reload the pipeline that was just written, to check the pickle round-trip.
# Unpickling executes code from the file, so only load pickles you created or trust.
with open('pipeSVC.pkl', mode='rb') as model_file:
    pipeSVCLoaded = pickle.load(model_file)

In [41]:
# Round-trip check: ask the reloaded pipeline the same question that was put to
# the in-memory pipeline.
pipeSVCLoaded.predict(["What is the procedure to return the book?"])

array(['Store Policy Enquiry'], dtype=object)

In [21]:
# Inspect the MultinomialNB predictions for the test split.
# NOTE(review): the saved output lists 8 labels although the test split has 20
# rows — it looks like it was captured in an earlier run; re-run to refresh.
predictMNB

array(['Review Submission', 'Review Submission', 'Order Status',
       'Order Status', 'Store Policy Enquiry', 'Book Search',
       'Book Search', 'Order Status'], dtype='<U20')

In [25]:
# Test accuracy of the MultinomialNB pipeline.
# NOTE(review): the saved output (0.75) differs from the 0.85 printed when the
# pipelines were fitted — presumably a leftover from an earlier run; re-run to confirm.
accuracy_score(y_true=y_test, y_pred=predictMNB)

0.75

In [49]:
# Per-class precision / recall / F1 for the LinearSVC pipeline on the test split.
# The predictions are recomputed here rather than read from `predictSVC`: that
# name is rebound to a single ad-hoc prediction in an earlier cell, so on a
# top-to-bottom run it no longer has one entry per row of y_test and
# classification_report would raise on the length mismatch.
print(classification_report(y_test, pipeSVC.predict(X_test)))

                      precision    recall  f1-score   support

         Book Search       1.00      1.00      1.00         4
        Order Status       0.86      1.00      0.92         6
   Review Submission       1.00      1.00      1.00         3
Store Policy Enquiry       1.00      0.86      0.92         7

            accuracy                           0.95        20
           macro avg       0.96      0.96      0.96        20
        weighted avg       0.96      0.95      0.95        20



# Model to classify Store Policy

In [53]:
import pandas as pd

In [54]:
# Load the labelled policy questions (columns: message, label).
policy_data_path = "synthetic_policy.json"
df_policy = pd.read_json(policy_data_path)

In [55]:
# Preview the first five policy questions.
df_policy.head(n=5)

Unnamed: 0,message,label
0,What's your return policy?,Return Policy
1,How can I get a refund?,Refund Policy
2,What is your shipping policy?,Shipping Policy
3,Do you offer free returns?,Return Policy
4,How long do I have to return a book?,Return Policy


In [56]:
# Class balance check: number of questions per policy label.
df_policy.loc[:, 'label'].value_counts()

label
Return Policy      10
Refund Policy       6
Shipping Policy     5
Name: count, dtype: int64

In [57]:
# Rebind X / y to the policy data; from here on they no longer refer to the
# chatbot-message data used by the first model.
X, y = df_policy['message'], df_policy['label']
len(X), len(y)

(21, 21)

In [58]:
from sklearn.model_selection import train_test_split

# Same split settings as the message model: 20% held out, fixed seed.
# This rebinds X_train / X_test / y_train / y_test to the policy data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=22,
)

In [59]:
# Sizes of each policy split, in the order (X_train, X_test, y_train, y_test).
tuple(map(len, (X_train, X_test, y_train, y_test)))


(16, 5, 16, 5)

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [61]:
# NuSVC is imported here because this section's import cell only brings in
# LinearSVC; without this line the cell works only when the import from the
# first section is still live in the kernel.
from sklearn.svm import NuSVC

# LinearSVC candidate for the policy classifier, behind a default TF-IDF vectorizer.
pipeSVCPolicy = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC())
    ]
)

# NuSVC candidate with the same settings as the message model (nu=0.3, linear kernel).
pipeNuSVCPolicy = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        ('clf', NuSVC(nu=0.3, kernel='linear'))
    ]
)

In [62]:
# Fit each policy pipeline on the training split and print its test accuracy.
# Pipeline.fit returns the pipeline itself, so fit and predict are chained.
predictSVCPolicy = pipeSVCPolicy.fit(X_train, y_train).predict(X_test)
svc_policy_accuracy = accuracy_score(y_test, predictSVCPolicy)
print(f'{svc_policy_accuracy}')

predictNuSVCPolicy = pipeNuSVCPolicy.fit(X_train, y_train).predict(X_test)
nusvc_policy_accuracy = accuracy_score(y_test, predictNuSVCPolicy)
print(f"{nusvc_policy_accuracy}")

0.8
0.8


In [65]:
# Spot-check the LinearSVC policy pipeline on one hand-written question.
pipeSVCPolicy.predict(["How much money can I get for a damaged book?"])

array(['Refund Policy'], dtype=object)

In [64]:
# NOTE(review): `predictSVC` belongs to the message-classifier section, not the
# policy model (whose predictions are in `predictSVCPolicy`) — confirm this is
# the variable that was meant to be displayed here.
predictSVC

array(['Store Policy Enquiry'], dtype=object)

In [66]:
# Imported in this cell, like the cell that saves pipeSVC does, so saving the
# policy model does not depend on that earlier cell having been run.
import pickle

# Persist the fitted LinearSVC policy pipeline (vectorizer + classifier) to disk.
with open('pipeSVCPolicy.pkl', 'wb') as f:
    pickle.dump(pipeSVCPolicy, f)