In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import json 
import string
import nltk  #Natural language tool kit
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from time import sleep
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import joblib
import pickle
import random

string.punctuation
warnings.filterwarnings('ignore')

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data

In [3]:
def load_pandas_data(json_file_name):
    # reading the JSON data using json.load()
    with open(json_file_name) as train_file:
        dict_train = json.load(train_file)
    #pull data from the json format 
    tags = []
    patterns = []
    for idx in range(len(dict_train["intents"])):
        tags.append(dict_train["intents"][idx]["tag"])
        patterns.append(dict_train["intents"][idx]["patterns"])

    #fill the dataframe 
    tags_column = []
    patterns_column = []

    for tag_idx in range(len(tags)):
        for p_idx in range(len(patterns[tag_idx])):
            tags_column.append(tags[tag_idx])
            patterns_column.append(patterns[tag_idx][p_idx])
    #create the dataframe
    data = pd.DataFrame({
        "Tag":tags_column,
        "Patterns":patterns_column,
    })
    
    #add the answers column 
    answers_column = []
    for idx in range(data.shape[0]):
        tag_idx = tags.index(data["Tag"][idx])
        answers_column.append(dict_train["intents"][tag_idx]["responses"])
    data["Responses_list"] = answers_column 
    le = LabelEncoder()
    le.fit(data["Tag"])

    #transform the Tags data (creating new column)
    data["Label"] = le.transform(data["Tag"])

    return data 


In [4]:
data = load_pandas_data("intents.json")
data 

Unnamed: 0,Tag,Patterns,Responses_list,Label
0,date_range,What is the date range of the data?,"[The data covers the period from January 4, 20...",55
1,date_range,Can you tell me the start and end dates of the...,"[The data covers the period from January 4, 20...",55
2,date_range,What are the beginning and ending dates of the...,"[The data covers the period from January 4, 20...",55
3,date_range,From which date to which date does the data span?,"[The data covers the period from January 4, 20...",55
4,date_range,What is the time period covered by the data?,"[The data covers the period from January 4, 20...",55
...,...,...,...,...
4010,service_inquiry,What kind of help do you offer?,[I offer various services including answering ...,139
4011,service_inquiry,What services are included?,[I offer various services including answering ...,139
4012,service_inquiry,What are the details of what you offer?,[I offer various services including answering ...,139
4013,service_inquiry,How do your services work?,[I offer various services including answering ...,139


In [5]:
data["Label"].nunique() 

235

# Pre-processing

In [6]:
#initialize the stemmer 
ps=PorterStemmer()

In [7]:
#initiliaze the text convertor 
cv = CountVectorizer()        # function to transform text to vector


In [8]:
#data preprocessing 

corpus=[] #our data  #list of the cleaned texts

for i in tqdm(range(len(data))):

    s=re.sub('[^a-zA-Z0-9]'," ",data['Patterns'][i])
    s= s.lower()
    s=s.split()
    s= [word for word in s if word not in stopwords.words('english')]
    s=" ".join(s)
    s= ps.stem(s)
    corpus.append(s)

100%|█████████████████████████████████████████████████████████████████████████████| 4015/4015 [00:26<00:00, 150.61it/s]


In [9]:
#transform the input data 
training_data = cv.fit_transform(corpus).toarray()
training_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
#convert the target column (tags) into numerical column
#label encoding 
le = LabelEncoder()
le.fit(data["Tag"])

#transform the Tags data (creating new column)
data["Label"] = le.transform(data["Tag"])

In [11]:
#define Y_train 
y_train = data["Label"]

In [12]:
y_train

0        55
1        55
2        55
3        55
4        55
       ... 
4010    139
4011    139
4012    139
4013    139
4014    139
Name: Label, Length: 4015, dtype: int32

# Modeling

In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV , StratifiedKFold,cross_val_score
from sklearn.metrics import accuracy_score

In [114]:
#the parameter grid
param_grid = {
    'C': [1, 10, 100, 1000],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.001, 0.01],
    'degree': [2, 3, 4],
    'class_weight': [None, 'balanced']
}

In [14]:
#stratified k-fold cross validation 
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [116]:
svm = SVC()

In [117]:
# GridSearchCV + cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, 
                           cv=stratified_kfold, scoring='roc_auc', verbose=3, n_jobs=-1)

In [118]:
grid_search.fit(training_data, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [119]:
grid_search.best_estimator_

In [120]:
#svm using gridsearch
svm_best = grid_search.best_estimator_

In [18]:
#svm using personal parameters (based on my knowledge)
svm_personal = SVC(kernel="linear",C=1000,class_weight="balanced").fit(training_data,y_train)

# Evaluation

## Evaluation on trainset

In [20]:
y_pred_best = svm_best.predict(training_data)
y_pred_personal = svm_personal.predict(training_data)

In [122]:
y_pred_best

array([ 55,  55,  55, ..., 139, 139, 139])

In [23]:
print("accuracy of the SVM with gridsearch parameters",accuracy_score(y_train,y_pred_best))
print("accuracy of the SVM with personal parameters",accuracy_score(y_train,y_pred_personal))

accuracy of the SVM with gridsearch parameters 0.9202988792029888
accuracy of the SVM with personal parameters 0.9469489414694894


## Stratified kfold cross validation (Evaluation on test sets)

In [24]:
#we used the stratified kfold cross validation to ensure that each class will be present in each set (train and test sets)

In [25]:
print("SVM with GridSearch parameters")
cross_val_score(svm_best, training_data, y_train, cv = stratified_kfold)

SVM with GridSearch parameters


array([0.78580324, 0.77708593, 0.76961395, 0.78455791, 0.78082192])

In [26]:
print("SVM with personal parameters")
cross_val_score(svm_personal, training_data, y_train, cv = stratified_kfold)


SVM with personal parameters


array([0.75466999, 0.7758406 , 0.76089664, 0.76089664, 0.76712329])

accuracy of the SVM with gridsearch parameters 0.9202988792029888


# Save the models

In [28]:
#save the model 
joblib.dump(svm_best,"Model/svm_chatbot_best.joblib")
joblib.dump(svm_personal,"Model/svm_chatbot0.joblib")


['Model/svm_chatbot0.joblib']

# ChatBot

In [29]:
def message_preprocess(message):
    s=re.sub('[^a-zA-Z0-9]'," ",message)
    s= s.lower()
    s=s.split()
    s= [word for word in s if word not in stopwords.words('english')]
    s=" ".join(s)
    s= ps.stem(s)
    s = cv.transform([s]).toarray()
    return s

In [32]:
#define our interacting chatbot function
def chatbot():
    #user input
    while(True):

        inp = input("you=")
        if(inp == "quit"):
            break
        #clear the message
        inp = message_preprocess(inp)
        

        results = svm_best.predict(inp)[0]
        data_pred= data[data["Label"] == results]
        data_pred = data_pred.reset_index()
        
        sleep(1.5)
        bot = random.choice(data_pred["Responses_list"][0])
        print(bot)

        

In [38]:
chatbot()

you=hello
Hi there! What can I do for you?
you=what is your role
I assist by providing information and support based on your needs and questions.
you=how many countries does the dataset cover ?
The number of available countries in our dataset is only 1.
you=What country does the dataset cover?
The only available country in our dataset is the United States.
you=quit
