In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<target directory>:<percent-encoded download URL>"
# entries; the download loop below splits each entry on ':' and unquotes the URL.
# NOTE(review): the URL carries X-Goog-Date=20240402T144425Z and
# X-Goog-Expires=259200 query parameters, so the link is presumably time-limited;
# the loop reports HTTP failures as "likely expired".
DATA_SOURCE_MAPPING = 'training-dataset-for-chatbotsvirtual-assistants:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1953751%2F3314620%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240402%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240402T144425Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3De0938e148055b6ff23a8dcd9f4ee0be9161c0264f72ce874eb640c159288a4e54e3bbb040f30873cdbc34661821ee7400f5b8844b0e31450cf77b7547606e4ed567ca306ca779e34209fb1fff8252dd316975246d3d94eaa35db18882576225d59b1b7e76b00d0341a92225b91f3222d2cf211d212a2d518a1785e8fdb9cf039052aababaf5d206bb1de54c123d5ffe29b623ff2d8e1181916516c2f96f6eae52e2bbcb9b8cda24d9f0eca13d21740ef0b2bb4f9cdd5fbbed78b3fc6613074622d8a384b886ea90df633138d7b5d8ff151a37d7e383e33714d92009a6fd995392dd460ca85b2403f67a4743fc4a59a50195b7d3b16865931dece4eb19c520bcd'

# Directory into which each data source is extracted (one sub-directory per entry).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>". Split on the first ':'
    # only, so an unexpected literal colon in the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without a usable positive value the
            # progress bar stays empty instead of raising on int(None) or /0.
            content_length = fileres.headers['content-length']
            try:
                total_length = int(content_length)
            except (TypeError, ValueError):
                total_length = 0
            size_label = content_length if content_length is not None else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while data:
                dl += len(data)
                tfile.write(data)
                # Clamp to the 50-character bar in case more bytes arrive than announced.
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile reopens the temp file by name, so buffered writes must
            # reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name so the imported `tarfile` module is not rebound.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is one of its subclasses.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading training-dataset-for-chatbotsvirtual-assistants, 1214677 bytes compressed
Downloaded and uncompressed: training-dataset-for-chatbotsvirtual-assistants
Data source import complete.


In [2]:
# Upgrade gdown, which the next cell uses to fetch the dataset archive.
# NOTE(review): the version is not pinned, so a fresh run may install a
# different release than the 5.1.0 recorded in the output below.
!pip install --upgrade --no-cache-dir gdown

Collecting gdown
  Downloading gdown-5.1.0-py3-none-any.whl (17 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.7.3
    Uninstalling gdown-4.7.3:
      Successfully uninstalled gdown-4.7.3
Successfully installed gdown-5.1.0


In [3]:
# Download the Google Drive file with this id into the current directory
# (saved as virtual.zip in the recorded run).
!gdown --id 1YDenjsJcRkcbSZNU2j30pZ4Qd_Oh0XKs

Downloading...
From: https://drive.google.com/uc?id=1YDenjsJcRkcbSZNU2j30pZ4Qd_Oh0XKs
To: /content/virtual.zip
100% 1.21M/1.21M [00:00<00:00, 13.4MB/s]


In [4]:
# Extract the downloaded archive into /content; -o overwrites existing files
# without prompting, so the cell can be re-run safely.
!unzip -o "virtual.zip"  -d  "/content"

Archive:  virtual.zip
  inflating: /content/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv  
  inflating: /content/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.xlsx  
  inflating: /content/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/LICENSE.txt  
  inflating: /content/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/README.txt  
  inflating: /content/Bitext_Sample_Customer_Service_Trainin

In [5]:
# Standard library
from datetime import datetime
import os
# Data handling and plotting
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" style globally.
sns.set_style('whitegrid')

# NOTE(review): the sparse-matrix / SVD imports below are not referenced in the
# visible part of this notebook — confirm they are still needed.
from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

# Seed NumPy's global random generator; other libraries are not seeded here.
np.random.seed(0)

In [6]:
# Path of the sample CSV extracted by the unzip cell above.
# NOTE(review): hard-coded under /content — adjust when the archive is
# extracted elsewhere.
file="/content/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv"


In [8]:
# Load the utterance dataset from the CSV path defined above.
content_df  = pd.read_csv( file)

In [9]:
# Preview the first ten rows to sanity-check the load.
content_df.head(10)

Unnamed: 0,flags,utterance,category,intent
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account
4,BLQC,"i want an online account, create one",ACCOUNT,create_account
5,BSLC,"i want an account, i need help opening one",ACCOUNT,create_account
6,BLZ,tell me if I can register two online accounts...,ACCOUNT,create_account
7,BPLQ,i want to know if i could create two profiles ...,ACCOUNT,create_account
8,BIMLQW,can you tell me if i can create more than one ...,ACCOUNT,create_account
9,BLZ,were to create an onlind account,ACCOUNT,create_account


In [10]:
# NOTE(review): repeats the load already done in the In [8] cell; re-running it
# resets content_df to the raw CSV contents and discards any derived columns.
content_df  = pd.read_csv( file)

In [11]:
# Rows per category, most frequent first — shows how imbalanced the classes are.
content_df.category.value_counts()

PAYMENT             4636
ACCOUNT             4557
CONTACT             3081
INVOICES            2443
ORDER               2256
REFUNDS             1932
FEEDBACK            1326
DELIVERY             501
CANCELLATION_FEE     360
NEWSLETTER           236
SHIPPING             206
Name: category, dtype: int64

In [12]:
# Text columns that are concatenated into the 'NewTag' column further down.
features=['utterance']

In [13]:
# Distinct category labels, in order of first appearance in the data.
catagory_list = content_df['category'].unique().tolist()

In [14]:
# Map each category label to its position in catagory_list (label -> integer id).
catagory_dict = {label: position for position, label in enumerate(catagory_list)}

In [15]:
# Display the label -> id mapping for inspection.
catagory_dict

{'ACCOUNT': 0,
 'CANCELLATION_FEE': 1,
 'CONTACT': 2,
 'DELIVERY': 3,
 'FEEDBACK': 4,
 'INVOICES': 5,
 'NEWSLETTER': 6,
 'ORDER': 7,
 'PAYMENT': 8,
 'REFUNDS': 9,
 'SHIPPING': 10}

In [16]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [17]:
# Build the combined text column: start empty, then for each feature column
# replace missing values with a single space and append it after a space separator.
content_df['NewTag'] = ""
for i in features:
    content_df[i] = content_df[i].fillna(' ')
    content_df['NewTag'] = content_df['NewTag'] + (" " + content_df[i])
# Make sure the result is stored as plain strings.
content_df['NewTag'] = content_df['NewTag'].astype(str)

In [18]:
# 'clean' starts as a copy of the combined text; no cleaning is applied in this cell.
content_df['clean'] = content_df['NewTag']

In [19]:
# NOTE(review): 'NewTag' was already cast to str before being copied into
# 'clean', so this cast looks redundant.
content_df['clean'] =content_df['clean'].astype(str)

In [20]:
# Confirm the derived columns ('NewTag', 'clean') are present.
content_df.columns

Index(['flags', 'utterance', 'category', 'intent', 'NewTag', 'clean'], dtype='object')

In [21]:
# Encode each category label as its integer id from catagory_dict
# (a label missing from the dict raises KeyError).
content_df['category_target'] = content_df['category'].apply(lambda label: catagory_dict[label])


In [22]:
# Install sentence_transformers, imported by the next cell.
# NOTE(review): the version is not pinned; the recorded run installed 2.6.1.
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/163.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [23]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by name; the recorded output
# shows its files being downloaded on first use.
model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

In [24]:
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import classification_report,accuracy_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
def model_report(model, training_x, training_y, testing_x, testing_y, name):
    """Fit ``model`` and summarise its test-set performance as a one-row DataFrame.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    training_x, training_y : data the model is fitted on.
    testing_x, testing_y : held-out data used for every metric.
    name : label stored in the "Model" column.

    Returns
    -------
    (report, model) : ``report`` has the columns Model, Accuracy_score,
        Recall_score, Precision, f1_score and Kappa_metric. Recall, precision
        and F1 are computed with ``average=None``, so each of those cells holds
        the full array of scores rather than a single number. ``model`` is the
        same estimator object, now fitted.
    """
    model.fit(training_x, training_y)
    predicted = model.predict(testing_x)

    # Each value is wrapped in a list so the frame has exactly one row.
    report = {
        "Model": [name],
        "Accuracy_score": [accuracy_score(testing_y, predicted)],
        "Recall_score": [recall_score(testing_y, predicted, average=None)],
        "Precision": [precision_score(testing_y, predicted, average=None)],
        "f1_score": [f1_score(testing_y, predicted, average=None)],
        "Kappa_metric": [cohen_kappa_score(testing_y, predicted)],
    }
    return pd.DataFrame(report), model

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [26]:
# Logistic-regression baseline with random_state fixed at 1.
# `multi_class` is no longer passed: 'auto' is already the default, and recent
# scikit-learn releases emit a deprecation warning when it is set explicitly.
logistic_model = LogisticRegression(random_state=1)

In [27]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [28]:
# Gaussian naive Bayes baseline with default settings.
# NOTE(review): despite the "gb" prefix this is naive Bayes, not gradient boosting.
gb_model = GaussianNB()

In [29]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with random_state fixed at 1; no depth limit is set here.
tree_model = DecisionTreeClassifier(random_state=1)

In [30]:
from xgboost import XGBClassifier
# Small gradient-boosted ensemble: 20 boosting rounds, trees at most 4 levels deep.
xgb_model = XGBClassifier(n_estimators=20,max_depth=4)

In [31]:
# Combined label "<category>--<intent>" for each row.
content_df['category_intent']=content_df['category']+'--'+content_df['intent']


In [32]:
# NOTE(review): identical to the previous cell (In [31]); it recomputes the
# same "<category>--<intent>" column and can probably be removed.
content_df['category_intent']=content_df['category']+'--'+content_df['intent']

In [40]:
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import classification_report,accuracy_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
# NOTE(review): this redefines model_report with the same behaviour as the
# definition in the In [24] cell; the later definition shadows the earlier one.
def model_report(model, training_x, training_y, testing_x, testing_y, name):
    """Fit ``model`` and summarise its test-set performance as a one-row DataFrame.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    training_x, training_y : data the model is fitted on.
    testing_x, testing_y : held-out data used for every metric.
    name : label stored in the "Model" column.

    Returns
    -------
    (report, model) : ``report`` has the columns Model, Accuracy_score,
        Recall_score, Precision, f1_score and Kappa_metric. Recall, precision
        and F1 are computed with ``average=None``, so each of those cells holds
        the full array of scores rather than a single number. ``model`` is the
        same estimator object, now fitted.
    """
    model.fit(training_x, training_y)
    predicted = model.predict(testing_x)

    # Each value is wrapped in a list so the frame has exactly one row.
    report = {
        "Model": [name],
        "Accuracy_score": [accuracy_score(testing_y, predicted)],
        "Recall_score": [recall_score(testing_y, predicted, average=None)],
        "Precision": [precision_score(testing_y, predicted, average=None)],
        "f1_score": [f1_score(testing_y, predicted, average=None)],
        "Kappa_metric": [cohen_kappa_score(testing_y, predicted)],
    }
    return pd.DataFrame(report), model

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [42]:
# Logistic-regression baseline with random_state fixed at 1 (re-created here;
# an earlier cell builds the same estimator).
# `multi_class` is no longer passed: 'auto' is already the default, and recent
# scikit-learn releases emit a deprecation warning when it is set explicitly.
logistic_model = LogisticRegression(random_state=1)

In [43]:
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): re-creates the same decision tree as the In [29] cell.
tree_model = DecisionTreeClassifier(random_state=1)

In [44]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 50 trees, each at most 10 levels deep, with random_state fixed at 1.
forest_model = RandomForestClassifier(random_state=1,max_depth=10,n_estimators=50)

In [45]:
from xgboost import XGBClassifier
# NOTE(review): re-creates the same XGBoost classifier as the In [30] cell
# (20 boosting rounds, trees at most 4 levels deep).
xgb_model = XGBClassifier(n_estimators=20,max_depth=4)