### miss labeled classes intent_classification transformers


In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
# CUDA_VISIBLE_DEVICES options = 0/1/2,3. Make sure to restart
!echo $AVAILABLE_CUDA_DEVICES




In [11]:
import os
import time
import glob
import pandas as pd
import numpy as np
import re
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, trange
import pickle
from pprint import pprint

In [12]:
# configs
%matplotlib inline

# seeds
np.random.seed(42)


In [38]:
ROOT = os.getcwd()
data_dir = os.path.join(ROOT, 'model_save')
os.makedirs(data_dir, exist_ok=True)
print("Model_save dir: ", data_dir)



# all files
files_miss_count = glob.glob(pathname=ROOT+"/**/miss_count*", recursive=True)
df = pd.concat([pd.read_csv(file) for file in files_miss_count], ignore_index=True)
df.to_csv("combined_miss_labels.csv")

pprint(f"Written csvs {files_miss_count} to combined combined_miss_labels.csv")


Model_save dir:  /home/DATA/amit_kesari/IntentClassification/model_save
('Written csvs '
 "['/home/DATA/amit_kesari/IntentClassification/model_save/mpnet-base/miss_count', "
 "'/home/DATA/amit_kesari/IntentClassification/model_save/bert-base-uncased/miss_count', "
 "'/home/DATA/amit_kesari/IntentClassification/model_save/roberta-base/miss_count', "
 "'/home/DATA/amit_kesari/IntentClassification/model_save/deberta-base/miss_count', "
 "'/home/DATA/amit_kesari/IntentClassification/model_save/xlnet-base-cased/miss_count', "
 "'/home/DATA/amit_kesari/IntentClassification/model_save_torch1_7_0/distilbert-base-uncased/miss_count'] "
 'to combined combined_miss_labels.csv')


In [39]:
# check the three dataframes
print("df shape: ",df.shape)

df.head()


df shape:  (431, 3)


Unnamed: 0.1,Unnamed: 0,DOCUMENT_CLASS,MISS_COUNT
0,0,*Account Details/Updation*,10
1,1,*Account Login*,16
2,2,*Account*,51
3,3,*Alerts/Notification*,52
4,4,*App/Website*,72


In [40]:
# sort according to miss labelled classes
df.sort_values("MISS_COUNT", ascending=False, inplace=True)
df

Unnamed: 0.1,Unnamed: 0,DOCUMENT_CLASS,MISS_COUNT
321,39,*Others*,663
109,39,*Others*,652
185,39,*Others*,596
396,38,*Others*,559
254,39,*Others*,465
...,...,...,...
140,70,Password Reset,1
139,69,Order Placement,1
138,68,Order Issues,1
136,66,Exchange/Return/Replacement,1


In [41]:
# sum same labels
df = df.groupby(["DOCUMENT_CLASS"], as_index=False).agg("sum")
df.sort_values("MISS_COUNT", ascending=False, inplace=True)
df.head(20)



Unnamed: 0.1,DOCUMENT_CLASS,Unnamed: 0,MISS_COUNT
39,*Others*,232,3395
25,*Greetings*,150,1741
62,*Thanks*,370,1596
21,*Exchange/Return/Refund*,126,1263
56,*Staff/Service Personnel Behavior*,334,1202
15,*Delivery*,90,1202
43,*Price/Value*,256,924
41,*Payment/Billing*,244,894
46,*Purchase Intent*,274,855
13,*Customer Service Quality*,78,803


In [42]:
# load relevant/non relevant classes into dataframe
relevant_filename = os.path.join(ROOT, 'data/retail_relevant_intents_combined.xlsx')
relevant_df = pd.read_excel(relevant_filename, sheet_name="Sheet1")

# map to [1/0]
relevant_df['DOC_RELEVANT'] = relevant_df['relevant/non_relevant'].map({'relevant': 1, 'non_relevant': 0})
relevant_df

Unnamed: 0,DOC_CLASS,relevant/non_relevant,DOC_RELEVANT
0,Account,relevant,1
1,Account Closure,relevant,1
2,Account Number,relevant,1
3,Alerts/Notification,relevant,1
4,App/Website,relevant,1
...,...,...,...
110,*Information*,relevant,1
111,discount,relevant,1
112,*Queries*,relevant,1
113,*Feedback*,relevant,1


In [43]:
# convert relevant_df to dictionary by zipping 'DOC_CLASS' and 'DOC_RELEVANT'

# dic = relevant_df.set_index('DOC_CLASS').T.to_dict('list')
print('Unique classes (True/False): ', relevant_df['DOC_CLASS'].nunique() == relevant_df.shape[0])
print('Any Null values: ', relevant_df.isnull().values.any())

relevant_dict = dict([(i,a) for i, a in zip(relevant_df['DOC_CLASS'], relevant_df['DOC_RELEVANT'])])
relevant_dict

Unique classes (True/False):  True
Any Null values:  False


{'Account': 1,
 'Account Closure': 1,
 'Account Number': 1,
 'Alerts/Notification': 1,
 'App/Website': 1,
 'Awaiting/No Response': 0,
 'Customer Churn': 0,
 'Delivery': 1,
 'Discount': 1,
 'Exchange/Return/Replacement': 1,
 'General Assistance': 0,
 'General Complaint': 0,
 'General Praise': 0,
 'Greetings': 0,
 'Link': 1,
 'Live Chat': 1,
 'Login Issues': 1,
 'Order Cancellation': 1,
 'Order Issues': 1,
 'Order Placement': 1,
 'Others': 0,
 'Out Of Stock': 1,
 'Password Reset': 1,
 'Payment': 1,
 'Pm/Dm': 0,
 'Price': 1,
 'Ps5': 1,
 'Refund': 1,
 'Request Contact': 1,
 'Service Negative': 0,
 'Service Positive': 0,
 'Status Update': 0,
 'Thank You Messages': 0,
 'Tracking': 1,
 'Wrong/Missing Item': 1,
 'Xbox': 1,
 '*Others*': 0,
 '*Discounts/Offers*': 1,
 '*General-Complaints*': 0,
 '*Damage*': 1,
 '*Illegal/Unethical activity*': 1,
 '*Appearance/Ambience/Design*': 1,
 '*General-Praise*': 0,
 '*Request Help*': 0,
 '*Price/Value*': 1,
 '*Customer Service Quality*': 0,
 '*Staff/Service