In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the dataset path used later
# in this notebook lives underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
from intent_classification_helper import *

In [None]:
# Scratch check that `np` is available in the namespace (it is not imported
# explicitly in this notebook — presumably it arrives via the star import
# above; TODO confirm and import numpy explicitly).
np.ones(shape=(2, 2))

array([[1., 1.],
       [1., 1.]])

In [None]:
# Load the CLINC150 JSON dump and build one DataFrame per split.
DATA_PATH = '/content/drive/MyDrive/nlp_datasets/CLINC150/clinc150_uci/data_full.json'

# Use a context manager so the file handle is closed as soon as parsing is
# done (the previous bare open() left it open for the kernel's lifetime).
with open(DATA_PATH, encoding='utf-8') as f:
    data = json.load(f)

# Each split is given the same two column names: the utterance text and its
# intent label.
train_df = pd.DataFrame.from_dict(data['train'])
train_df.columns = ['message', 'intent']

val_df = pd.DataFrame.from_dict(data['val'])
val_df.columns = ['message', 'intent']

test_df = pd.DataFrame.from_dict(data['test'])
test_df.columns = ['message', 'intent']

# Sanity check: the class count should match across the three splits.
print('number of intent classes in training set: ', len(set(train_df['intent'])))
print('number of intent classes in val set: ', len(set(val_df['intent'])))
print('number of intent classes in test set: ', len(set(test_df['intent'])))

number of intent classes in training set:  150
number of intent classes in val set:  150
number of intent classes in test set:  150


In [None]:
# encode label
# Fit the encoders on the training split ONLY and reuse the fitted mapping
# for val/test. Re-fitting on every split yields consistent class ids only
# when all splits contain exactly the same label set; with transform() an
# unseen label raises an error instead of silently shifting the ids.
le = LabelEncoder()
train_df['intent'] = le.fit_transform(train_df['intent'])
val_df['intent'] = le.transform(val_df['intent'])
test_df['intent'] = le.transform(test_df['intent'])

# One-hot targets for categorical cross-entropy. Same rule: fit on train,
# transform the rest, so the column order is identical across splits.
# NOTE(review): .todense() returns np.matrix; kept to preserve the existing
# type of y_* — consider .toarray() if a plain ndarray is preferred.
onehot_encoder = OneHotEncoder()
y_train = onehot_encoder.fit_transform(train_df['intent'].values.reshape(-1, 1)).todense()
y_val = onehot_encoder.transform(val_df['intent'].values.reshape(-1, 1)).todense()
y_test = onehot_encoder.transform(test_df['intent'].values.reshape(-1, 1)).todense()

# preprocess text
# NOTE(review): preprocess_text comes from the helper module; its return
# value is ignored here, so it presumably rewrites the 'message' column in
# place — confirm against the helper.
print('Preprocessing text on training set...')
preprocess_text(train_df, 'message')

print('Preprocessing text on val set...')
preprocess_text(val_df, 'message')

print('Preprocessing text on test set...')
preprocess_text(test_df, 'message')

# Message lists that are fed to the vectorizer in the next cell.
x_train = train_df['message'].to_list()
x_val = val_df['message'].to_list()
x_test = test_df['message'].to_list()

Preprocessing text on training set...


Start text preprocessing: 
--------------------------
Converting to lowercase...
--------------------------
Removing html tags...
--------------------------
Removing nonword characters...
--------------------------
Removing stopwords...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Text preprocessing completed.


Preprocessing text on val set...


Start text preprocessing: 
--------------------------
Converting to lowercase...
--------------------------
Removing html tags...
--------------------------
Removing nonword characters...
--------------------------
Removing stopwords...
Text preprocessing completed.


Preprocessing text on test set...


Start text preprocessing: 
--------------------------
Converting to lowercase...
--------------------------
Removing html tags...
--------------------------
Removing nonword characters...
--------------------------
Removing stopwords...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Text preprocessing completed.




In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the full training vocabulary.
# Capped vocabularies tried previously: max_features = 40, 100, 200, 1000.
#
# min_df must be a float in [0.0, 1.0] or an int >= 1; the integer 0 used
# before is rejected by scikit-learn releases that validate parameters.
# min_df=1 keeps every term seen in at least one training document, which
# is the same vocabulary the old setting produced.
tv = TfidfVectorizer(max_df=1.0, min_df=1)

# Fit on the training messages only, then project every split with the same
# vocabulary / IDF weights.
# NOTE: x_train / x_val / x_test are overwritten with dense arrays here, so
# this cell cannot be re-run without re-running the preprocessing cell.
tv.fit(x_train)
x_train = tv.transform(x_train).toarray()
x_val = tv.transform(x_val).toarray()
x_test = tv.transform(x_test).toarray()

vocab = tv.get_feature_names_out()

print(pd.DataFrame(x_train, columns=vocab))
print('TF-IDF vocabulary size: ', len(vocab))

        00  000  005  00am  00pm   01   02   03   05  098098  ...  zesty  \
0      0.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0     0.0  ...    0.0   
1      0.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0     0.0  ...    0.0   
2      0.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0     0.0  ...    0.0   
3      0.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0     0.0  ...    0.0   
4      0.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0     0.0  ...    0.0   
...    ...  ...  ...   ...   ...  ...  ...  ...  ...     ...  ...    ...   
14995  0.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0     0.0  ...    0.0   
14996  0.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0     0.0  ...    0.0   
14997  0.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0     0.0  ...    0.0   
14998  0.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0     0.0  ...    0.0   
14999  0.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  0.0     0.0  ...    0.0   

       zeus  zion  zippy  zippys  ziti  zombie  zone  zoo  zulu  
0       0.0   0.0    

In [None]:
from keras.layers import MaxPooling1D

In [None]:
def cnn_clinc150(x_train_cnn, y_train_cnn, batch_size, epochs, validation_data):
  """Build and compile a small 1-D CNN intent classifier.

  The network is Conv1D(32, kernel 3) -> BatchNormalization -> Dropout(0.5)
  -> MaxPooling1D(2) -> Flatten -> Dense(256, relu) -> Dense(softmax), and
  is compiled with categorical cross-entropy, the Adam optimizer and an
  accuracy metric. The model summary is printed as a side effect.

  Args:
    x_train_cnn: Training features; only ``shape[1]`` is read, to set the
      input shape to ``(shape[1], 1)``.
    y_train_cnn: One-hot training targets. When it exposes a 2-D ``shape``,
      its second dimension sets the width of the softmax layer; otherwise
      the width falls back to 150.
    batch_size: Unused by the builder; kept so existing calls keep working.
    epochs: Unused by the builder; kept so existing calls keep working.
    validation_data: Unused by the builder; kept so existing calls keep
      working.

  Returns:
    The compiled (untrained) Sequential model.
  """
  # Derive the number of output units from the targets instead of
  # hard-coding it, falling back to the original 150 when targets do not
  # carry a 2-D shape.
  label_shape = getattr(y_train_cnn, 'shape', None)
  if label_shape is not None and len(label_shape) == 2:
    num_classes = label_shape[1]
  else:
    num_classes = 150

  model = Sequential()
  # Each feature vector is treated as a length-shape[1] sequence with a
  # single channel.
  model.add(Conv1D(32, 3, activation='relu', input_shape = (x_train_cnn.shape[1],1)))
  model.add(BatchNormalization())
  model.add(Dropout(0.5))
  model.add(MaxPooling1D(2))
  model.add(Flatten())
  model.add(Dense(256, activation='relu'))
  model.add(Dense(num_classes, activation='softmax'))
  model.compile(loss="categorical_crossentropy", optimizer = 'adam', metrics=["accuracy"])
  model.summary()
  return model

In [None]:
# Build and compile the CNN (training happens in the model.fit cell).
# The validation split is passed as validation_data rather than the test
# split, which should stay untouched until final evaluation.
model = cnn_clinc150(
    x_train_cnn=x_train,
    y_train_cnn=y_train,
    batch_size=8,
    epochs=20,
    validation_data=(x_val, y_val),
)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_1 (Conv1D)           (None, 4899, 32)          128       
                                                                 
 batch_normalization_1 (Batc  (None, 4899, 32)         128       
 hNormalization)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 4899, 32)          0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 2449, 32)         0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 78368)             0         
                                                                 
 dense (Dense)               (None, 256)              

In [None]:
# Monitor training on the held-out validation split. The test split must not
# be used as validation data: doing so leaks it into model selection and
# leaves no unbiased estimate. Score x_test / y_test once, after training,
# with model.evaluate.
model.fit(x_train, y_train, batch_size=8, epochs=20, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f15b7cbf3d0>