In [2]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import math
from sklearn import tree, ensemble, model_selection, metrics

import nltk
nltk.download('punkt')
import random
import string
import heapq
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import tensorflow 
from tensorflow import keras

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
from google.colab import files
Uploaded = files.upload()

# Load the dialog acts and utterance contents.
# Every line is stripped and lowercased, then cut at its first space: the
# leading word is stored as the dialog act and the rest of the line as the
# utterance content (an empty string when the line holds a single word).
dialog_list = []
# The context manager closes the file as soon as reading is finished, and it
# keeps the name `dialog_acts` reserved for the resulting array only.
with open("dialog_acts.dat", 'r') as dialog_file:
    for line in dialog_file:
        line = line.strip().lower()
        act, _, cont = line.partition(' ')
        dialog_list.append([act, cont])

# One row per line of the file: column 0 is the act, column 1 the content.
dialog_acts = np.array(dialog_list)
# print(dialog_acts)

Saving dialog_acts.dat to dialog_acts.dat


In [20]:
def split(data):
    """Separate an array into its first and second column.

    Args:
        data: Array that is indexed as ``data[:, 0]`` and ``data[:, 1]``.

    Returns:
        A ``(first, second)`` tuple. Each element is the selected column with
        a new axis of length one inserted at position 1, so a 2-D input of
        shape ``(n, 2)`` yields two arrays of shape ``(n, 1)``.
    """
    first_column = data[:, 0][:, np.newaxis]
    second_column = data[:, 1][:, np.newaxis]
    return first_column, second_column

# split() returns column 0 first and column 1 second, so `acts` receives the
# act labels and `contents` the utterance texts.
acts, contents = split(dialog_acts)

# 85% / 15% train/test partition of the utterances; random_state pins the
# shuffle so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    contents,
    acts,
    train_size=0.85,
    test_size=0.15,
    random_state=0,
)

In [27]:
# https://stackabuse.com/python-for-nlp-creating-bag-of-words-model-from-scratch/

def get_word_freq(contents, n_most=200):
    """Return the most frequent tokens found in ``contents``.

    Args:
        contents: Iterable of items whose element at index 0 is the text that
            is tokenized with ``nltk.word_tokenize``.
        n_most: Maximum number of tokens to return. Defaults to 200, the
            vocabulary size that used to be hard-coded.

    Returns:
        A list of at most ``n_most`` tokens ordered from the highest to the
        lowest occurrence count.
    """
    wordfreq = {}
    for content in contents:
        for token in nltk.word_tokenize(content[0]):
            # dict.get covers both the first sighting and later sightings of a
            # token with a single lookup.
            wordfreq[token] = wordfreq.get(token, 0) + 1

    return heapq.nlargest(n_most, wordfreq, key=wordfreq.get)


def get_bow(contents, most_freq):
    """Encode every content as a binary bag-of-words vector.

    Args:
        contents: Iterable of items whose element at index 0 is the text that
            is tokenized with ``nltk.word_tokenize``.
        most_freq: Vocabulary tokens; entry ``i`` of each vector refers to
            ``most_freq[i]``.

    Returns:
        A list with one vector (list of ints) per content. An entry is 1 when
        the vocabulary token occurs among the content's tokens and 0 otherwise.
    """
    content_vectors = []
    for content in contents:
        # Only presence matters, so the tokens go into a set: each vocabulary
        # lookup is then a hash probe instead of a scan over the token list.
        content_tokens = set(nltk.word_tokenize(content[0]))
        content_vectors.append(
            [1 if token in content_tokens else 0 for token in most_freq]
        )

    return content_vectors


# The vocabulary is built from the training utterances only; both splits are
# then encoded against that same vocabulary.
frequent_words = get_word_freq(X_train)

vect_X_train, vect_X_test = (
    get_bow(subset, frequent_words) for subset in (X_train, X_test)
)

In [28]:
# Randomized hyperparameter search for a decision tree: n_iter candidate
# settings are drawn from the value lists below and scored on accuracy with
# 2-fold stratified cross-validation.
estimator = tree.DecisionTreeClassifier(random_state=0)
param_distributions = {
    "max_depth": list(range(1, 1000)),
    "min_samples_split": list(range(2, 1000)),
    "min_samples_leaf": list(range(1, 1000)),
    "max_leaf_nodes": list(range(2, 1000)),
}
n_iter = 500
scoring = ["accuracy"]
refit = "accuracy"
cv = model_selection.StratifiedKFold(n_splits=2)

randomized_search_tree = model_selection.RandomizedSearchCV(
    estimator=estimator,
    param_distributions=param_distributions,
    n_iter=n_iter,
    scoring=scoring,
    cv=cv,
    refit=refit,
    random_state=0,
)

search_tree = randomized_search_tree.fit(vect_X_train, y_train)
# Tabular view of the per-candidate cross-validation results.
results_tree = pd.DataFrame(search_tree.cv_results_)

In [30]:
# Show the hyperparameters selected by the search and keep the corresponding
# estimator for evaluation.
best_params = search_tree.best_params_
print(best_params)
best_decision_tree = search_tree.best_estimator_

# Predict test set
labels_predict_search = best_decision_tree.predict(vect_X_test)

# Check metrics.
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0.0 (the value already shown in the
# report) without raising the undefined-metric warning on every call.
report_randomized_search = metrics.classification_report(
    y_test, labels_predict_search, zero_division=0
)
print(report_randomized_search)

{'min_samples_split': 79, 'min_samples_leaf': 1, 'max_leaf_nodes': 74, 'max_depth': 29}
              precision    recall  f1-score   support

         ack       0.00      0.00      0.00         3
      affirm       0.99      0.98      0.99       150
         bye       0.96      0.94      0.95        50
     confirm       0.71      0.68      0.70        22
        deny       0.00      0.00      0.00         3
       hello       1.00      0.86      0.92        14
      inform       0.94      0.98      0.96      1522
      negate       0.97      0.97      0.97        65
        null       0.96      0.74      0.83       244
      repeat       0.00      0.00      0.00         5
     reqalts       0.93      0.95      0.94       277
     reqmore       0.00      0.00      0.00         1
     request       0.99      0.98      0.98       973
     restart       0.00      0.00      0.00         2
    thankyou       0.99      0.99      0.99       495

    accuracy                           0.96   

  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Baseline: a decision tree with default hyperparameters, trained on the same
# bag-of-words vectors, for comparison with the tuned tree.
Decision_Tree = tree.DecisionTreeClassifier(random_state=0)
Decision_Tree.fit(vect_X_train, y_train)

labels_predict = Decision_Tree.predict(vect_X_test)

# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0.0 (the value already shown in the
# report) without raising the undefined-metric warning.
report = metrics.classification_report(y_test, labels_predict, zero_division=0)
print(report)

              precision    recall  f1-score   support

         ack       0.33      0.33      0.33         3
      affirm       1.00      0.99      0.99       150
         bye       1.00      0.94      0.97        50
     confirm       0.79      0.86      0.83        22
        deny       0.00      0.00      0.00         3
       hello       1.00      0.93      0.96        14
      inform       0.99      0.97      0.98      1522
      negate       0.98      0.98      0.98        65
        null       0.81      0.98      0.89       244
      repeat       1.00      0.80      0.89         5
     reqalts       0.98      0.94      0.96       277
     reqmore       0.00      0.00      0.00         1
     request       0.99      0.99      0.99       973
     restart       0.00      0.00      0.00         2
    thankyou       0.99      1.00      1.00       495

    accuracy                           0.97      3826
   macro avg       0.73      0.71      0.72      3826
weighted avg       0.98   

  _warn_prf(average, modifier, msg_start, len(result))
