In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import math
from sklearn import tree, ensemble, model_selection, metrics

import nltk
nltk.download('punkt')
import random
import string
import heapq
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import tensorflow 
from tensorflow import keras

[nltk_data] Downloading package punkt to
[nltk_data]     /home/aafjekapteijns/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Loading the data for the dialog acts and utterance contents
dialog_acts = open("dialog_acts.dat", 'r')

# Read in data, convert to lowercase strings
dialog_list = []
for line in dialog_acts: 
    line = line.strip()
    line = line.lower()
    act = line.split(' ')[0]
    cont = line[(len(act) + 1):]
    dialog_list.append([act, cont])

dialog_acts = np.array(dialog_list)
# print(dialog_acts)

In [3]:
# Function for splitting arrays in X and r
def split(data):
    return(data[:, 0, None], data[:, 1, None])

acts, contents = split(dialog_acts)
# print(acts.shape)

In [4]:
# Coens code

wordfreq = {}
for content in contents:
    tokens = nltk.word_tokenize(content[0])
    for token in tokens:
        if token not in wordfreq.keys():
            wordfreq[token] = 1
        else:
            wordfreq[token] += 1
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)

content_vectors = []
for content in contents:
    content_tokens = nltk.word_tokenize(content[0])
    sent_vec = []
    for token in most_freq:
        if token in content_tokens:
            sent_vec.append(1)
        else:
            sent_vec.append(0)
    content_vectors.append(sent_vec)
    
# print(content_vectors)

# for instance, label in zip(np.array(content_vectors),acts):
#     print(instance, label)

In [5]:
# Split data in training and test set (85%, 15%)
X_train, X_test, y_train, y_test = model_selection.train_test_split(np.array(content_vectors), acts, test_size=0.15, train_size=0.85, random_state=0)

# Check
# print(X_train.shape)
# print(X_train)
# print(y_train.shape)
# print(y_train)
# print(X_test.shape)
# print(X_test)
# print(y_test.shape)
# print(y_test)

In [19]:
estimator = tree.DecisionTreeClassifier(random_state=0)
param_distributions = {"max_depth": [x for x in range(1,1000)], "min_samples_split": [x for x in range(2,1000)],
                      "min_samples_leaf": [x for x in range(1,1000)], "max_leaf_nodes": [x for x in range(2,1000)]}
n_iter = 500
scoring = ["accuracy"]
cv = model_selection.StratifiedKFold(n_splits=2)
refit = "accuracy"

randomized_search_tree = model_selection.RandomizedSearchCV(estimator=estimator, param_distributions=param_distributions, n_iter=n_iter,
                                            scoring=scoring, cv=cv, refit=refit, random_state=0)

search_tree = randomized_search_tree.fit(X_train, y_train)
results_tree = pd.DataFrame(search_tree.cv_results_)

In [20]:
print(results_tree)

     mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0         0.332405      0.016682         0.114514        0.008857   
1         0.574465      0.004066         0.135124        0.012396   
2         0.894503      0.096034         0.131173        0.017501   
3         0.351149      0.005995         0.123347        0.007391   
4         0.522559      0.016429         0.122907        0.013346   
..             ...           ...              ...             ...   
495       0.356252      0.005467         0.130119        0.003062   
496       0.578414      0.005903         0.115021        0.001983   
497       0.619475      0.001912         0.119385        0.010196   
498       0.354531      0.008458         0.111358        0.002050   
499       0.358189      0.009827         0.116222        0.004843   

    param_min_samples_split param_min_samples_leaf param_max_leaf_nodes  \
0                       463                    665                  999   
1                    

In [21]:
best_params = search_tree.best_params_
print(best_params)
best_decision_tree = search_tree.best_estimator_

# Predict test set
labels_predict_search = best_decision_tree.predict(X_test)

# Check metrics
report_randomized_search = metrics.classification_report(y_test, labels_predict_search)
print(report_randomized_search)

{'min_samples_split': 79, 'min_samples_leaf': 1, 'max_leaf_nodes': 74, 'max_depth': 29}


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         ack       0.00      0.00      0.00         3
      affirm       0.99      0.98      0.99       150
         bye       0.96      0.94      0.95        50
     confirm       0.71      0.68      0.70        22
        deny       0.00      0.00      0.00         3
       hello       1.00      0.86      0.92        14
      inform       0.94      0.98      0.96      1522
      negate       0.97      0.97      0.97        65
        null       0.96      0.74      0.83       244
      repeat       0.00      0.00      0.00         5
     reqalts       0.93      0.95      0.94       277
     reqmore       0.00      0.00      0.00         1
     request       0.99      0.98      0.98       973
     restart       0.00      0.00      0.00         2
    thankyou       0.99      0.99      0.99       495

    accuracy                           0.96      3826
   macro avg       0.63      0.60      0.62      3826
weighted avg       0.96   