In [1]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import plotly.express as px

import nbformat


In [2]:
# split input in dialog act and sentence
split_lines = []
with open('dialog_acts.dat', 'r') as f:  
    for line in f:
        split_lines.append(line.rstrip().split(' ', 1))
    
data = pd.DataFrame(split_lines, columns=['dialog_act', 'sentence'])

data.head()

Unnamed: 0,dialog_act,sentence
0,inform,im looking for a moderately priced restaurant ...
1,inform,any part of town
2,inform,bistro food
3,confirm,is there a moderately priced restaurant that s...
4,affirm,yes


In [3]:
# Split input into training and test data

x_train, x_test, y_train, y_test = train_test_split( data["sentence"], data["dialog_act"] , test_size=0.15, random_state=42)

In [4]:
# A simple baseline model that always predicts "inform"

def base_line1(data):
    return "inform"

In [5]:
#print the most comman sentences in the training data
print(data.value_counts().to_string())

dialog_act  sentence                                                                                                              
thankyou    thank you good bye                                                                                                        2565
request     phone number                                                                                                               914
affirm      yes                                                                                                                        803
request     address                                                                                                                    678
inform      i dont care                                                                                                                613
null        noise                                                                                                                      408
            sil                    

In [6]:
# A second baseline model that uses a set of rules to predict the dialog act
# The rules are based on the words in the sentence
# TODO: Improve the rules to increase the accuracy of the model

rules = {
    "ack": ["okay", "uhm", "fine", "sure"],
    "affirm": ["yes", "right", "good", "do that", "agreed"],
    "bye": ["bye", "goodbye", "see you", "see you later"],
    "repeat": ["repeat"],
    "reqalts": ["how", "else" ],
    "reqmore": ["more"],
    "request": ["address", "post", "what", "postal", "phone"],
    "confirm": ["is it", "does it"],
    "deny": ["not", "dont"],
    "hello": ["hello", "hi"],    
    "negate": ["no"],
    "null": ["cough", "noise","sil", "unintelligible"],   
    "restart": ["start over"],
    "thankyou" : ["thank" ],
    "inform" : ["looking", "any", "matter","care", "north" ,"west", "east", "south" "european", "italian", "korean", "food",
                "moderate" ,  "cheap","expensive", 
                ],
}

def base_line2(x):
    for key, value in rules.items():
        for v in value:
            if v in x:
                return key
    return "null"

In [7]:
# Evaluate the baseline models

y_baseline1 = x_test.apply(base_line1)
y_baseline2 = x_test.apply(base_line2)

print("Baseline 1:")
print(classification_report(y_test, y_baseline1))

print("Baseline 2:")
print(classification_report(y_test, y_baseline2))

Baseline 1:
              precision    recall  f1-score   support

         ack       0.00      0.00      0.00         5
      affirm       0.00      0.00      0.00       180
         bye       0.00      0.00      0.00        35
     confirm       0.00      0.00      0.00        22
        deny       0.00      0.00      0.00         6
       hello       0.00      0.00      0.00        14
      inform       0.40      1.00      0.57      1532
      negate       0.00      0.00      0.00        69
        null       0.00      0.00      0.00       232
      repeat       0.00      0.00      0.00         3
     reqalts       0.00      0.00      0.00       279
     reqmore       0.00      0.00      0.00         1
     request       0.00      0.00      0.00       972
     restart       0.00      0.00      0.00         2
    thankyou       0.00      0.00      0.00       474

    accuracy                           0.40      3826
   macro avg       0.03      0.07      0.04      3826
weighted avg  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Dedupe the data for the machine learning models

deduped_data = data.drop_duplicates(subset='sentence')

# Split the deduped data into training and test data

x_train_deduped, x_test_deduped, y_train_deduped, y_test_deduped = train_test_split( deduped_data["sentence"], deduped_data["dialog_act"] , test_size=0.15, random_state=42)
x_train_dupe, x_test_dupe, y_train_dupe, y_test_dupe = train_test_split( data["sentence"], data["dialog_act"] , test_size=0.15, random_state=42)

In [9]:
# Use a simple bag of words model to vectorize the input data
# TODO: also train the models on the non deduped data

count_vectorizer = CountVectorizer()
x_train_count = count_vectorizer.fit_transform(x_train_deduped)

# Create a visual representation of the data using PCA
# this can be used to get a feeling for the data and to see if the data is separable

pca = PCA(n_components=3)
x_train_pca = pca.fit_transform(x_train_count.toarray())

fig = px.scatter_3d(x=x_train_pca[:,0], y=x_train_pca[:,1], z=x_train_pca[:,2], color=y_train_deduped)

fig.show()

In [10]:
# do the pca with the duped data
count_vectorizer_dupe = CountVectorizer()
x_train_count_dupe = count_vectorizer_dupe.fit_transform(x_train_dupe)

pca = PCA(n_components=3)

x_train_pca_dupe = pca.fit_transform(x_train_count_dupe.toarray())

fig = px.scatter_3d(x=x_train_pca_dupe[:,0], y=x_train_pca_dupe[:,1], z=x_train_pca_dupe[:,2], color=y_train_dupe)

fig.show()

In [11]:
#compare the len on non deduped and deduped data
print("non deduped len:",len(data))
print("deduped len:",len(deduped_data))

non deduped len: 25501
deduped len: 5359


In [12]:
# Train a logistic regression model on the data

logistic_regression_model = LogisticRegression()

logistic_regression_model.fit(x_train_count, y_train_deduped)

x_test_count = count_vectorizer.transform(x_test_deduped)

y_pred = logistic_regression_model.predict(x_test_count)

print(classification_report(y_test_deduped, y_pred))

              precision    recall  f1-score   support

         ack       0.00      0.00      0.00         3
      affirm       1.00      0.83      0.91        24
         bye       1.00      0.56      0.71         9
     confirm       0.87      0.76      0.81        17
        deny       0.00      0.00      0.00         1
       hello       1.00      0.80      0.89         5
      inform       0.90      0.97      0.93       447
      negate       0.96      0.96      0.96        25
        null       0.57      0.26      0.36        46
      repeat       1.00      1.00      1.00         1
     reqalts       0.89      0.92      0.90        77
     request       0.94      0.95      0.94       135
     restart       1.00      1.00      1.00         1
    thankyou       0.92      0.85      0.88        13

    accuracy                           0.90       804
   macro avg       0.79      0.70      0.74       804
weighted avg       0.89      0.90      0.89       804




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [13]:
# Train a random forest model on the data

random_forest_model = RandomForestClassifier()

random_forest_model.fit(x_train_count, y_train_deduped)

x_test_count = count_vectorizer.transform(x_test_deduped)

y_pred = random_forest_model.predict(x_test_count)

print(classification_report(y_test_deduped, y_pred))

              precision    recall  f1-score   support

         ack       0.00      0.00      0.00         3
      affirm       1.00      0.71      0.83        24
         bye       1.00      0.56      0.71         9
     confirm       0.80      0.94      0.86        17
        deny       0.00      0.00      0.00         1
       hello       1.00      0.40      0.57         5
      inform       0.90      0.95      0.92       447
      negate       1.00      0.92      0.96        25
        null       0.43      0.33      0.37        46
      repeat       1.00      1.00      1.00         1
     reqalts       0.89      0.83      0.86        77
     request       0.93      0.96      0.95       135
     restart       1.00      1.00      1.00         1
    thankyou       0.87      1.00      0.93        13

    accuracy                           0.88       804
   macro avg       0.77      0.68      0.71       804
weighted avg       0.88      0.88      0.88       804




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [14]:
# train a knn model on teh data 

knn_model = KNeighborsClassifier(n_neighbors=5, metric="manhattan"  , algorithm="kd_tree"  , n_jobs=None)
 
knn_model.fit(x_train_count, y_train_deduped)

x_test_count = count_vectorizer.transform(x_test_deduped)

y_pred = knn_model.predict(x_test_count)

print(classification_report(y_test_deduped, y_pred))



cannot use tree with sparse input: using brute force



              precision    recall  f1-score   support

         ack       0.00      0.00      0.00         3
      affirm       0.83      0.21      0.33        24
         bye       0.83      0.56      0.67         9
     confirm       0.80      0.94      0.86        17
        deny       0.00      0.00      0.00         1
       hello       1.00      0.20      0.33         5
      inform       0.84      0.97      0.90       447
      negate       1.00      0.52      0.68        25
        null       0.39      0.20      0.26        46
      repeat       1.00      1.00      1.00         1
     reqalts       0.89      0.77      0.83        77
     request       0.91      0.93      0.92       135
     restart       1.00      1.00      1.00         1
    thankyou       0.77      0.77      0.77        13

    accuracy                           0.85       804
   macro avg       0.73      0.58      0.61       804
weighted avg       0.83      0.85      0.83       804




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [15]:
# Train a logistic regression model on the duped data

logistic_regression_model = LogisticRegression()

logistic_regression_model.fit(x_train_count_dupe, y_train_dupe)

x_test_count = count_vectorizer_dupe.transform(x_test_dupe)

y_pred = logistic_regression_model.predict(x_test_count)

print(classification_report(y_test_dupe, y_pred))


              precision    recall  f1-score   support

         ack       0.00      0.00      0.00         5
      affirm       0.99      0.98      0.99       180
         bye       0.94      0.91      0.93        35
     confirm       0.81      0.77      0.79        22
        deny       1.00      0.50      0.67         6
       hello       1.00      0.93      0.96        14
      inform       0.98      0.99      0.98      1532
      negate       1.00      0.99      0.99        69
        null       0.97      0.93      0.95       232
      repeat       1.00      0.67      0.80         3
     reqalts       0.95      0.97      0.96       279
     reqmore       0.00      0.00      0.00         1
     request       1.00      1.00      1.00       972
     restart       1.00      1.00      1.00         2
    thankyou       1.00      1.00      1.00       474

    accuracy                           0.98      3826
   macro avg       0.84      0.78      0.80      3826
weighted avg       0.98   


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [16]:
# Train a random forest model on the duped data

random_forest_model = RandomForestClassifier()

random_forest_model.fit(x_train_count_dupe, y_train_dupe)

x_test_count = count_vectorizer_dupe.transform(x_test_dupe)

y_pred = random_forest_model.predict(x_test_count)

print(classification_report(y_test_dupe, y_pred))


              precision    recall  f1-score   support

         ack       1.00      0.40      0.57         5
      affirm       0.99      0.94      0.97       180
         bye       0.97      1.00      0.99        35
     confirm       0.83      0.86      0.84        22
        deny       1.00      1.00      1.00         6
       hello       1.00      0.79      0.88        14
      inform       0.98      0.98      0.98      1532
      negate       1.00      0.96      0.98        69
        null       0.89      0.97      0.93       232
      repeat       1.00      0.67      0.80         3
     reqalts       0.97      0.97      0.97       279
     reqmore       1.00      1.00      1.00         1
     request       1.00      1.00      1.00       972
     restart       1.00      0.50      0.67         2
    thankyou       1.00      1.00      1.00       474

    accuracy                           0.98      3826
   macro avg       0.98      0.87      0.90      3826
weighted avg       0.98   

In [17]:
# train a knn model on teh duped data  

knn_model = KNeighborsClassifier(n_neighbors=3, metric="manhattan"  , algorithm="kd_tree"  , n_jobs=None)

knn_model.fit(x_train_count_dupe, y_train_dupe)

x_test_count = count_vectorizer_dupe.transform(x_test_dupe)

y_pred = knn_model.predict(x_test_count)

print(classification_report(y_test_dupe, y_pred))



cannot use tree with sparse input: using brute force



              precision    recall  f1-score   support

         ack       1.00      0.20      0.33         5
      affirm       1.00      0.86      0.92       180
         bye       0.89      0.94      0.92        35
     confirm       0.86      0.82      0.84        22
        deny       1.00      0.83      0.91         6
       hello       1.00      0.57      0.73        14
      inform       0.96      0.98      0.97      1532
      negate       1.00      0.71      0.83        69
        null       0.87      0.98      0.92       232
      repeat       1.00      0.67      0.80         3
     reqalts       0.97      0.95      0.96       279
     reqmore       1.00      1.00      1.00         1
     request       0.99      1.00      0.99       972
     restart       1.00      1.00      1.00         2
    thankyou       1.00      1.00      1.00       474

    accuracy                           0.97      3826
   macro avg       0.97      0.83      0.87      3826
weighted avg       0.97   