# TF IDF

## Requirements 
Installing requirements using pip

In [None]:
pip install -r requirements.txt

In [4]:
# Imports
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline

from statistics import mean

import numpy as np
import pandas as pd

import _helpers as hp

## Load Data
For chosing which ASR data change the variable by the key in the paths dictionary

In [5]:
# Defining paths for our data. "ASR" means the TDNNF-LFMMI method, and "new_ASR" means the Wave2vec method.
paths = {
    "ground_truth": "snips/merged_GT_data.csv",
    "ASR": "snips/ASR_with_labels.csv",
    "new_ASR": "snips/new_ASR_with_labels.csv",
    "new_ASR_Autocorrect": "snips/new_ASR_Autocorrection_with_labels.csv",
}

# Chosing which ASR data we will use
ASR_data = pd.read_csv(paths["new_ASR"])

# Groundtruth data
GT_data = pd.read_csv(paths["ground_truth"])

In [144]:
ASR_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,transcript,user_action,user_action_num
0,0,0,active igtl like an the entire house,SwitchLightOn,1
1,1,1,activate basement lights,SwitchLightOn,1
2,2,2,a djust the bedroom light in tentity of thirty...,SetLightBrightness,4
3,3,3,can you please change the light color to pink,SetLightColor,5
4,4,4,said the rightness to file,SetLightBrightness,4
...,...,...,...,...,...
1655,1655,1655,turn the large meeting room green,SetLightColor,5
1656,1656,1656,turn the laundry room lights to twenty two,SetLightBrightness,4
1657,1657,1657,don't the light intensity to level thirty nine,SetLightBrightness,4
1658,1658,1658,turned the late on,SwitchLightOn,1


### DATA manipulation

In [145]:
# Get ASR data into a numpy word array per sentence plus a numpy label array

XX_ASR = ASR_data["transcript"].apply(lambda x: x.split(" "))
X_ASR = list(XX_ASR)  # numpy word array per transcript


y_num_ASR = np.array(ASR_data["user_action_num"])  # labels

In [146]:
# Get Groundtruth data into a numpy word array per sentence plus a numpy label array

XX_GT = GT_data["transcript"].apply(lambda x: x.split(" "))
X_GT = list(XX_GT)  # numpy word array per transcript

y_num_GT = np.array(GT_data["user_action_num"])  # labels

In [147]:
# concat the 2 arrays into one
X_all = X_ASR + X_GT

## Keyword to user action

Label each sentence in the dataset acording with keywords to user action

## TF IDF vector representation 

### Create TD IDF model based in the dictionary of our dataset 

In [148]:
# Create TD*IDF vector represenation
dct_X = Dictionary(X_all)
corpusBOW = [dct_X.doc2bow(line) for line in X_all]
model = TfidfModel(corpusBOW)


corpusBOW_ASR = [dct_X.doc2bow(line) for line in X_ASR]
X_ASR_vec = model[corpusBOW_ASR]


corpusBOW_GT = [dct_X.doc2bow(line) for line in X_GT]
X_GT_vec = model[corpusBOW_GT]

In [149]:
def Tfidf2np(X, dct):
    """
    Converts gensim format to numpy array
    Input:
    X - TDidfModel vector (N x lenght("sentence"))
    dct - Dictionary object (lenght("unique words"))
    Output:
    X_np - N x length("unique words")
    """
    N_dict = len(dct)
    N_sent = len(X)
    X_np = np.zeros((N_sent, N_dict))
    i = 0
    for _list in X:
        for word in _list:
            X_np[i, word[0]] = word[1]
        i += 1
    return X_np

In [150]:
# Tokenize data
X_ASR_np = Tfidf2np(X_ASR_vec, dct_X)
X_GT_np = Tfidf2np(X_GT_vec, dct_X)

Lets create a function that vectorizes a sentence based in a embeded text model

In [151]:
def get_TFIDF_feature(sent, dct_X, model):
    """return a vector representative of a string"""

    corpusBOW_sent = [dct_X.doc2bow(line) for line in [sent]]

    sent_vec = model[corpusBOW_sent]

    sent_np = Tfidf2np(sent_vec, dct_X)

    return sent_np

# Classifiers

## Train/Test Split

The function beneath provides the features and labels needed for testing. Using the loaded ASR or not (then using ground truth data) is decided by input. As standard we use the ASR dataset.

In [152]:
def get_train_test_data(type_of_dataset="ASR", train_size=0.9):
    """Retrieves the relevant dataset and splits according to parameter"""
    # If ASR, give ASR features and labels
    if type_of_dataset == "ASR":
        train_features, test_features, train_labels, test_labels = train_test_split(
            X_ASR_np, y_num_ASR, train_size=train_size
        )
    # If the dataset is not the ASR data, use the ground truth data
    else:
        train_features, test_features, train_labels, test_labels = train_test_split(
            X_GT_np, y_num_GT, train_size=train_size
        )

    return train_features, test_features, train_labels, test_labels


def run_classifier(
    classifier_pipe, type_of_dataset="ASR", train_size=0.9, number_of_times=100
):
    """For running the classifiers multiple times, and returning mean accuracy score. Wraps around get_train_test_data"""
    mean_score_list = []
    n = number_of_times
    for i in range(n):
        train_features, test_features, train_labels, test_labels = get_train_test_data(
            type_of_dataset="ASR"
        )
        classifier_pipe.fit(train_features, train_labels)

        classifier_pred_labels = classifier_pipe.predict(test_features)  # predictions

        classifier_score = classifier_pipe.score(test_features, test_labels)  # accuracy

        mean_score_list.append(lgr_score)
    return mean_score, classifier_pred_labels, classifier_score

## Logistic Regression

In [153]:
lgr = LogisticRegression(max_iter=1000)  # Create the classification model

lgr_pipe = make_pipeline(preprocessing.StandardScaler(), lgr)  # Scale feature space

mean_score, lgr_pred_labels, lgr_score = run_classifier(
    classifier_pipe=lgr_pipe,
    type_of_dataset="ASR",
    train_size=0.9,
    number_of_times=100,
)

print("Average accuracy score =", round(mean_score, 3))

Average accuracy score = 0.843


### Model Evaluation

In [154]:
print(
    classification_report(
        test_labels,
        lgr_pred_labels,
        target_names=[
            "SwitchLightOff",
            "SwitchLightOn",
            "IncreaseBrightness",
            "DecreaseBrightness",
            "SetLightBrightness",
            "SetLightColor",
        ],
    )
)

print(confusion_matrix(test_labels, lgr_pred_labels))

print("\nACCURACY:", lgr_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.16      0.11      0.13        36
     SwitchLightOn       0.12      0.09      0.10        33
IncreaseBrightness       0.11      0.17      0.13        18
DecreaseBrightness       0.17      0.18      0.17        28
SetLightBrightness       0.24      0.26      0.25        27
     SetLightColor       0.14      0.17      0.15        24

          accuracy                           0.16       166
         macro avg       0.16      0.16      0.16       166
      weighted avg       0.16      0.16      0.15       166

[[ 4  5  5  7  5 10]
 [ 6  3  4  8  4  8]
 [ 3  4  3  3  1  4]
 [ 1  7  5  5  9  1]
 [ 6  4  6  2  7  2]
 [ 5  3  4  5  3  4]]

ACCURACY: 0.8373493975903614


## Naive Bayes

### Multinomial Naïve Bayes

In [155]:
mnb = MultinomialNB()  # Create NB model

mnb_pipe = make_pipeline(preprocessing.Normalizer(), mnb)  # Scale feature space

mean_score, mnb_pred_labels, mnb_score = run_classifier(
    classifier_pipe=mnb_pipe,
    type_of_dataset="ASR",
    train_size=0.9,
    number_of_times=100,
)

print("Average accuracy score =", round(mean_score, 3))

Average accuracy score = 0.843


#### Model Evaluation

In [156]:
print(
    classification_report(
        test_labels,
        mnb_pred_labels,
        target_names=[
            "SwitchLightOff",
            "SwitchLightOn",
            "IncreaseBrightness",
            "DecreaseBrightness",
            "SetLightBrightness",
            "SetLightColor",
        ],
    )
)

print(confusion_matrix(test_labels, mnb_pred_labels))

print("\nACCURACY:", mnb_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.14      0.08      0.10        36
     SwitchLightOn       0.19      0.18      0.19        33
IncreaseBrightness       0.20      0.28      0.23        18
DecreaseBrightness       0.15      0.14      0.15        28
SetLightBrightness       0.15      0.15      0.15        27
     SetLightColor       0.06      0.08      0.07        24

          accuracy                           0.14       166
         macro avg       0.15      0.15      0.15       166
      weighted avg       0.15      0.14      0.14       166

[[ 3  6  6  7  5  9]
 [ 8  6  2  6  6  5]
 [ 2  2  5  3  3  3]
 [ 3  3  6  4  5  7]
 [ 4  4  3  4  4  8]
 [ 2 10  3  3  4  2]]

ACCURACY: 0.8253012048192772


### Bernoulli naïve Bayes

In [157]:
bnb = BernoulliNB()  # Create the classification model

bnb_pipe = make_pipeline(preprocessing.Normalizer(), bnb)  # Scale feature space

mean_score, bnb_pred_labels, bnb_score = run_classifier(
    classifier_pipe=bnb_pipe,
    type_of_dataset="ASR",
    train_size=0.9,
    number_of_times=100,
)

print("Average accuracy score =", round(mean_score, 3))

Average accuracy score = 0.843


#### Model Evaluation

In [158]:
print(
    classification_report(
        test_labels,
        bnb_pred_labels,
        target_names=[
            "SwitchLightOff",
            "SwitchLightOn",
            "IncreaseBrightness",
            "DecreaseBrightness",
            "SetLightBrightness",
            "SetLightColor",
        ],
    )
)

print(confusion_matrix(test_labels, bnb_pred_labels))

print("\nACCURACY:", bnb_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.11      0.08      0.10        36
     SwitchLightOn       0.17      0.12      0.14        33
IncreaseBrightness       0.07      0.11      0.09        18
DecreaseBrightness       0.13      0.11      0.12        28
SetLightBrightness       0.13      0.15      0.14        27
     SetLightColor       0.26      0.38      0.31        24

          accuracy                           0.15       166
         macro avg       0.15      0.16      0.15       166
      weighted avg       0.15      0.15      0.15       166

[[ 3  3  6  8 11  5]
 [ 9  4  8  3  6  3]
 [ 4  2  2  5  2  3]
 [ 3  3  6  3  6  7]
 [ 5  6  2  2  4  8]
 [ 3  5  3  2  2  9]]

ACCURACY: 0.8433734939759037


### Gaussian Naive Bayes

In [159]:
gnb = GaussianNB()  # Create the classification model

gnb_pipe = make_pipeline(preprocessing.StandardScaler(), gnb)  # Scale feature space

mean_score, gnb_pred_labels, gnb_score = run_classifier(
    classifier_pipe=gnb_pipe,
    type_of_dataset="ASR",
    train_size=0.9,
    number_of_times=100,
)

print("Average accuracy score =", round(mean_score, 3))

Average accuracy score = 0.843


#### Model Evaluation

In [160]:
print(
    classification_report(
        test_labels,
        gnb_pred_labels,
        target_names=[
            "SwitchLightOff",
            "SwitchLightOn",
            "IncreaseBrightness",
            "DecreaseBrightness",
            "SetLightBrightness",
            "SetLightColor",
        ],
    )
)

print(confusion_matrix(test_labels, gnb_pred_labels))

print("\nACCURACY:", gnb_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.35      0.22      0.27        36
     SwitchLightOn       0.07      0.09      0.08        33
IncreaseBrightness       0.13      0.28      0.18        18
DecreaseBrightness       0.10      0.07      0.08        28
SetLightBrightness       0.09      0.07      0.08        27
     SetLightColor       0.14      0.12      0.13        24

          accuracy                           0.14       166
         macro avg       0.15      0.14      0.14       166
      weighted avg       0.16      0.14      0.14       166

[[ 8 11 10  4  0  3]
 [ 3  3  9  3  9  6]
 [ 1  6  5  3  0  3]
 [ 3 10  5  2  7  1]
 [ 5  6  5  3  2  6]
 [ 3  5  4  5  4  3]]

ACCURACY: 0.6807228915662651


# SVM

In [161]:
svm = SVC(C=1)  # Create the classification model

svm_pipe = make_pipeline(preprocessing.Normalizer(), svm)  # Scale feature space

mean_score, svm_pred_labels, svm_score = run_classifier(
    classifier_pipe=svm_pipe,
    type_of_dataset="ASR",
    train_size=0.9,
    number_of_times=100,
)

print("Average accuracy score =", round(mean_score, 3))

Average accuracy score = 0.843


### Model Evaluation

In [162]:
print(
    classification_report(
        test_labels,
        svm_pred_labels,
        target_names=[
            "SwitchLightOff",
            "SwitchLightOn",
            "IncreaseBrightness",
            "DecreaseBrightness",
            "SetLightBrightness",
            "SetLightColor",
        ],
    )
)

print(confusion_matrix(test_labels, svm_pred_labels))

print("\nACCURACY:", svm_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.16      0.14      0.15        36
     SwitchLightOn       0.17      0.12      0.14        33
IncreaseBrightness       0.17      0.22      0.20        18
DecreaseBrightness       0.30      0.29      0.29        28
SetLightBrightness       0.21      0.19      0.20        27
     SetLightColor       0.27      0.42      0.33        24

          accuracy                           0.22       166
         macro avg       0.21      0.23      0.22       166
      weighted avg       0.21      0.22      0.21       166

[[ 5  5  7  7  5  7]
 [ 8  4  7  3  4  7]
 [ 3  2  4  1  4  4]
 [ 7  4  2  8  2  5]
 [ 4  6  2  6  5  4]
 [ 5  2  1  2  4 10]]

ACCURACY: 0.8554216867469879


# NEURAL NETWORK

In [163]:
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100), activation="relu", solver="adam", max_iter=5000
)  # Create the classification model

mlp_pipe = make_pipeline(preprocessing.Normalizer(), mlp)  # Scale feature space

mean_score, mlp_pred_labels, mlp_score = run_classifier(
    classifier_pipe=mlp_pipe,
    type_of_dataset="ASR",
    train_size=0.9,
    number_of_times=100,
)

print("Average accuracy score =", round(mean_score, 3))

Average accuracy score = 0.843


### Model Evaluation

In [164]:
print(
    classification_report(
        test_labels,
        mlp_pred_labels,
        target_names=[
            "SwitchLightOff",
            "SwitchLightOn",
            "IncreaseBrightness",
            "DecreaseBrightness",
            "SetLightBrightness",
            "SetLightColor",
        ],
    )
)

print(confusion_matrix(test_labels, mlp_pred_labels))

print("\nACCURACY:", mlp_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.20      0.17      0.18        36
     SwitchLightOn       0.17      0.15      0.16        33
IncreaseBrightness       0.14      0.17      0.15        18
DecreaseBrightness       0.15      0.14      0.15        28
SetLightBrightness       0.16      0.19      0.17        27
     SetLightColor       0.19      0.21      0.20        24

          accuracy                           0.17       166
         macro avg       0.17      0.17      0.17       166
      weighted avg       0.17      0.17      0.17       166

[[6 7 6 5 8 4]
 [8 5 2 6 5 7]
 [3 5 3 4 1 2]
 [2 5 5 4 8 4]
 [7 3 1 6 5 5]
 [4 4 5 2 4 5]]

ACCURACY: 0.8012048192771084


# Try Yourself

In [165]:
def user_friendly(sentence, cls):
    """return action from sentence"""

    sent = sentence.split()
    new_sent = []
    for word in sent:
        new_sent.append(hp.autocorrection(word))

    sent_np = get_TFIDF_feature(new_sent, dct_X, model)
    y_pred = cls.predict(sent_np)
    return hp.indx2action(y_pred)

In [166]:
user_friendly("I want a increase in the lights of my living room", lgr_pipe)

['IncreaseBrightness']