# TF IDF

## Requirements 

In [1]:
# Imports
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline


import numpy as np
import pandas as pd

import _helpers as hp

## Load Data

In [2]:
# ASR data with improved speech recognition (15555 framerate) and autocorrection applied
ASR_data = pd.read_csv(filepath_or_buffer="snips/new_ASR_with_labels.csv")

# Groundtruth data
GT_data = pd.read_csv(filepath_or_buffer="snips/merged_GT_data.csv")

In [190]:
# Preview the loaded ASR data
ASR_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,transcript,user_action,user_action_num
0,0,0,activate light all like and the entire house,SwitchLightOn,1
1,1,1,activate basement lights,SwitchLightOn,1
2,2,2,adjust the bedroom light intensity of thirty nine,SetLightBrightness,4
3,3,3,can you please change the light color to pink,SetLightColor,5
4,4,4,rid the brightness to toilet,SetLightBrightness,4
...,...,...,...,...,...
1655,1655,1655,turn the large meeting room green,SetLightColor,5
1656,1656,1656,turn the laundry room lights to twenty two,SetLightBrightness,4
1657,1657,1657,don't the light intensity to level thirty nine,SetLightBrightness,4
1658,1658,1658,turned the flat on,SwitchLightOn,1


### DATA manipulation

In [191]:
# Split every ASR transcript on single spaces and collect the numeric labels

XX_ASR = ASR_data["transcript"].map(lambda transcript: transcript.split(' '))
X_ASR = XX_ASR.tolist()  # Python list holding one token list per transcript

y_num_ASR = ASR_data["user_action_num"].to_numpy(copy=True)  # labels as a numpy array


In [192]:
# Split every ground-truth transcript on single spaces and collect the numeric labels

XX_GT = GT_data["transcript"].map(lambda transcript: transcript.split(' '))
X_GT = XX_GT.tolist()  # Python list holding one token list per transcript

y_num_GT = GT_data["user_action_num"].to_numpy(copy=True)  # labels as a numpy array

In [193]:
# Join both token-list collections into one corpus (ASR first, then ground truth)
X_all = [*X_ASR, *X_GT]

## Keyword to user action

Label each sentence in the dataset according to the keywords that map to a user action

## TF IDF vector representation 

### Create TF-IDF model based on the dictionary of our dataset

In [194]:
# Create the TF-IDF vector representation.
# The vocabulary and the IDF weights are learned from the ASR and
# ground-truth transcripts together (X_all).
dct_X = Dictionary(documents=X_all)
corpusBOW = list(map(dct_X.doc2bow, X_all))
model = TfidfModel(corpus=corpusBOW)


# Bag-of-words, then TF-IDF weighting, for the ASR transcripts
corpusBOW_ASR = list(map(dct_X.doc2bow, X_ASR))
X_ASR_vec = model[corpusBOW_ASR]


# Bag-of-words, then TF-IDF weighting, for the ground-truth transcripts
corpusBOW_GT = list(map(dct_X.doc2bow, X_GT))
X_GT_vec = model[corpusBOW_GT]

In [195]:
def Tfidf2np(X,dct):
    """
    Expand sparse (word id, weight) vectors into a dense numpy matrix.

    Input:
    X - sized iterable of N documents, each a sequence of (word id, weight) pairs
    dct - sized vocabulary object; len(dct) gives the number of columns
    Output:
    X_np - N x len(dct) float array, zero wherever a word id does not occur
    """
    dense = np.zeros((len(X), len(dct)))
    for row, doc in enumerate(X):
        for word_id, weight in doc:
            dense[row, word_id] = weight
    return dense

In [196]:
# Convert the sparse TF-IDF vectors into dense (sentences x vocabulary) matrices
X_ASR_np = Tfidf2np(X=X_ASR_vec, dct=dct_X)
X_GT_np = Tfidf2np(X=X_GT_vec, dct=dct_X)

Let's create a function that vectorizes a sentence based on an embedded text model

In [197]:
def get_TFIDF_feature(sent,dct_X, model):
    """Return the dense TF-IDF matrix (one row) for a single tokenised sentence.

    sent - list of word tokens
    dct_X - Dictionary used to map tokens to ids
    model - TF-IDF model applied to the bag-of-words vector
    """
    bow_batch = [dct_X.doc2bow(sent)]  # batch containing just this sentence
    return Tfidf2np(model[bow_batch], dct_X)

# Classifiers

## Train/Test Split

If you want to train and test only with ground-truth data, uncomment the next cell and comment out the remaining cells in this section.

In [198]:
# # Split into training and test data

# train_features, test_features, train_labels, test_labels = train_test_split(X_GT_np, y_num_GT, train_size= 0.9)

If you want to train and test only with ASR data, uncomment the next cell and comment out the remaining cells in this section.

In [199]:
# # Split into training and test data

# train_features, test_features, train_labels, test_labels = train_test_split(X_ASR_np, y_num_ASR, train_size= 0.9)

If you want to train with ground-truth data and test with ASR data, uncomment the next cell and comment out the remaining cells in this section.

In [204]:
# Train on the ground-truth features, test on the ASR features
train_features, train_labels = X_GT_np, y_num_GT
test_features, test_labels = X_ASR_np, y_num_ASR

In [205]:
# Shape of the ground-truth label array
y_num_GT.shape

(2224,)

## Logistic Regression

In [206]:
# Logistic regression capped at 1000 solver iterations
lgr = LogisticRegression(max_iter=1000)

# Standardise the features ahead of the classifier and fit on the training split
# (fit returns the pipeline itself)
lgr_pipe = make_pipeline(preprocessing.StandardScaler(), lgr).fit(train_features, train_labels)

# Accuracy and predicted labels on the test split
lgr_score = lgr_pipe.score(test_features, test_labels)
lgr_pred_labels = lgr_pipe.predict(test_features)

### Model Evaluation

In [207]:

# Per-class precision / recall / F1 for the logistic-regression predictions
print(classification_report(
    y_true=test_labels,
    y_pred=lgr_pred_labels,
    target_names=['SwitchLightOff', 'SwitchLightOn', 'IncreaseBrightness',
                  'DecreaseBrightness', 'SetLightBrightness', "SetLightColor"],
))

# Rows are the true classes, columns the predicted classes
print(confusion_matrix(y_true=test_labels, y_pred=lgr_pred_labels))

print("\nACCURACY:", lgr_score)

 

                    precision    recall  f1-score   support

    SwitchLightOff       0.75      0.78      0.76       276
     SwitchLightOn       0.81      0.83      0.82       257
IncreaseBrightness       0.79      0.86      0.83       269
DecreaseBrightness       0.79      0.84      0.81       268
SetLightBrightness       0.92      0.84      0.88       296
     SetLightColor       0.91      0.83      0.87       294

          accuracy                           0.83      1660
         macro avg       0.83      0.83      0.83      1660
      weighted avg       0.83      0.83      0.83      1660

[[214  19  14  16   8   5]
 [ 12 213  13   7   8   4]
 [ 14   8 232   9   2   4]
 [  8  14  18 224   1   3]
 [ 20   2   7  11 249   7]
 [ 16   7   8  15   4 244]]

ACCURACY: 0.8289156626506025


## Naive Bayes

### Multinomial Naïve Bayes

In [208]:
# Multinomial Naive Bayes classifier
mnb = MultinomialNB()

# Rescale every sample to unit norm ahead of the classifier and fit on the training split
mnb_pipe = make_pipeline(preprocessing.Normalizer(), mnb).fit(train_features, train_labels)

# Accuracy and predicted labels on the test split
mnb_score = mnb_pipe.score(test_features, test_labels)
mnb_pred_labels = mnb_pipe.predict(test_features)

#### Model Evaluation

In [209]:
# Per-class precision / recall / F1 for the multinomial Naive Bayes predictions
print(classification_report(
    y_true=test_labels,
    y_pred=mnb_pred_labels,
    target_names=['SwitchLightOff', 'SwitchLightOn', 'IncreaseBrightness',
                  'DecreaseBrightness', 'SetLightBrightness', "SetLightColor"],
))

# Rows are the true classes, columns the predicted classes
print(confusion_matrix(y_true=test_labels, y_pred=mnb_pred_labels))

print("\nACCURACY:", mnb_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.81      0.75      0.78       276
     SwitchLightOn       0.82      0.85      0.84       257
IncreaseBrightness       0.80      0.88      0.84       269
DecreaseBrightness       0.84      0.82      0.83       268
SetLightBrightness       0.91      0.85      0.88       296
     SetLightColor       0.87      0.89      0.88       294

          accuracy                           0.84      1660
         macro avg       0.84      0.84      0.84      1660
      weighted avg       0.84      0.84      0.84      1660

[[208  15  15  16  10  12]
 [ 11 219  14   4   3   6]
 [  8   6 236   8   6   5]
 [  8  14  14 221   4   7]
 [ 11   5  11   8 253   8]
 [ 10   8   5   7   3 261]]

ACCURACY: 0.8421686746987952


### Bernoulli naïve Bayes

In [210]:
# Bernoulli Naive Bayes classifier
bnb = BernoulliNB()

# Rescale every sample to unit norm ahead of the classifier and fit on the training split
bnb_pipe = make_pipeline(preprocessing.Normalizer(), bnb).fit(train_features, train_labels)

# Accuracy and predicted labels on the test split
bnb_score = bnb_pipe.score(test_features, test_labels)
bnb_pred_labels = bnb_pipe.predict(test_features)

#### Model Evaluation

In [211]:
# Per-class precision / recall / F1 for the Bernoulli Naive Bayes predictions
print(classification_report(
    y_true=test_labels,
    y_pred=bnb_pred_labels,
    target_names=['SwitchLightOff', 'SwitchLightOn', 'IncreaseBrightness',
                  'DecreaseBrightness', 'SetLightBrightness', "SetLightColor"],
))

# Rows are the true classes, columns the predicted classes
print(confusion_matrix(y_true=test_labels, y_pred=bnb_pred_labels))

print("\nACCURACY:", bnb_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.93      0.68      0.79       276
     SwitchLightOn       0.86      0.83      0.84       257
IncreaseBrightness       0.75      0.84      0.79       269
DecreaseBrightness       0.80      0.83      0.82       268
SetLightBrightness       0.89      0.85      0.87       296
     SetLightColor       0.78      0.91      0.84       294

          accuracy                           0.83      1660
         macro avg       0.84      0.83      0.83      1660
      weighted avg       0.84      0.83      0.83      1660

[[189  11  20  26   9  21]
 [  3 214  18   6   5  11]
 [  5   6 226  10   8  14]
 [  0   8  16 223   4  17]
 [  4   4  13   9 252  14]
 [  2   7   7   5   4 269]]

ACCURACY: 0.8271084337349398


### Gaussian Naive Bayes

In [212]:
# Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Standardise the features ahead of the classifier and fit on the training split
gnb_pipe = make_pipeline(preprocessing.StandardScaler(), gnb).fit(train_features, train_labels)

# Accuracy and predicted labels on the test split
gnb_score = gnb_pipe.score(test_features, test_labels)
gnb_pred_labels = gnb_pipe.predict(test_features)

#### Model Evaluation

In [213]:
# Per-class precision / recall / F1 for the Gaussian Naive Bayes predictions
print(classification_report(
    y_true=test_labels,
    y_pred=gnb_pred_labels,
    target_names=['SwitchLightOff', 'SwitchLightOn', 'IncreaseBrightness',
                  'DecreaseBrightness', 'SetLightBrightness', "SetLightColor"],
))

# Rows are the true classes, columns the predicted classes
print(confusion_matrix(y_true=test_labels, y_pred=gnb_pred_labels))

print("\nACCURACY:", gnb_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.69      0.66      0.68       276
     SwitchLightOn       0.59      0.68      0.63       257
IncreaseBrightness       0.67      0.64      0.65       269
DecreaseBrightness       0.68      0.71      0.69       268
SetLightBrightness       0.70      0.59      0.64       296
     SetLightColor       0.68      0.73      0.71       294

          accuracy                           0.67      1660
         macro avg       0.67      0.67      0.67      1660
      weighted avg       0.67      0.67      0.67      1660

[[183  20  14  12  22  25]
 [ 19 174  12  14  17  21]
 [ 11  47 171  14  16  10]
 [ 14  21  20 189   9  15]
 [ 26  22  18  26 174  30]
 [ 13  13  20  21  11 216]]

ACCURACY: 0.6668674698795181


# SVM

In [214]:
svm = SVC(C = 1) #Create the classification model

# Fit, predict and score through the pipeline so that the Normalizer step is
# actually applied. Previously the bare `svm` estimator was used for all three
# calls and `svm_pipe` was built but never used.
# `svm` is the pipeline's final step, so it is still fitted in place.
svm_pipe = make_pipeline(preprocessing.Normalizer(), svm) #Scale feature space
svm_pipe.fit(train_features, train_labels)


svm_pred_labels = svm_pipe.predict(test_features) #predictions

svm_score = svm_pipe.score(test_features,test_labels) #accuracy

### Model Evaluation

In [215]:
# Per-class precision / recall / F1 for the SVM predictions
print(classification_report(
    y_true=test_labels,
    y_pred=svm_pred_labels,
    target_names=['SwitchLightOff', 'SwitchLightOn', 'IncreaseBrightness',
                  'DecreaseBrightness', 'SetLightBrightness', "SetLightColor"],
))

# Rows are the true classes, columns the predicted classes
print(confusion_matrix(y_true=test_labels, y_pred=svm_pred_labels))

print("\nACCURACY:", svm_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.81      0.80      0.81       276
     SwitchLightOn       0.86      0.87      0.87       257
IncreaseBrightness       0.84      0.89      0.86       269
DecreaseBrightness       0.82      0.86      0.84       268
SetLightBrightness       0.93      0.89      0.91       296
     SetLightColor       0.94      0.88      0.91       294

          accuracy                           0.87      1660
         macro avg       0.87      0.87      0.87      1660
      weighted avg       0.87      0.87      0.87      1660

[[222  10  11  20   6   7]
 [ 10 224  13   6   3   1]
 [ 12   5 239   7   3   3]
 [  7  14  13 231   2   1]
 [ 12   3   4   8 264   5]
 [ 12   4   6   8   6 258]]

ACCURACY: 0.8662650602409638


# NEURAL NETWORK

In [216]:
# Three hidden layers of 100 ReLU units each, trained with Adam.
# random_state pins the weight initialisation and batch shuffling so that a
# Restart & Run All reproduces the same scores; without it every run differs.
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100), activation='relu', solver='adam', max_iter=5000, random_state=42) #Create the classification model

mlp_pipe = make_pipeline(preprocessing.Normalizer(), mlp) #Rescale every sample to unit norm first
mlp_pipe.fit(train_features, train_labels)


mlp_pred_labels = mlp_pipe.predict(test_features) #predictions

mlp_score = mlp_pipe.score(test_features,test_labels) #accuracy

### Model Evaluation

In [217]:
# Per-class precision / recall / F1 for the neural-network predictions
print(classification_report(
    y_true=test_labels,
    y_pred=mlp_pred_labels,
    target_names=['SwitchLightOff', 'SwitchLightOn', 'IncreaseBrightness',
                  'DecreaseBrightness', 'SetLightBrightness', "SetLightColor"],
))

# Rows are the true classes, columns the predicted classes
print(confusion_matrix(y_true=test_labels, y_pred=mlp_pred_labels))

print("\nACCURACY:", mlp_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.80      0.80      0.80       276
     SwitchLightOn       0.81      0.89      0.85       257
IncreaseBrightness       0.81      0.85      0.83       269
DecreaseBrightness       0.85      0.83      0.84       268
SetLightBrightness       0.92      0.89      0.90       296
     SetLightColor       0.93      0.85      0.88       294

          accuracy                           0.85      1660
         macro avg       0.85      0.85      0.85      1660
      weighted avg       0.85      0.85      0.85      1660

[[222  16  14  12   7   5]
 [  6 228  11   5   4   3]
 [ 12  10 229   9   4   5]
 [  9  15  16 223   2   3]
 [ 15   3   6   6 262   4]
 [ 13  10   8   7   7 249]]

ACCURACY: 0.8512048192771084


# Try It Yourself

In [218]:
def user_friendly(sentence, cls):
    """Predict the user action for a raw sentence using the fitted classifier `cls`."""
    # Split on whitespace, then pass every word through the helper autocorrection
    corrected_tokens = [hp.autocorrection(token) for token in sentence.split()]

    # Vectorise with the notebook-level dictionary and TF-IDF model
    features = get_TFIDF_feature(corrected_tokens, dct_X, model)
    return hp.indx2action(cls.predict(features))

In [219]:
# Classify a free-form sentence with the logistic-regression pipeline
user_friendly("I want a increase in the lights of my living room", lgr_pipe)

['IncreaseBrightness']