# Test main Erik

In [1]:
# Downloads
# nltk.download('stopwords')
# nltk.download('wordnet')

# Import 
from gensim.test.utils import common_texts
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import numpy as np
import json

## TF IDF

### General testing

In [154]:
# Basic model
dataset = common_texts
dct = Dictionary(dataset)   # Set of avaliable text
corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format (i.e for each word in every sentece return "(index,nr times in sentence)""

model = TfidfModel(corpus)  # fit model
vector = model[corpus]   # apply model to the corpus document

### Our data

In [4]:
# Import raw meta data
File = open("openvoc-keyword-spotting-research-datasets/smart-lights/metadata.json")
metaData_in = json.load(File)
N =  len(metaData_in)
keys = list(metaData_in.keys())

In [5]:
# Manual tokenization + removing stopwords + lemmatization
important_words = {'on','off'}
lemmatizer = WordNetLemmatizer()
y_raw = []
X = []
for i in range(N):
    words = metaData_in.get(keys[i]).get('transcript').split(' ')
    words = [lemmatizer.lemmatize(word) for word in words if word not in ( set(stopwords.words('english'))-important_words)]
    X.append(words)
    y_raw.append(metaData_in.get(keys[i]).get('keywords'))

print(X[0])
print(y_raw[0])

['want', 'turn', 'off', 'light', 'living', 'room']
['turn off', 'living room']


In [30]:
#keywords to user action
def indx2action(y_num):
    N = len(y_num)
    y =[]
    for i in range (N):
        if y_num[i] == 0:
            y.append('SwitchLightOff')
        elif y_num[i] == 1:
            y.append('SwitchLightOn')
        elif y_num[i] == 2:
            y.append('IncreaseBrightness')
        elif  y_num[i] == 3:
            y.append('DecreaseBrightness')
        else:
            y.append('No class assigned')
    return y

y = []
y_num = np.empty(N)
i = 0
for line in y_raw:
    for word in line:
        if word == 'turn off':
            y.append('SwitchLightOff')
            y_num[i] = 0
            break
        elif word == 'turn on':
            y.append('SwitchLightOn')
            y_num[i] = 1
            break
        elif word == 'increase':
            y.append('IncreaseBrightness')
            y_num[i] = 2
            break
        elif word == 'decrease':
            y.append('DecreaseBrightness')
            y_num[i] = 3
            break
        else:
            y.append('No class')
            y_num[i] = 4
            break
    i += 1

In [147]:
# Create TD*IDF vector represenation
dct_X = Dictionary(X)
corpusBOW = [dct_X.doc2bow(line) for line in X]
model = TfidfModel(corpusBOW)
X_vec = model[corpusBOW]


In [148]:
def Tfidf2np(X,dct):
    """ 
    Converts gensim format to numpy array
    Input:
    X - TDidfModel vector (N x lenght("sentence"))
    dct - Dictionary object (lenght("unique words"))
    Output:
    X_np - N x length("unique words")
    """
    N_dict = len(dct)
    N_sent = len(X)
    X_np = np.zeros((N_sent,N_dict))
    i = 0
    for list in X:
        for word in list:
            X_np[i, word[0]] = word[1]
        i += 1
    return X_np

X_np = Tfidf2np(X_vec,dct_X)

In [149]:
# Split into training and test data
x_train, x_test, y_train, y_test = train_test_split(X_np,y_num,test_size = 0.5)

In [150]:
# Create LR model (add tuning parameters?)
cls = LogisticRegression()

In [151]:
cls.fit(x_train,y_train) 
y_est = cls.predict(x_test)
score = cls.score(x_test,y_test)
print(score)

0.9964539007092199


In [152]:
print(indx2action(y_est[1:10]))
print(indx2action(y_test[1:10]))

['SwitchLightOff', 'SwitchLightOff', 'DecreaseBrightness', 'SwitchLightOn', 'IncreaseBrightness', 'IncreaseBrightness', 'SwitchLightOn', 'IncreaseBrightness', 'DecreaseBrightness']
['SwitchLightOff', 'SwitchLightOff', 'DecreaseBrightness', 'SwitchLightOn', 'IncreaseBrightness', 'IncreaseBrightness', 'SwitchLightOn', 'IncreaseBrightness', 'DecreaseBrightness']


## Word2vec 

In [31]:
# Import
from gensim.models import Word2Vec

In [32]:
# Make word2vec based on previous corpus
vector_size = 100
model_w2v = Word2Vec(X,
    vector_size=vector_size,
    window=5,
    min_count=1,
    workers=4)

In [33]:
model_w2v.train(X,total_examples=N,epochs= 5)


(1603, 12580)

In [34]:
X_vec = np.empty((N,vector_size))
for i in range(N):
    # Constructing sentence feature
    value_iter = np.zeros((vector_size,))
    for word in X[i]:
        try:
            value_iter += np.array(model_w2v.wv[word]) / len(X[i])
        except:
            print('Issue for: X=',i,'with word "', word,'".')
            print('Word ignored in feature construction.')
    X_vec[i,:] = value_iter
print(X_vec.shape)

(564, 100)


In [54]:
x_train, x_test, y_train, y_test = train_test_split(X_vec,y_num,test_size = 0.5)

In [55]:
cls = LogisticRegression()

In [56]:
cls.fit(x_train,y_train) 
y_est = cls.predict(x_test)
score = cls.score(x_test,y_test)
print(score)

0.8439716312056738


In [57]:
print(indx2action(y_est[1:10]))
print(indx2action(y_test[1:10]))

['IncreaseBrightness', 'SwitchLightOn', 'IncreaseBrightness', 'IncreaseBrightness', 'SwitchLightOn', 'SwitchLightOn', 'SwitchLightOff', 'SwitchLightOn', 'SwitchLightOff']
['IncreaseBrightness', 'SwitchLightOn', 'IncreaseBrightness', 'IncreaseBrightness', 'SwitchLightOn', 'SwitchLightOn', 'SwitchLightOff', 'IncreaseBrightness', 'SwitchLightOff']
