In [110]:
#requirements
import sys
!{sys.executable} -m pip install --user deeppavlov


!pip install Pillow
!pip install --upgrade numpy==1.18.0
!pip install pycocotools==2.0.0
!pip install -U scikit-learn==0.21.2
import pandas as pd
import matplotlib
from matplotlib.colors import is_color_like
import numpy as np
from deeppavlov.core.data.utils import simple_download



### 1. LOAD DATA

Load data used to build our model

In [111]:
#load data into a pandas dataframe
data = pd.read_json('./snips/metadata.json', orient='index')

After loading the data I selected only the columns that I found usefull

In [112]:
#select data that is needed
data_sentences = data[['transcript','keywords']]

Now lets see the keywords that we have and the number of apperances for each keywords

In [113]:
pd.Series(data_sentences['keywords'].sum()).value_counts()

kitchen        193
brightness     192
bedroom        187
living room    184
increase       144
decrease       143
turn off       139
turn on        139
dtype: int64

It would be expected (in an ideal training set) that we would have keywords related to every "user action" in a uniforme distribution, but we can easily see that this is not the case. This data set is obviosly favoring the actions of 'SwitchLightOff','SwitchLightOn','DecreaseBrightness','IncreaseBrightness'. Missing sentences related to 'SetLightBrightness' and 'SetLightColor'. This assumption is based on the fact that for a sentence with the intension of changing the light colors, the color that we pretend to change the light (i.e red, yellow, blue...) is definitly a keyword, and we dont observe anything related to that subject in the keywords. The same goes for the set brightness action it should appear something related to the brightness level. This lack of data will affect the predictions of the model as we will observe later on the code

### ASSOCIATE KEYWORDS TO USER ACTIONS

Here I wanted to associate the keywords to a user action so i could build the model, this is not the most clean code ever nor the more efficient one, but I kinda wanted to build the model as fast as possible to see some results so i didn't put much thought into this implementation 

In [114]:

#keywords to user action
data_sentences['user_actions'] = 0

for index, rows in data_sentences.iterrows():
    if('turn off' in  rows.keywords):
        data_sentences.loc[index,'user_actions'] = 'SwitchLightOff'
    elif('turn on' in  rows.keywords):
        data_sentences.loc[index,'user_actions'] = 'SwitchLightOn'
    elif('increase' in rows.keywords):
        is_num_in_actions = [element.isdigit()  for element in rows.keywords]
        if(True in is_num_in_actions):
            data_sentences.loc[index,'user_actions'] = 'SetLightBrightness'
        else:
            data_sentences.loc[index,'user_actions'] = 'IncreaseBrightness'
    elif('decrease' in rows.keywords):
        is_num_in_actions = [element.isdigit()  for element in rows.keywords]
        if(True in is_num_in_actions):
            data_sentences.loc[index,'user_actions'] = 'SetLightBrightness'
        else:
            data_sentences.loc[index,'user_actions'] = 'DecreaseBrightness'
    else:
        is_color_in_actions = [is_color_like(element)  for element in rows.keywords]
        if(True in is_color_in_actions):
            data_sentences.loc[index,'user_actions'] = 'SetLightColor'
        else:
            data_sentences.loc[index,'user_actions'] = 'Nullaction'

        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sentences['user_actions'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [115]:
#save data into csv file since is the input for the model
data_sentences.to_csv('./snips/data_sentences.csv', index=False)

### 2. DATA SPLIT

Read dataset, previous saved into the csv file

In [116]:
from deeppavlov.dataset_readers.basic_classification_reader import BasicClassificationDatasetReader

# read data from particular columns of `.csv` file
dr = BasicClassificationDatasetReader().read(
    data_path='./snips/',
    train='data_sentences.csv',
    x = 'transcript',
    y = 'user_actions'
)




### SPLIT DATA INTO TRAIN AND VALID SETS

In [117]:
from deeppavlov.dataset_iterators.basic_classification_iterator import BasicClassificationDatasetIterator

# initialize data iterator splitting `train` field to `train` and `valid` in proportion 0.8/0.2
train_iterator = BasicClassificationDatasetIterator(
    data=dr,
    field_to_split='train',  # field that will be splitted
    split_fields=['train', 'valid'],   # fields to which the fiald above will be splitted
    split_proportions=[0.8, 0.2],  #proportions for splitting
    split_seed=23,  # seed for splitting dataset
    seed=42)  # seed for iteration over dataset



2021-11-27 03:26:30.8 INFO in 'deeppavlov.dataset_iterators.basic_classification_iterator'['basic_classification_iterator'] at line 73: Splitting field <<train>> to new fields <<['train', 'valid']>>


In [118]:
x_train, y_train = train_iterator.get_instances(data_type='train')
for x, y in list(zip(x_train, y_train))[:5]:
    print('x:', x)
    print('y:', y)
    print('=================')

x: turn on the lights for the bedroom please
y: SwitchLightOn
x: increase the lights for the bedroom now
y: IncreaseBrightness
x: increase the lights in the kitchen
y: IncreaseBrightness
x: increase the brightness in the bedroom
y: IncreaseBrightness
x: please turn off the lights in the living room
y: SwitchLightOff


### 3. MODEL CREATION ###

In [119]:
# get all train and valid data from iterator
x_train, y_train = train_iterator.get_instances(data_type="train")
x_valid, y_valid = train_iterator.get_instances(data_type="valid")

### 3.2 TEXT TO VECTOR

In order to get our computer to understand any text, we need to break that word down in a way that our machine can understand. Therefore I used one of the tokenized functions that deepavlov has in order match a vector to each text sample:

In [120]:
from deeppavlov.models.sklearn import SklearnComponent

tfidf = SklearnComponent(
    model_class="sklearn.feature_extraction.text:TfidfVectorizer",
    infer_method="transform",
    save_path='./tfidf_v0.pkl',
    load_path='./tfidf_v0.pkl',
    mode='train')

2021-11-27 03:26:30.92 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 203: Loading model sklearn.feature_extraction.text:TfidfVectorizer from /home/goncalo/Documents/EPFL/ML/ML_course/NLP-project/Me/tfidf_v0.pkl
2021-11-27 03:26:30.93 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 210: Model sklearn.feature_extraction.textTfidfVectorizer loaded  with parameters


In [121]:
from deeppavlov.models.preprocessors.str_lower import str_lower
tfidf.fit(str_lower(train_iterator.get_instances(data_type='train')[0]))
tfidf.save()


2021-11-27 03:26:30.123 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 109: Fitting model sklearn.feature_extraction.textTfidfVectorizer
2021-11-27 03:26:30.132 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 241: Saving model to /home/goncalo/Documents/EPFL/ML/ML_course/NLP-project/Me/tfidf_v0.pkl


### 3.2 LOGISTIC REGRESSION

Now we are able to build a machine learning model to try to predict our user intentions in a given sentence. I picked a logistic regression model but we definitly need to try other models in the future to see which one give us better results

In [122]:
from deeppavlov.metrics.accuracy import sets_accuracy

# initialize sklearn classifier, all parameters for classifier could be passed
cls = SklearnComponent(
    model_class="sklearn.linear_model:LogisticRegression",
    infer_method="predict",
    save_path='./logreg_v0.pkl',
    load_path='./logreg_v0.pkl',
    C=1,
    mode='train')



2021-11-27 03:26:30.157 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 203: Loading model sklearn.linear_model:LogisticRegression from /home/goncalo/Documents/EPFL/ML/ML_course/NLP-project/Me/logreg_v0.pkl
2021-11-27 03:26:30.159 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 210: Model sklearn.linear_model._logisticLogisticRegression loaded  with parameters


In [123]:
# fit sklearn classifier and save it
cls.fit(tfidf(x_train), y_train)
cls.save()

2021-11-27 03:26:30.230 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 109: Fitting model sklearn.linear_model._logisticLogisticRegression
2021-11-27 03:26:30.258 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 241: Saving model to /home/goncalo/Documents/EPFL/ML/ML_course/NLP-project/Me/logreg_v0.pkl


In [124]:
y_valid_pred = cls(tfidf(x_valid))

In [125]:
# Let's look into obtained result
print("Text sample: {}".format(x_valid[0]))
print("True label: {}".format(y_valid[0]))
print("Predicted label: {}".format(y_valid_pred[0]))



Text sample: increase lights for the kitchen now
True label: IncreaseBrightness
Predicted label: IncreaseBrightness


In [126]:
sets_accuracy(np.squeeze(y_valid), y_valid_pred)

1.0

Note: This accuracy is unrealisticly high as expected given the fact that the sentences in the valid sample were way to similar to the training test. This result totally supports my initial assumption for our lack of variety in the dataset, and we can confirm this by self testing sentences and seeing what user action it gave us

In [127]:
#Test for Increasing/Decreasing Lights and for TurnOn/Off Lights
print('\n Test for Increasing/Decreasing Lights and for TurnOn/Off Lights \n')
print("Test1: {}".format(cls(tfidf(['Marcus needs more brightness']))))
print("Test2: {}".format(cls(tfidf(['Erick screamed: Decrease the lights in rolex']))))
print("Test3: {}".format(cls(tfidf(['Gonçalo turn off the brigtness']))))
print("Test4: {}".format(cls(tfidf(['DarthVader fliped on the brigtness']))))

#Test for Set Brigthness/Change color
print('\n Test for Set Brigthness/Change color \n')
print("Test5: {}".format(cls(tfidf(['Gonçalo tried to change the light to red']))))
print("Test6: {}".format(cls(tfidf(['Erick switched the light to blue']))))
print("Test7: {}".format(cls(tfidf(['Marcus set brightness to 8']))))
print("Test8: {}".format(cls(tfidf(['Yoda set the lights to 2']))))


 Test for Increasing/Decreasing Lights and for TurnOn/Off Lights 

Test1: ['IncreaseBrightness']
Test2: ['DecreaseBrightness']
Test3: ['SwitchLightOff']
Test4: ['SwitchLightOn']

 Test for Set Brigthness/Change color 

Test5: ['DecreaseBrightness']
Test6: ['DecreaseBrightness']
Test7: ['IncreaseBrightness']
Test8: ['DecreaseBrightness']


### OVERALL

 - **We need to improve our training dataset**: Not just find/ask for more variety on the data but also find a way to augment our currrent data in order that our model fits better synonyms of the keywords (ie. increase, grow, amplify...), although the model is working well for sentences that are related to ('IncreaseBrightness','DecreaseBrightness','SwitchLightOff' amd 'SwitchLightOn') we still need to improve the model to do a better fitting in synonyms.

 - **We need to test more models**

 - **Improve the way we coorelate the keywords to the user actions**