In [1]:
import pandas as pd

dataframe = pd.read_csv('dataset.csv')
dataframe.head()

Unnamed: 0.1,Unnamed: 0,Name,Description,Importance
0,0,Decorate office,Add new decorations to the office space. Good ...,5
1,1,Walk the dog,Take the dog for a long walk. Make sure to exp...,5
2,2,Client follow-up,Check in with clients on recent proposals. Key...,7
3,3,Respond to friend’s text,Message back a friend who texted yesterday. Pe...,3
4,4,Exercise Routine,Follow daily exercise routine. Important for h...,6


In [17]:
task_names = dataframe["Name"]
task_descriptions = dataframe["Description"]
importances = dataframe["Importance"]
print(len(task_names))

10000


In [18]:
concatenated = [str(task_names[i]).lower() + " " + str(task_descriptions[i]).lower() for i in range(len(task_names))]
print(concatenated[:2])

['decorate office add new decorations to the office space. good for work environment.', 'walk the dog take the dog for a long walk. make sure to explore new paths and spend some quality time.']


In [22]:
alphanumeric = []
for task in concatenated:
    new_task = ''.join([character if character.isalnum() else ' ' for character in task])
    alphanumeric.append(new_task)
print(alphanumeric[:2])

10000
['decorate office add new decorations to the office space  good for work environment ', 'walk the dog take the dog for a long walk  make sure to explore new paths and spend some quality time ']


In [23]:
tokens = []
for task in alphanumeric:
    tokens.append([word for word in task.split() if word != ''])
print(tokens[:5])

10000
[['decorate', 'office', 'add', 'new', 'decorations', 'to', 'the', 'office', 'space', 'good', 'for', 'work', 'environment'], ['walk', 'the', 'dog', 'take', 'the', 'dog', 'for', 'a', 'long', 'walk', 'make', 'sure', 'to', 'explore', 'new', 'paths', 'and', 'spend', 'some', 'quality', 'time'], ['client', 'follow', 'up', 'check', 'in', 'with', 'clients', 'on', 'recent', 'proposals', 'key', 'for', 'securing', 'new', 'projects'], ['respond', 'to', 'friend', 's', 'text', 'message', 'back', 'a', 'friend', 'who', 'texted', 'yesterday', 'personal', 'low', 'importance'], ['exercise', 'routine', 'follow', 'daily', 'exercise', 'routine', 'important', 'for', 'health', 'and', 'fitness']]


In [29]:
vocabulary = set()
for task in tokens:
    for token in task:
        vocabulary.add(token)
vocabulary = list(vocabulary)
print(len(vocabulary))

3733


In [30]:
import numpy as np

word2index = dict()
for i, w in enumerate(vocabulary):
    word2index[w] = i
indices = [np.asarray([word2index[token] for token in task]) for task in tokens]
print(indices[:5])

[array([3509,  761, 2582, 3592, 2192, 3653,  384,  761, 2729, 1334, 3194,
        985, 1335]), array([ 416,  384,  701,    8,  384,  701, 3194, 3300, 1965,  416, 2852,
       2430, 3653, 3578, 3592, 1957, 1499, 3176,  448, 1371,  923]), array([3142,  155, 3293, 2624,  704,  147, 3502, 3128,  650, 3118, 1803,
       3194, 3555, 3592, 2334]), array([3186, 3653, 2247, 2163,  933,  327,  283, 3300, 2247, 2607, 3242,
       3646,  325, 1408, 2956]), array([ 201,  935,  155,  610,  201,  935, 3289, 3194, 2245, 1499, 1045])]


In [47]:
from autograd import Tensor
from layers import Embedding, LSTMCell, MSELoss, Linear, Sequential
from optimisers import SGD

embedding = Embedding(vocab_size=len(vocabulary), dim=512)
model = Sequential([LSTMCell(n_inputs=512, n_hidden=512, n_outputs=512), LSTMCell(n_inputs=512, n_hidden=512, n_outputs=512)])
output_layer = Linear(n_inputs=512, n_outputs=1)

In [48]:
criterion = MSELoss()
optimiser = SGD(parameters=model.get_parameters() + embedding.get_parameters() + output_layer.get_parameters(), alpha=0.005)

In [49]:
X_train = indices[:int(len(indices) * 0.8)]
y_train = importances[:int(len(indices) * 0.8)]

X_test = indices[int(len(indices) * 0.8):]
y_test = importances[int(len(indices) * 0.8):]

print(type(y_train))

<class 'pandas.core.series.Series'>


In [50]:
min_loss = 1000
iterations = 10
for iteration in range(iterations):
    total_loss = 0
    
    hidden0 = model.layers[0].init_hidden(batch_size=1)
    hidden1 = model.layers[1].init_hidden(batch_size=1)
    
    for task_i in range(len(X_train)):
        hidden0 = model.layers[0].init_hidden(batch_size=1)
        hidden1 = model.layers[1].init_hidden(batch_size=1)
        
        for t in range(len(X_train[task_i])):
            input = Tensor([X_train[task_i][t]], autograd=True)
            lstm_input = embedding.forward(input=input)
            hidden0 = model.layers[0].forward(input=lstm_input, hidden=hidden0)
            hidden1 = model.layers[1].forward(input=hidden0[0], hidden=hidden1)
        
        output = output_layer.forward(hidden1[0])
        target = Tensor(y_train[task_i], autograd=True)
        loss = criterion.forward(output, target)
        loss.backward()
        optimiser.step()
        
        total_loss += loss.data / len(X_train[task_i])
        epoch_loss = np.exp(total_loss / (task_i + 1))
        
        if epoch_loss < min_loss:
            min_loss = epoch_loss
            print()
        
        print(f"Iter: {iteration} - Alpha: {str(optimiser.alpha)[:5]} - Example {task_i + 1}/{len(X_train)} - Min Loss: {str(min_loss)[:5]} - Loss: {epoch_loss}", end='\r')
    optimiser.alpha *= 0.99


Iter: 0 - Alpha: 0.005 - Example 1/8000 - Min Loss: [6.87 - Loss: [6.87054531]
Iter: 0 - Alpha: 0.005 - Example 14/8000 - Min Loss: [4.59 - Loss: [4.64359376]
Iter: 0 - Alpha: 0.005 - Example 15/8000 - Min Loss: [4.26 - Loss: [4.26436368]
Iter: 0 - Alpha: 0.005 - Example 16/8000 - Min Loss: [3.96 - Loss: [3.96074727]
Iter: 0 - Alpha: 0.005 - Example 17/8000 - Min Loss: [3.65 - Loss: [3.65310817]
Iter: 0 - Alpha: 0.005 - Example 18/8000 - Min Loss: [3.41 - Loss: [3.41318517]
Iter: 0 - Alpha: 0.005 - Example 19/8000 - Min Loss: [3.20 - Loss: [3.20185994]
Iter: 0 - Alpha: 0.005 - Example 20/8000 - Min Loss: [3.10 - Loss: [3.10505323]
Iter: 0 - Alpha: 0.005 - Example 21/8000 - Min Loss: [2.94 - Loss: [2.94437459]
Iter: 0 - Alpha: 0.005 - Example 22/8000 - Min Loss: [2.85 - Loss: [2.85570479]
Iter: 0 - Alpha: 0.005 - Example 23/8000 - Min Loss: [2.73 - Loss: [2.7324476]
Iter: 0 - Alpha: 0.005 - Example 24/8000 - Min Loss: [2.66 - Loss: [2.66862395]
Iter: 0 - Alpha: 0.005 - Example 25/8000 

KeyboardInterrupt: 

In [54]:
def predict(task_indices):
    hidden0 = model.layers[0].init_hidden(batch_size=1)
    hidden1 = model.layers[1].init_hidden(batch_size=1)
    for t in range(len(task_indices)):
        input = Tensor([task_indices[t]], autograd=True)
        lstm_input = embedding.forward(input=input)
        hidden0 = model.layers[0].forward(input=lstm_input, hidden=hidden0)
        hidden1 = model.layers[1].forward(input=hidden0[0], hidden=hidden1)
    output = output_layer.forward(hidden1[0])
    
    return output

In [55]:
for task_i in range(len(X_test)):
    output = predict(X_test[task_i])
    target = Tensor(np.array(y_test)[task_i], autograd=True)
    loss = criterion.forward(output, target)
    print(f"Task {task_i + 1}/{len(X_test)} - Loss: {loss.data}", end='\r')
    total_loss += loss.data / len(X_test[task_i])
loss_to_display = np.exp(total_loss / (task_i + 1))
print()
print(loss_to_display)

Task 2000/2000 - Loss: [0.00100236]]05]
[1.18641419]


In [53]:
import pickle as pkl

with open("importance_embedding.pkl", "wb") as file:
    pkl.dump(embedding, file)

with open("importance_model.pkl", "wb") as file:
    pkl.dump(model, file)

with open("importance_output.pkl", "wb") as file:
    pkl.dump(output_layer, file)

with open("word2index.pkl", "wb") as file:
    pkl.dump(word2index, file)