In [39]:
import pandas as pd

dataframe = pd.read_csv('dataset.csv')
dataframe.head()

Unnamed: 0,ID,Name,Description,Importance
0,0,Decorate office,Add new decorations to the office space. Good ...,5
1,1,Walk the dog,Take the dog for a long walk. Make sure to exp...,5
2,2,Client follow-up,Check in with clients on recent proposals. Key...,7
3,3,Respond to friend’s text,Message back a friend who texted yesterday. Pe...,3
4,4,Exercise Routine,Follow daily exercise routine. Important for h...,6


In [40]:
task_names = dataframe["Name"]
task_descriptions = dataframe["Description"]
importances = dataframe["Importance"]
print(len(task_names))

10000


In [41]:
concatenated = [str(task_names[i]).lower() + " " + str(task_descriptions[i]).lower() for i in range(len(task_names))]
print(concatenated[:2])

['decorate office add new decorations to the office space. good for work environment.', 'walk the dog take the dog for a long walk. make sure to explore new paths and spend some quality time.']


In [42]:
alphanumeric = []
for task in concatenated:
    new_task = ''.join([character if character.isalnum() else ' ' for character in task])
    alphanumeric.append(new_task)
print(alphanumeric[:2])

['decorate office add new decorations to the office space  good for work environment ', 'walk the dog take the dog for a long walk  make sure to explore new paths and spend some quality time ']


In [43]:
tokens = []
for task in alphanumeric:
    tokens.append([word for word in task.split() if word != ''])
print(tokens[:5])

[['decorate', 'office', 'add', 'new', 'decorations', 'to', 'the', 'office', 'space', 'good', 'for', 'work', 'environment'], ['walk', 'the', 'dog', 'take', 'the', 'dog', 'for', 'a', 'long', 'walk', 'make', 'sure', 'to', 'explore', 'new', 'paths', 'and', 'spend', 'some', 'quality', 'time'], ['client', 'follow', 'up', 'check', 'in', 'with', 'clients', 'on', 'recent', 'proposals', 'key', 'for', 'securing', 'new', 'projects'], ['respond', 'to', 'friend', 's', 'text', 'message', 'back', 'a', 'friend', 'who', 'texted', 'yesterday', 'personal', 'low', 'importance'], ['exercise', 'routine', 'follow', 'daily', 'exercise', 'routine', 'important', 'for', 'health', 'and', 'fitness']]


In [44]:
vocabulary = set()
for task in tokens:
    for token in task:
        vocabulary.add(token)
vocabulary = list(vocabulary)
print(len(vocabulary))

3733


In [45]:
import cupy as np

word2index = dict()
for i, w in enumerate(vocabulary):
    word2index[w] = i
indices = [np.asarray([word2index[token] for token in task]) for task in tokens]
print(indices[:5])

[array([1102, 3659, 3661, 2649, 1424, 1526, 2524, 3659, 1210, 2235, 3643,
       1112,  248]), array([2219, 2524, 2674, 1175, 2524, 2674, 3643, 1728, 1024, 2219, 3280,
       1351, 1526, 2558, 2649,    5, 1202,  748, 2790, 3277, 1363]), array([ 852, 3112, 1800,  509,  454,  364, 1220, 2077, 2008,  703, 2339,
       3643, 1212, 2649, 1325]), array([1088, 1526, 3015,  616, 1344,  108, 1700, 1728, 3015,  358,  911,
       1675, 1499, 2691, 2772]), array([1543,  348, 3112, 2513, 1543,  348, 2198, 3643,  466, 1202,  732])]


In [46]:
from autograd import Tensor
from layers import Embedding, LSTMCell, MSELoss, Linear, Relu, Sequential
from optimisers import Adam

embedding = Embedding(vocab_size=len(vocabulary), dim=512)
model = Sequential([LSTMCell(n_inputs=512, n_outputs=512), LSTMCell(n_inputs=512, n_outputs=512)])
output_layer = Sequential([Linear(n_inputs=512, n_outputs=512), Relu(), Linear(n_inputs=512, n_outputs=1)])

In [47]:
criterion = MSELoss()
optimiser = Adam(parameters=model.get_parameters() + embedding.get_parameters() + output_layer.get_parameters())

In [48]:
X_train = indices[:int(len(indices) * 0.8)]
y_train = importances[:int(len(indices) * 0.8)]

X_test = indices[int(len(indices) * 0.8):]
y_test = importances[int(len(indices) * 0.8):]

print(type(y_train))

<class 'pandas.core.series.Series'>


In [49]:
min_loss = 1000
iterations = 10
timestep = 0
moment1 = [np.zeros(p.data.shape) for p in model.get_parameters() + embedding.get_parameters() + output_layer.get_parameters()]
moment2 = [np.zeros(p.data.shape) for p in model.get_parameters() + embedding.get_parameters() + output_layer.get_parameters()]
for iteration in range(iterations):
    total_loss = 0
    
    hidden0 = model.layers[0].init_hidden()
    hidden1 = model.layers[1].init_hidden()
    
    for task_i in range(0, len(X_train)):
        timestep += 1

        hidden0 = model.layers[0].init_hidden()
        hidden1 = model.layers[1].init_hidden()
        
        for t in range(len(X_train[task_i])):
            input = np.zeros(shape=(1,len(vocabulary)))
            input[0, X_train[task_i][t]] = 1
            input = Tensor(input, autograd=True)
            lstm_input = embedding.forward(input=input)
            hidden0 = model.layers[0].forward(input=lstm_input, hidden=hidden0)
            hidden1 = model.layers[1].forward(input=hidden0[0], hidden=hidden1)
        
        output = output_layer.forward(hidden1[0])
        target = Tensor(y_train[task_i], autograd=True)
        loss = criterion.forward(output, target)
        loss.backward()
        moment1, moment2 = optimiser.step(timestep, moment1, moment2)
        
        total_loss += loss.data / len(X_train[task_i])
        epoch_loss = np.exp(total_loss / (task_i + 1))
        
        if epoch_loss < min_loss:
            min_loss = epoch_loss
            print()
        
        print(f"Iter: {iteration} - Alpha: {str(optimiser.alpha)[:5]} - Example {task_i + 1}/{len(X_train)} - Min Loss: {str(min_loss)} - Loss: {epoch_loss}", end='\r')
    optimiser.alpha *= 0.99

Iter: 0 - Alpha: 0.001 - Example 2/8000 - Min Loss: 1000 - Loss: [70139.41113378]
Iter: 0 - Alpha: 0.001 - Example 3/8000 - Min Loss: [329.56902429] - Loss: [329.56902429]
Iter: 0 - Alpha: 0.001 - Example 4/8000 - Min Loss: [33.67223694] - Loss: [33.67223694]
Iter: 0 - Alpha: 0.001 - Example 5/8000 - Min Loss: [1.12685024] - Loss: [1.12685024]
Iter: 0 - Alpha: 0.001 - Example 10/8000 - Min Loss: [1.08209413] - Loss: [1.30242848]
Iter: 0 - Alpha: 0.001 - Example 31/8000 - Min Loss: [1.00000195] - Loss: [1.01602269]

KeyboardInterrupt: 

In [25]:
def predict(task_indices):
    hidden0 = model.layers[0].init_hidden()
    hidden1 = model.layers[1].init_hidden()
    for t in range(len(task_indices)):
        input = Tensor([task_indices[t]], autograd=True)
        lstm_input = embedding.forward(input=input)
        hidden0 = model.layers[0].forward(input=lstm_input, hidden=hidden0)
        hidden1 = model.layers[1].forward(input=hidden0[0], hidden=hidden1)
    output = output_layer.forward(hidden1[0])
    
    return output

In [26]:
for task_i in range(len(X_test)):
    output = predict(X_test[task_i])
    target = Tensor(np.array(y_test)[task_i], autograd=True)
    loss = criterion.forward(output, target)
    print(f"Task {task_i + 1}/{len(X_test)} - Loss: {loss.data}", end='\r')
    total_loss += loss.data / len(X_test[task_i])
loss_to_display = np.exp(total_loss / (task_i + 1))
print()
print(loss_to_display)

Task 2000/2000 - Loss: [0.33116146]]
[1.28548814]


In [27]:
import pickle as pkl

with open("importance_embedding.pkl", "wb") as file:
    pkl.dump(embedding, file)

with open("importance_model.pkl", "wb") as file:
    pkl.dump(model, file)

with open("importance_output.pkl", "wb") as file:
    pkl.dump(output_layer, file)

with open("word2index.pkl", "wb") as file:
    pkl.dump(word2index, file)

In [53]:
with open("importance_embedding.pkl", "rb") as file:
    embedding = pkl.load(file)

with open("importance_model.pkl", "rb") as file:
    model = pkl.load(file)

with open("importance_output.pkl", "rb") as file:
    output_layer = pkl.load(file)

with open("word2index.pkl", "rb") as file:
    word2index = pkl.load(file)