In [1]:
import pandas as pd

dataframe = pd.read_csv('dataset.csv')
dataframe.head()

Unnamed: 0.1,Unnamed: 0,Name,Description,Importance
0,0,Yoga Session,Attend the 90-minute yoga session to improve f...,7
1,1,Update Budget,Review and update the monthly budget. Crucial ...,8
2,2,Watch Webinar,Attend a live webinar on digital marketing str...,4
3,3,Water Plants,Watering indoor plants to keep them alive. Imp...,7
4,4,Schedule Doctor’s Appointment,Scheduling an appointment for a routine health...,8


In [3]:
task_names = dataframe["Name"]
task_descriptions = dataframe["Description"]
importances = dataframe["Importance"]
print(len(task_names))

1000


In [4]:
concatenated = [task_names[i].lower() + " " + task_descriptions[i].lower() for i in range(len(task_names))]
print(concatenated[:2])

['yoga session attend the 90-minute yoga session to improve flexibility and reduce stress. important for well-being.', 'update budget review and update the monthly budget. crucial for financial health and planning.']


In [7]:
alphanumeric = []
for task in concatenated:
    new_task = ''.join([character if character.isalnum() else ' ' for character in task])
    alphanumeric.append(new_task)
print(alphanumeric[:2])

['yoga session attend the 90 minute yoga session to improve flexibility and reduce stress  important for well being ', 'update budget review and update the monthly budget  crucial for financial health and planning ']


In [8]:
tokens = []
for task in alphanumeric:
    tokens.append([word for word in task.split() if word != ''])
print(tokens[:5])

[['yoga', 'session', 'attend', 'the', '90', 'minute', 'yoga', 'session', 'to', 'improve', 'flexibility', 'and', 'reduce', 'stress', 'important', 'for', 'well', 'being'], ['update', 'budget', 'review', 'and', 'update', 'the', 'monthly', 'budget', 'crucial', 'for', 'financial', 'health', 'and', 'planning'], ['watch', 'webinar', 'attend', 'a', 'live', 'webinar', 'on', 'digital', 'marketing', 'strategies', 'useful', 'for', 'professional', 'development', 'but', 'it', 'will', 'be', 'recorded', 'for', 'later', 'viewing'], ['water', 'plants', 'watering', 'indoor', 'plants', 'to', 'keep', 'them', 'alive', 'important', 'for', 'plant', 'health', 'but', 'has', 'a', 'day', 'or', 'two', 'of', 'flexibility'], ['schedule', 'doctor', 's', 'appointment', 'scheduling', 'an', 'appointment', 'for', 'a', 'routine', 'health', 'check', 'up', 'important', 'for', 'monitoring', 'and', 'maintaining', 'health']]


In [9]:
vocabulary = set()
for task in tokens:
    for token in task:
        vocabulary.add(token)
vocabulary = list(vocabulary)
print(len(vocabulary))

1493


In [10]:
import numpy as np

word2index = dict()
for i, w in enumerate(vocabulary):
    word2index[w] = i
indices = [np.asarray([word2index[token] for token in task]) for task in tokens]
print(indices[:5])

[array([ 969,  440,  748,  295, 1435,  259,  969,  440,   74, 1196,  139,
       1178,  498,  663,  719, 1257,   36, 1064]), array([ 940,   30,   75, 1178,  940,  295,   67,   30,  263, 1257,  150,
       1462, 1178,  658]), array([ 723,  859,  748,  703,  185,  859,  293,  462, 1429, 1143,  249,
       1257,  324,  783,  849, 1147,  716,  976, 1275, 1257, 1428, 1018]), array([ 915, 1185,  839,  257, 1185,   74, 1349,   60,  125,  719, 1257,
        872, 1462,  849,  861,  703,  138, 1215,  208,  238,  139]), array([ 309,  173, 1413, 1213,  164,  824, 1213, 1257,  703, 1287, 1462,
       1431,  437,  719, 1257,  986, 1178, 1170, 1462])]


In [11]:
from autograd import Tensor
from layers import Embedding, LSTMCell, MSELoss, Linear
from optimisers import SGD

embedding = Embedding(vocab_size=len(vocabulary), dim=512)
model = LSTMCell(n_inputs=512, n_hidden=512, n_outputs=1)
output_layer = Linear(n_inputs=512, n_outputs=1)

In [12]:
criterion = MSELoss()
optimiser = SGD(parameters=model.get_parameters() + embedding.get_parameters() + output_layer.get_parameters(), alpha=0.05)

In [13]:
X_train = indices[:int(len(indices) * 0.8)]
y_train = importances[:int(len(indices) * 0.8)]

X_test = indices[int(len(indices) * 0.8):]
y_test = importances[int(len(indices) * 0.8):]

print(type(y_train))

<class 'pandas.core.series.Series'>


In [15]:
min_loss = 1000
iterations = 10
for iteration in range(iterations):
    total_loss = 0
    
    hidden = model.init_hidden(batch_size=1)
    
    for task_i in range(len(X_train)):
        hidden = model.init_hidden(batch_size=1)
        
        for t in range(len(X_train[task_i])):
            input = Tensor([X_train[task_i][t]], autograd=True)
            rnn_input = embedding.forward(input=input)
            hidden = model.forward(input=rnn_input, hidden=hidden)
        
        output = output_layer.forward(hidden[0])
        target = Tensor(y_train[task_i], autograd=True)
        loss = criterion.forward(output, target)
        loss.backward()
        optimiser.step()
        
        total_loss += loss.data / len(X_train[task_i])
        epoch_loss = np.exp(total_loss / (task_i + 1))
        
        if epoch_loss < min_loss:
            min_loss = epoch_loss
            print()
        
        print(f"Iter: {iteration} - Alpha: {str(optimiser.alpha)[:5]} - Example {task_i + 1}/{len(X_train)} - Min Loss: {str(min_loss)[:5]} - Loss: {epoch_loss}", end='\r')
    optimiser.alpha *= 0.99


Iter: 0 - Alpha: 0.046 - Example 1/800 - Min Loss: [1.00 - Loss: [1.00056071]
Iter: 0 - Alpha: 0.046 - Example 640/800 - Min Loss: [1.00 - Loss: [1.02002413]

KeyboardInterrupt: 

In [16]:
def predict(task_indices):
    hidden = model.init_hidden(batch_size=1)
    for t in range(len(task_indices)):
        input = Tensor([task_indices[t]], autograd=True)
        rnn_input = embedding.forward(input=input)
        hidden = model.forward(input=rnn_input, hidden=hidden)
    output = output_layer.forward(hidden[0])
    
    return output

In [17]:
for task_i in range(len(X_test)):
    output = predict(X_test[task_i])
    target = Tensor(np.array(y_test)[task_i], autograd=True)
    loss = criterion.forward(output, target)
    print(f"Task {task_i + 1}/{len(X_test)} - Loss: {loss.data}", end='\r')
    total_loss += loss.data / len(X_test[task_i])
loss_to_display = np.exp(total_loss / (task_i + 1))
print()
print(loss_to_display)

Task 200/200 - Loss: [6.42183281]]
[1.20739589]


In [18]:
import pickle as pkl

with open("importance_embedding.pkl", "wb") as file:
    pkl.dump(embedding, file)

with open("importance_model.pkl", "wb") as file:
    pkl.dump(model, file)

with open("importance_output.pkl", "wb") as file:
    pkl.dump(output_layer, file)

with open("word2index.pkl", "wb") as file:
    pkl.dump(word2index, file)

In [69]:
text = ""

print(predict([word2index[token] for token in text.split(' ') if token in word2index.keys()]))

[[6.73039372]]
