In [1]:
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable
from classifier import eval, naive_bayes as nb, naive_bayes_thres as nb_thres, preproc, logistic_regression as lg, LSTM
from data import data
from os.path import exists

In [2]:
get_data_path = 'data/raw_data/'
main_data_path = get_data_path + 'myanimelist.csv'
train_data_path = get_data_path + 'train_data.csv'
test_data_path = get_data_path + 'test_data.csv'
dev_data_path = get_data_path + 'dev_data.csv'

model_data_path = 'data/model/'
helper_data_path = 'data/helper_data/'

exists(main_data_path)

True

We write the data that we request from mynaimelist api and then we read it

In [3]:
data.write_data(100,10000, main_data_path)

(None, None)

In [4]:
dataset = pd.read_csv(main_data_path)

We split the data into train, dev and test data and then read it

In [5]:
file_paths = {'train_path': train_data_path, 'test_path': test_data_path, 'dev_path': dev_data_path}
data.split_data(dataset, 1000, 1000, file_paths)

In [6]:
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
dev_data = pd.read_csv(dev_data_path)

# Naive Bayes

We'll be using the naive bayes algorithm to classify the synopsis by choosing the top n genres. In this case we'll get the n by choosing the number of genres that already exist in our data else we specify how many genres we want.

We process the data that we have to in order to use the algorithm

In [7]:
func_list = [preproc.clean_para, preproc.bag_of_words, preproc.remove_stop_words]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

total_count = nb_thres.total_word_count(x_tr, y_tr)[0]
x_tr = nb.prune_vocab(total_count, x_tr, 10)
x_dev = nb.prune_vocab(total_count, x_dev, 10)
x_te = nb.prune_vocab(total_count, x_te, 10)

for i in range(5):
    print(x_tr[i], y_tr[i])

Counter({'had': 2, 'takafumi': 2, 'magical': 2, 'world': 2, 'youtube': 2, 'channel': 2, 'fateful': 1, 'encounter': 1, 'truck': 1, 'rendered': 1, 'yousuke': 1, 'comatose': 1, 'past': 1, '17': 1, 'years': 1, 'when': 1, 'finally': 1, 'regains': 1, 'consciousness': 1, 'begins': 1, 'foreign': 1, 'reveals': 1, 'been': 1, 'transported': 1, 'called': 1, 'gran': 1, 'claims': 1, 'as': 1, 'nonsense': 1, 'until': 1, 'makes': 1, 'cup': 1, 'water': 1, 'air': 1, 'flash': 1, 'brilliance': 1, 'pair': 1, 'creates': 1, 'showcase': 1, 'responsibility': 1, 'now': 1, 'falls': 1, 'shoulders': 1, 'everything': 1, 'has': 1, 'transpired': 1, 'during': 1, 'absence': 1, 'including': 1, 'getting': 1, 'him': 1, 'up': 1, 'speed': 1, 'internet': 1, 'new': 1, 'technology': 1, 'surprisingly': 1, 'outcome': 1, '90s': 1, 'result': 1, 'which': 1, 'was': 1, 'especially': 1, 'hardcore': 1, 'sega': 1, 'fan': 1, 'wisdom': 1, 'from': 1, 'other': 1, 'experiences': 1, 'grow': 1, 'tackle': 1, 'online': 1, 'journey': 1, 'duo': 1, 

we're getting the necessary data to run the algorithm 

In [8]:
vals = np.logspace(-3,2,11)
smoothing = nb.find_best_smoother(x_tr, y_tr, x_dev, y_dev, vals)
weights = nb.calculating_weights(x_tr, y_tr, smoothing)
genre_list = nb.get_label_count(y_tr)[1]
print("Smoothing Value:", smoothing)

Smoothing Value: 0.03162277660168379


In [9]:
amount_list = nb.get_amount_list(y_tr)
y_pred = nb.predict_all(x_tr, weights, genre_list, amount_list)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_tr = preproc.one_hot_encoding_label(y_tr, genre_list)
acc = eval.accuracy(y_pred, y_tr)
print("Training accuracy :", acc)

f_score = eval.f_score(y_pred, y_tr)
print("F_score :", f_score)

Training accuracy : 0.9909820468271697
F_score : 0.8863446780113446


In [10]:
amount_list = nb.get_amount_list(y_dev)
y_pred = nb.predict_all(x_dev, weights, genre_list, amount_list)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_dev = preproc.one_hot_encoding_label(y_dev, genre_list)
acc = eval.accuracy(y_pred, y_dev)
print("Validation accuracy :", acc)

f_score = eval.f_score(y_pred, y_dev)
print("F_score :", f_score)

Validation accuracy : 0.9743466666666667
F_score : 0.674119241192412


In [11]:
amount_list = nb.get_amount_list(y_te)
y_pred = nb.predict_all(x_te, weights, genre_list, amount_list)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_te = preproc.one_hot_encoding_label(y_te, genre_list)
acc = eval.accuracy(y_pred, y_te)
print("Testing accuracy :", acc)

f_score = eval.f_score(y_pred, y_te)
print("F_score :", f_score)

Testing accuracy : 0.9745733333333333
F_score : 0.6814765324870553


In [12]:
for i in range(5):
    x = test_data.loc[i].at["synopsis"]
    for func in func_list:
        x = func(x)
    y_pred = nb.predict(x, weights, genre_list, amount_list[i])[1]
    print('Input:', test_data.loc[i].at["synopsis"])
    print('Output:', y_pred)

Input: Fourth season of the Shimajirou children's television series.
Output: ['Kids', 'Comedy', 'Adventure', 'Fantasy']
Input: Rodeo is a normal high school boy who aims to be like GRANRODEO. Gra-P, a self-proclaimed music producer who comes from the future, tries to help him.
Output: ['School', 'Comedy']
Input: A shadow painting anime about a timid giraffe named Noop and his hedgehog companion named Harry as they travel their distant star world helping each other.
Output: ['Adventure']
Input: Bundled with the franchise's Kill Me Baby Super Best Album CD (キルミーベイベー・スーパー). OAD adapted eight previously unanimated story episodes from Kazuho's original manga.

(Source: ANN)
Output: ['Music', 'Comedy', 'Fantasy']
Input: The protagonist Qin Chen, who was originally the top genius in the military domain, was conspired by the people to fall into the death canyon in the forbidden land of the mainland. Qin Chen, who was inevitably dead, unexpectedly triggered the power of the mysterious ancient s

In [13]:
weight_path = model_data_path + 'nb_weight'
nb.save_weights(weights, weight_path)
weights = nb.read_weights(pd.read_csv(model_data_path + 'nb_weight'))

genre_data_path = helper_data_path + 'genre_list.csv'
genre_df_dict = {'genres': list(genre_list)}
genre_df = pd.DataFrame(data=genre_df_dict)
genre_df.to_csv(genre_data_path, index=False)

# Naive Bayes with threshold

We'll be using the same naive bayes algorithm but in this case instead of choosing the top n genres we choose a threshold so that the score of the genre is higher than the threshold is selected. All though the score is calculate through the naive bayes algorithm we then divide it by the sentence probabilities in order to get the probability. P(class|document) = P(document|class) * P(class) / P(document). In regular naive bayes we ignore the P(document) since we're choosing the highest score genre but since we're using a threshold we need it.

We process the data in order to run the algorithm

In [14]:
func_list = [preproc.clean_para, preproc.bag_of_words, preproc.remove_stop_words]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

total_count = nb_thres.total_word_count(x_tr, y_tr)[0]
x_tr = nb.prune_vocab(total_count, x_tr, 10)
x_dev = nb.prune_vocab(total_count, x_dev, 10)
x_te = nb.prune_vocab(total_count, x_te, 10)

for i in range(5):
    print(x_tr[i], y_tr[i])

Counter({'had': 2, 'takafumi': 2, 'magical': 2, 'world': 2, 'youtube': 2, 'channel': 2, 'fateful': 1, 'encounter': 1, 'truck': 1, 'rendered': 1, 'yousuke': 1, 'comatose': 1, 'past': 1, '17': 1, 'years': 1, 'when': 1, 'finally': 1, 'regains': 1, 'consciousness': 1, 'begins': 1, 'foreign': 1, 'reveals': 1, 'been': 1, 'transported': 1, 'called': 1, 'gran': 1, 'claims': 1, 'as': 1, 'nonsense': 1, 'until': 1, 'makes': 1, 'cup': 1, 'water': 1, 'air': 1, 'flash': 1, 'brilliance': 1, 'pair': 1, 'creates': 1, 'showcase': 1, 'responsibility': 1, 'now': 1, 'falls': 1, 'shoulders': 1, 'everything': 1, 'has': 1, 'transpired': 1, 'during': 1, 'absence': 1, 'including': 1, 'getting': 1, 'him': 1, 'up': 1, 'speed': 1, 'internet': 1, 'new': 1, 'technology': 1, 'surprisingly': 1, 'outcome': 1, '90s': 1, 'result': 1, 'which': 1, 'was': 1, 'especially': 1, 'hardcore': 1, 'sega': 1, 'fan': 1, 'wisdom': 1, 'from': 1, 'other': 1, 'experiences': 1, 'grow': 1, 'tackle': 1, 'online': 1, 'journey': 1, 'duo': 1, 

We're getting the necessary data to run the algorithm

In [15]:
smoothers = np.logspace(-3,2,11)
thresholds = [0.1 * i for i in range(1,10)]

smoothing, threshold = nb_thres.threshold_find_best_hyperparameter(x_tr, y_tr, x_dev, y_dev, smoothers, thresholds, 10)
weights = nb.calculating_weights(x_tr, y_tr, smoothing)
print("smoothing:", smoothing, ", Threshold:", threshold)

count, total_number = nb_thres.total_word_count(x_tr, y_tr)

smoothing: 0.001 , Threshold: 0.4


In [16]:
genre_list = nb.get_label_count(y_tr)[1]
genre_list = sorted(genre_list)
sentence_probabilites = nb_thres.find_sentence_probabilites(x_tr, count, total_number, smoothing)
y_pred = nb_thres.threshold_predict_all(x_tr, sentence_probabilites, weights, genre_list, threshold)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_tr = preproc.one_hot_encoding_label(y_tr, genre_list)
acc = eval.accuracy(y_pred, y_tr)
print("Training accuracy :", acc)

f_score = eval.f_score(y_pred, y_tr)
print("F_score :", f_score)

Training accuracy : 0.9887962273517001
F_score : 0.8533495051006044


In [17]:
sentence_probabilites = nb_thres.find_sentence_probabilites(x_dev, count, total_number, smoothing)
y_pred = nb_thres.threshold_predict_all(x_dev, sentence_probabilites, weights, genre_list, threshold)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_dev = preproc.one_hot_encoding_label(y_dev, genre_list)
acc = eval.accuracy(y_pred, y_dev)
print("Validation accuracy :", acc)

f_score = eval.f_score(y_pred, y_dev)
print("F_score :", f_score)

Validation accuracy : 0.97452
F_score : 0.5997905759162303


In [18]:
sentence_probabilites = nb_thres.find_sentence_probabilites(x_te, count, total_number, smoothing)
y_pred = nb_thres.threshold_predict_all(x_te, sentence_probabilites, weights, genre_list, threshold)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_te = preproc.one_hot_encoding_label(y_te, genre_list)
acc = eval.accuracy(y_pred, y_te)
print("Testing accuracy :", acc)

f_score = eval.f_score(y_pred, y_te)
print("F_score :", f_score)

Testing accuracy : 0.97396
F_score : 0.5948973242065961


Let's see some result

In [19]:
for i in range(5):
    x = test_data.loc[i].at["synopsis"]
    for func in func_list:
        x = func(x)
    sent_prob = nb_thres.find_sentence_probabilites([x], count, total_number, smoothing)[0]
    y_pred = nb_thres.threshold_predict(x, sent_prob, weights, genre_list, threshold)[1]
    print('Input:', test_data.loc[i].at["synopsis"])
    print('Output:', y_pred)

Input: Fourth season of the Shimajirou children's television series.
Output: ['Adventure', 'Comedy', 'Fantasy', 'Kids']
Input: Rodeo is a normal high school boy who aims to be like GRANRODEO. Gra-P, a self-proclaimed music producer who comes from the future, tries to help him.
Output: ['School']
Input: A shadow painting anime about a timid giraffe named Noop and his hedgehog companion named Harry as they travel their distant star world helping each other.
Output: ['Adventure']
Input: Bundled with the franchise's Kill Me Baby Super Best Album CD (キルミーベイベー・スーパー). OAD adapted eight previously unanimated story episodes from Kazuho's original manga.

(Source: ANN)
Output: ['Music']
Input: The protagonist Qin Chen, who was originally the top genius in the military domain, was conspired by the people to fall into the death canyon in the forbidden land of the mainland. Qin Chen, who was inevitably dead, unexpectedly triggered the power of the mysterious ancient sword.

Three hundred years late

In [20]:
vocab_data_path = helper_data_path + 'vocab_no_stop_word.csv'
vocab_df_dict = {'word': list(count.keys())}
vocab_df = pd.DataFrame(data=vocab_df_dict)
vocab_df.to_csv(vocab_data_path, index=False)

count_data_path = helper_data_path + 'count.csv'
count_df_dict = {'words': [], 'counts': []}
for word in count:
    count_df_dict['words'].append(word)
    count_df_dict['counts'].append(count[word])
count_df = pd.DataFrame(data=count_df_dict)
count_df.to_csv(count_data_path, index=False)

# Logistic Regression with Threshold

We're going to use logistic regression in order to classify. We'll choose a threshold and any genres with a weight that surpass the threshold is selected. In this section we'll treat synopsis with multiple genre as duplicate synopsis with different genre label.

In [23]:
func_list = [preproc.clean_para, preproc.bag_of_words, preproc.remove_stop_words]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

total_count = nb_thres.total_word_count(x_tr, y_tr)[0]
x_tr = nb.prune_vocab(total_count, x_tr, 10)
x_dev = nb.prune_vocab(total_count, x_dev, 10)
x_te = nb.prune_vocab(total_count, x_te, 10)

for i in range(5):
    print(x_tr[i], y_tr[i])

Counter({'had': 2, 'takafumi': 2, 'magical': 2, 'world': 2, 'youtube': 2, 'channel': 2, 'fateful': 1, 'encounter': 1, 'truck': 1, 'rendered': 1, 'yousuke': 1, 'comatose': 1, 'past': 1, '17': 1, 'years': 1, 'when': 1, 'finally': 1, 'regains': 1, 'consciousness': 1, 'begins': 1, 'foreign': 1, 'reveals': 1, 'been': 1, 'transported': 1, 'called': 1, 'gran': 1, 'claims': 1, 'as': 1, 'nonsense': 1, 'until': 1, 'makes': 1, 'cup': 1, 'water': 1, 'air': 1, 'flash': 1, 'brilliance': 1, 'pair': 1, 'creates': 1, 'showcase': 1, 'responsibility': 1, 'now': 1, 'falls': 1, 'shoulders': 1, 'everything': 1, 'has': 1, 'transpired': 1, 'during': 1, 'absence': 1, 'including': 1, 'getting': 1, 'him': 1, 'up': 1, 'speed': 1, 'internet': 1, 'new': 1, 'technology': 1, 'surprisingly': 1, 'outcome': 1, '90s': 1, 'result': 1, 'which': 1, 'was': 1, 'especially': 1, 'hardcore': 1, 'sega': 1, 'fan': 1, 'wisdom': 1, 'from': 1, 'other': 1, 'experiences': 1, 'grow': 1, 'tackle': 1, 'online': 1, 'journey': 1, 'duo': 1, 

We're creating the model for the processing

In [24]:
vocab = nb.count_words(x_tr, y_tr)[1]
genre_list = nb.get_label_count(y_tr)[1]
genre_list = sorted(genre_list)

model = torch.nn.Sequential(
            torch.nn.Linear(len(vocab), len(genre_list), bias=True),
        )
model.add_module('softmax',torch.nn.LogSoftmax(dim=1))
loss = torch.nn.NLLLoss()

We're going to process the data into a one hot encoding format in order to be evaluate by our model.

In [25]:
X_tr_num_genres = lg.make_num_genres(y_tr)
X_tr = preproc.make_numpy_bag_of_word(x_tr, X_tr_num_genres, vocab)
X_tr_var = Variable(torch.from_numpy(X_tr.astype(np.float32)))

X_dev = preproc.make_numpy_bag_of_word(x_dev, [1 for i in range(len(x_dev))], vocab)
X_dev_var = Variable(torch.from_numpy(X_dev.astype(np.float32)))

X_te = preproc.make_numpy_bag_of_word(x_te, [1 for i in range(len(x_te))], vocab)
X_te_var = Variable(torch.from_numpy(X_te.astype(np.float32)))

In [26]:
Y_tr = lg.make_numpy_label(y_tr, genre_list)
Y_dev = preproc.one_hot_encoding_label(y_dev, genre_list)
Y_te = preproc.one_hot_encoding_label(y_te, genre_list)

Y_tr_var = Variable(torch.from_numpy(Y_tr))
Y_dev_var = Variable(torch.from_numpy(Y_dev))
Y_te_var = Variable(torch.from_numpy(Y_te))

We're going to train our data

In [27]:
lg_model_path = model_data_path + "lg.params"
model_trained, losses, accuracies = lg.train_model(loss,model,
                                                       X_tr_var,
                                                       Y_tr_var,
                                                       X_dv_var=X_dev_var,
                                                       Y_dv_var = Y_dev_var,
                                                       num_its= 300,
                                                       threshold = np.log(0.2),
                                                       optim_args={'lr':1},
                                                       param_file = lg_model_path
                                                  )

Epoch 1: Dev Accuracy: 0.96064
Epoch 1: Dev F_score: 0
Epoch 11: Dev Accuracy: 0.9610133333333334
Epoch 11: Dev F_score: 0.020107238605898123
Epoch 21: Dev Accuracy: 0.96208
Epoch 21: Dev F_score: 0.07482108002602471
Epoch 31: Dev Accuracy: 0.9632133333333334
Epoch 31: Dev F_score: 0.1321170179301667
Epoch 41: Dev Accuracy: 0.96404
Epoch 41: Dev F_score: 0.17295308187672495
Epoch 51: Dev Accuracy: 0.9647066666666667
Epoch 51: Dev F_score: 0.20198974977389209
Epoch 61: Dev Accuracy: 0.9653066666666666
Epoch 61: Dev F_score: 0.22651605231866823
Epoch 71: Dev Accuracy: 0.9657866666666667
Epoch 71: Dev F_score: 0.24573780129335684
Epoch 81: Dev Accuracy: 0.9660933333333334
Epoch 81: Dev F_score: 0.25924847072531315
Epoch 91: Dev Accuracy: 0.9664266666666667
Epoch 91: Dev F_score: 0.2726747544771808
Epoch 101: Dev Accuracy: 0.9668266666666666
Epoch 101: Dev F_score: 0.288329519450801
Epoch 111: Dev Accuracy: 0.9674266666666667
Epoch 111: Dev F_score: 0.3104713519616144
Epoch 121: Dev Accura

In [28]:
for i in range(1, 10):
    threshold = np.log(0.1 * i)
    Y_hat = model_trained.forward(X_dev_var).data
    for row in range(Y_hat.size()[0]):
        for column in range(Y_hat.size()[1]):
            if Y_hat[row][column] >= threshold:
                Y_hat[row][column] = 1
            else:
                Y_hat[row][column] = 0
    # compute dev accuracy
    acc = eval.accuracy(Y_hat.data.numpy().astype(int), Y_dev_var.data.numpy())
    f_score = eval.f_score(Y_hat.data.numpy().astype(int), Y_dev_var.data.numpy())
    print('threshold:', 0.1 * i, 'Dev accuracy:', acc, 'Dev f-score:', f_score)

threshold: 0.1 Dev accuracy: 0.973 Dev f-score: 0.5863125638406538
threshold: 0.2 Dev accuracy: 0.96956 Dev f-score: 0.3961914837344618
threshold: 0.30000000000000004 Dev accuracy: 0.9646666666666667 Dev f-score: 0.1920731707317073
threshold: 0.4 Dev accuracy: 0.9630133333333334 Dev f-score: 0.11543367346938775
threshold: 0.5 Dev accuracy: 0.9621066666666667 Dev f-score: 0.07184846505551927
threshold: 0.6000000000000001 Dev accuracy: 0.9615066666666666 Dev f-score: 0.043089161418627765
threshold: 0.7000000000000001 Dev accuracy: 0.9612266666666667 Dev f-score: 0.02937249666221629
threshold: 0.8 Dev accuracy: 0.96092 Dev f-score: 0.014127144298688193
threshold: 0.9 Dev accuracy: 0.9607733333333334 Dev f-score: 0.00675219446320054


Pick the best threshold with the highest f_score

In [29]:
threshold = np.log(0.1)

In [30]:
X_tr = preproc.make_numpy_bag_of_word(x_tr, [1 for i in range(len(x_tr))], vocab)
X_tr_var = Variable(torch.from_numpy(X_tr.astype(np.float32)))


Y_tr = preproc.one_hot_encoding_label(y_tr, genre_list)
Y_tr_var = Variable(torch.from_numpy(Y_tr))

Y_hat = model_trained.forward(X_tr_var).data
for row in range(Y_hat.size()[0]):
    for column in range(Y_hat.size()[1]):
        if Y_hat[row][column] >= threshold:
            Y_hat[row][column] = 1
        else:
            Y_hat[row][column] = 0

acc = eval.accuracy(Y_hat.data.numpy().astype(int), Y_tr_var.data.numpy())
f_score = eval.f_score(Y_hat.data.numpy().astype(int), Y_tr_var.data.numpy())
print('Training accuracy:', acc)
print('Training f-score:', f_score)

Training accuracy: 0.9778423099197485
Training f-score: 0.6685642155285498


In [31]:
Y_hat = model_trained.forward(X_dev_var).data
for row in range(Y_hat.size()[0]):
    for column in range(Y_hat.size()[1]):
        if Y_hat[row][column] >= threshold:
            Y_hat[row][column] = 1
        else:
            Y_hat[row][column] = 0
# compute dev accuracy
acc = eval.accuracy(Y_hat.data.numpy().astype(int), Y_dev_var.data.numpy())
f_score = eval.f_score(Y_hat.data.numpy().astype(int), Y_dev_var.data.numpy())
print('Dev accuracy:', acc)
print('Dev f-score:', f_score)

Dev accuracy: 0.973
Dev f-score: 0.5863125638406538


In [32]:
Y_hat = model_trained.forward(X_te_var).data
for row in range(Y_hat.size()[0]):
    for column in range(Y_hat.size()[1]):
        if Y_hat[row][column] >= threshold:
            Y_hat[row][column] = 1
        else:
            Y_hat[row][column] = 0
# compute dev accuracy
print((Y_hat.data.numpy().astype(int) == Y_te_var.data.numpy()).shape)
acc = eval.accuracy(Y_hat.data.numpy().astype(int), Y_te_var.data.numpy())
f_score = eval.f_score(Y_hat.data.numpy().astype(int), Y_te_var.data.numpy())
print('Test accuracy:', acc)
print('Test f-score:', f_score)

(1000, 75)
Test accuracy: 0.9717066666666667
Test f-score: 0.5655200655200655


Let's see some example of the result

In [33]:
for i in range(5):
    x = test_data.loc[i].at["synopsis"]
    for func in func_list:
        x = func(x)
    x = preproc.make_numpy_bag_of_word([x], [1], vocab)
    x = Variable(torch.from_numpy(x.astype(np.float32)))
    result = []
    Y_hat = model_trained.forward(x).data
    Y_hat = Y_hat[0]
    for row in range(Y_hat.size(dim=0)):
        if Y_hat[row] >= threshold:
            result.append(genre_list[i])
    print('Input:', test_data.loc[i].at["synopsis"])
    print('Output:', result)

Input: Fourth season of the Shimajirou children's television series.
Output: ['Action', 'Action', 'Action', 'Action']
Input: Rodeo is a normal high school boy who aims to be like GRANRODEO. Gra-P, a self-proclaimed music producer who comes from the future, tries to help him.
Output: ['Adult Cast', 'Adult Cast']
Input: A shadow painting anime about a timid giraffe named Noop and his hedgehog companion named Harry as they travel their distant star world helping each other.
Output: ['Adventure']
Input: Bundled with the franchise's Kill Me Baby Super Best Album CD (キルミーベイベー・スーパー). OAD adapted eight previously unanimated story episodes from Kazuho's original manga.

(Source: ANN)
Output: ['Anthropomorphic']
Input: The protagonist Qin Chen, who was originally the top genius in the military domain, was conspired by the people to fall into the death canyon in the forbidden land of the mainland. Qin Chen, who was inevitably dead, unexpectedly triggered the power of the mysterious ancient sword.

# In this section We'll try training the model with no duplicate data. 
So one synopsis has 1 genre label associate with it.

We're going to create the model

In [34]:
model = torch.nn.Sequential(
            torch.nn.Linear(len(vocab), len(genre_list), bias=True),
        )
model.add_module('softmax',torch.nn.LogSoftmax(dim=1))
loss = torch.nn.NLLLoss()

We're going to process the data and then transform it into one hot encoding format.

In [35]:
func_list = [preproc.clean_para, preproc.bag_of_words, preproc.remove_stop_words]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

total_count = nb_thres.total_word_count(x_tr, y_tr)[0]
x_tr = nb.prune_vocab(total_count, x_tr, 10)
x_dev = nb.prune_vocab(total_count, x_dev, 10)
x_te = nb.prune_vocab(total_count, x_te, 10)

In [36]:
X_tr = preproc.make_numpy_bag_of_word(x_tr, [1 for i in range(len(x_tr))], vocab)
X_tr_var = Variable(torch.from_numpy(X_tr.astype(np.float32)))

X_dev = preproc.make_numpy_bag_of_word(x_dev, [1 for i in range(len(x_dev))], vocab)
X_dev_var = Variable(torch.from_numpy(X_dev.astype(np.float32)))

X_te = preproc.make_numpy_bag_of_word(x_te, [1 for i in range(len(x_te))], vocab)
X_te_var = Variable(torch.from_numpy(X_te.astype(np.float32)))

In [37]:
Y_tr = lg.make_numpy_label(np.transpose([y_tr[:,0]]), genre_list)
Y_dev = preproc.one_hot_encoding_label(y_dev, genre_list)
Y_te = preproc.one_hot_encoding_label(y_te, genre_list)

Y_tr_var = Variable(torch.from_numpy(Y_tr))
Y_dev_var = Variable(torch.from_numpy(Y_dev))
Y_te_var = Variable(torch.from_numpy(Y_te))

We're going to train the data

In [38]:
lg_model_path = model_data_path + "lg_no_duplicate.params"
model_trained, losses, accuracies = lg.train_model(loss,model,
                                                       X_tr_var,
                                                       Y_tr_var,
                                                       X_dv_var=X_dev_var,
                                                       Y_dv_var = Y_dev_var,
                                                       num_its= 200,
                                                       threshold = np.log(0.2),
                                                       optim_args={'lr':1},
                                                       param_file = lg_model_path)

Epoch 1: Dev Accuracy: 0.96064
Epoch 1: Dev F_score: 0.008730691739422432
Epoch 11: Dev Accuracy: 0.9616933333333333
Epoch 11: Dev F_score: 0.18588835364125816
Epoch 21: Dev Accuracy: 0.9638933333333334
Epoch 21: Dev F_score: 0.29035639412997905
Epoch 31: Dev Accuracy: 0.9642
Epoch 31: Dev F_score: 0.3193916349809886
Epoch 41: Dev Accuracy: 0.9642266666666667
Epoch 41: Dev F_score: 0.32570997738125157
Epoch 51: Dev Accuracy: 0.96448
Epoch 51: Dev F_score: 0.336322869955157
Epoch 61: Dev Accuracy: 0.9645066666666666
Epoch 61: Dev F_score: 0.3401090728805156
Epoch 71: Dev Accuracy: 0.96456
Epoch 71: Dev F_score: 0.3417533432392273
Epoch 81: Dev Accuracy: 0.96476
Epoch 81: Dev F_score: 0.3462775166955231
Epoch 91: Dev Accuracy: 0.9649733333333333
Epoch 91: Dev F_score: 0.35055624227441284
Epoch 101: Dev Accuracy: 0.9651333333333333
Epoch 101: Dev F_score: 0.3579671004173827
Epoch 111: Dev Accuracy: 0.9652933333333333
Epoch 111: Dev F_score: 0.3621661357510414
Epoch 121: Dev Accuracy: 0.96

In [39]:
for i in range(1, 10):
    threshold = np.log(0.1 * i)
    Y_hat = model_trained.forward(X_dev_var).data
    for row in range(Y_hat.size()[0]):
        for column in range(Y_hat.size()[1]):
            if Y_hat[row][column] >= threshold:
                Y_hat[row][column] = 1
            else:
                Y_hat[row][column] = 0
    # compute dev accuracy
    acc = eval.accuracy(Y_hat.data.numpy().astype(int), Y_dev_var.data.numpy())
    f_score = eval.f_score(Y_hat.data.numpy().astype(int), Y_dev_var.data.numpy())
    print('threshold:', 0.1 * i, 'Dev accuracy:', acc, 'Dev f-score:', f_score)

threshold: 0.1 Dev accuracy: 0.96132 Dev f-score: 0.38105397909110306
threshold: 0.2 Dev accuracy: 0.9654933333333333 Dev f-score: 0.3675464320625611
threshold: 0.30000000000000004 Dev accuracy: 0.9664666666666667 Dev f-score: 0.34522259828169743
threshold: 0.4 Dev accuracy: 0.96644 Dev f-score: 0.3068025337372625
threshold: 0.5 Dev accuracy: 0.9662 Dev f-score: 0.2734307824591574
threshold: 0.6000000000000001 Dev accuracy: 0.9657333333333333 Dev f-score: 0.24544920728126834
threshold: 0.7000000000000001 Dev accuracy: 0.9647066666666667 Dev f-score: 0.19714892326357292
threshold: 0.8 Dev accuracy: 0.9638266666666667 Dev f-score: 0.15456528513555626
threshold: 0.9 Dev accuracy: 0.9627066666666667 Dev f-score: 0.10093217614914819


Pick the best threshold with the highest f_score

In [40]:
threshold = np.log(0.1)

In [41]:
X_tr = preproc.make_numpy_bag_of_word(x_tr, [1 for i in range(len(x_tr))], vocab)
X_tr_var = Variable(torch.from_numpy(X_tr.astype(np.float32)))


Y_tr = preproc.one_hot_encoding_label(y_tr, genre_list)
Y_tr_var = Variable(torch.from_numpy(Y_tr))

Y_hat = model_trained.forward(X_tr_var).data
for row in range(Y_hat.size()[0]):
    for column in range(Y_hat.size()[1]):
        if Y_hat[row][column] >= threshold:
            Y_hat[row][column] = 1
        else:
            Y_hat[row][column] = 0

acc = eval.accuracy(Y_hat.data.numpy().astype(int), Y_tr_var.data.numpy())
f_score = eval.f_score(Y_hat.data.numpy().astype(int), Y_tr_var.data.numpy())
print('Training accuracy:', acc)
print('Training f-score:', f_score)

Training accuracy: 0.963271283196823
Training f-score: 0.4070522238546814


In [42]:
Y_hat = model_trained.forward(X_dev_var).data
for row in range(Y_hat.size()[0]):
    for column in range(Y_hat.size()[1]):
        if Y_hat[row][column] >= threshold:
            Y_hat[row][column] = 1
        else:
            Y_hat[row][column] = 0
# compute dev accuracy
acc = eval.accuracy(Y_hat.data.numpy().astype(int), Y_dev_var.data.numpy())
f_score = eval.f_score(Y_hat.data.numpy().astype(int), Y_dev_var.data.numpy())
print('Dev accuracy:', acc)
print('Dev f-score:', f_score)

Dev accuracy: 0.96132
Dev f-score: 0.38105397909110306


In [43]:
Y_hat = model_trained.forward(X_te_var).data
for row in range(Y_hat.size()[0]):
    for column in range(Y_hat.size()[1]):
        if Y_hat[row][column] >= threshold:
            Y_hat[row][column] = 1
        else:
            Y_hat[row][column] = 0
# compute dev accuracy
acc = eval.accuracy(Y_hat.data.numpy().astype(int), Y_te_var.data.numpy())
f_score = eval.f_score(Y_hat.data.numpy().astype(int), Y_te_var.data.numpy())
print('Test accuracy:', acc)
print('Test f-score:', f_score)

Test accuracy: 0.9600266666666667
Test f-score: 0.3693731594446782


Let's see some example of the result

In [44]:
for i in range(5):
    x = test_data.loc[i].at["synopsis"]
    for func in func_list:
        x = func(x)
    x = preproc.make_numpy_bag_of_word([x], [1], vocab)
    x = Variable(torch.from_numpy(x.astype(np.float32)))
    result = []
    Y_hat = model_trained.forward(x).data
    Y_hat = Y_hat[0]
    for row in range(Y_hat.size(dim=0)):
        if Y_hat[row] >= threshold:
            result.append(genre_list[i])
    print('Input:', test_data.loc[i].at["synopsis"])
    print('Output:', result)

Input: Fourth season of the Shimajirou children's television series.
Output: ['Action', 'Action', 'Action']
Input: Rodeo is a normal high school boy who aims to be like GRANRODEO. Gra-P, a self-proclaimed music producer who comes from the future, tries to help him.
Output: ['Adult Cast', 'Adult Cast', 'Adult Cast']
Input: A shadow painting anime about a timid giraffe named Noop and his hedgehog companion named Harry as they travel their distant star world helping each other.
Output: ['Adventure']
Input: Bundled with the franchise's Kill Me Baby Super Best Album CD (キルミーベイベー・スーパー). OAD adapted eight previously unanimated story episodes from Kazuho's original manga.

(Source: ANN)
Output: ['Anthropomorphic', 'Anthropomorphic']
Input: The protagonist Qin Chen, who was originally the top genius in the military domain, was conspired by the people to fall into the death canyon in the forbidden land of the mainland. Qin Chen, who was inevitably dead, unexpectedly triggered the power of the my

# LSTM model

We're going to use a lstm model in order to evaluate our data.

We're going to process the data in order to use our algorithm

In [21]:
func_list = [preproc.clean_para, preproc.sentence_to_list]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

In [22]:
vocab = nb.count_words(x_tr, y_tr)[1]
genre_count, genre_list = nb.get_label_count(y_tr)
word_to_index = LSTM.word_to_ix(vocab)

embedding = LSTM.load_glove_vectors(vocab)
loss_weight = LSTM.calculate_simple_loss_weights(genre_count, genre_list)

model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
loss = torch.nn.BCELoss(reduction = "none")

In [23]:
X_tr = [LSTM.prepare_sequence(x, word_to_index) for x in x_tr]
# X_tr = biLSTM.create_x_numpy(X_tr, max_len).astype(int)
# X_tr_var = Variable(torch.from_numpy(X_tr))

X_dev = [LSTM.prepare_sequence(x, word_to_index) for x in x_dev]
# X_dev = biLSTM.create_x_numpy(X_dev, max_len).astype(int)
# X_dev_var = Variable(torch.from_numpy(X_dev))

X_te = [LSTM.prepare_sequence(x, word_to_index) for x in x_te]
# X_te = biLSTM.create_x_numpy(X_te, max_len).astype(int)
# X_te_var = Variable(torch.from_numpy(X_te.astype(int)))

In [24]:
Y_tr = preproc.one_hot_encoding_label(y_tr, genre_list).astype(np.float32)
Y_dev = preproc.one_hot_encoding_label(y_dev, genre_list).astype(np.float32)
Y_te = preproc.one_hot_encoding_label(y_te, genre_list).astype(np.float32)

Y_tr_var = Variable(torch.from_numpy(Y_tr))
Y_dev_var = Variable(torch.from_numpy(Y_dev))
Y_te_var = Variable(torch.from_numpy(Y_te))

In [46]:
print(embedding.shape)

(35439, 100)


We're going to train our data

In [47]:
loss_weight = torch.tensor([1 for i in range(len(genre_list))])
for lr in [0.01, 0.1]:
    print("learning rate:", lr)
    model_path = model_data_path + 'lstm_no_loss_weight_lr_' + str(lr) + '.params'
    model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
    no_loss_weight_model, losses, accuracies = LSTM.train_model(model, X_tr, Y_tr, loss_weight, loss_weight,
                                                                X_dev, Y_dev, num_its= 20,
                                                                status_frequency=2, 
                                                                optim_args = {'lr':lr}, param_file = model_path)

learning rate: 0.01
Epoch 1: Dev Accuracy: 0.96064
Epoch 1: Dev F_score: 0
Epoch 3: Dev Accuracy: 0.96064
Epoch 3: Dev F_score: 0
Epoch 5: Dev Accuracy: 0.96064
Epoch 5: Dev F_score: 0
Epoch 7: Dev Accuracy: 0.96064
Epoch 7: Dev F_score: 0
Epoch 9: Dev Accuracy: 0.96064
Epoch 9: Dev F_score: 0
Epoch 11: Dev Accuracy: 0.96064
Epoch 11: Dev F_score: 0
Epoch 13: Dev Accuracy: 0.96064
Epoch 13: Dev F_score: 0
Epoch 15: Dev Accuracy: 0.96064
Epoch 15: Dev F_score: 0
Epoch 17: Dev Accuracy: 0.96064
Epoch 17: Dev F_score: 0
Epoch 19: Dev Accuracy: 0.96064
Epoch 19: Dev F_score: 0
learning rate: 0.1
Epoch 1: Dev Accuracy: 0.96064
Epoch 1: Dev F_score: 0
Epoch 3: Dev Accuracy: 0.96024
Epoch 3: Dev F_score: 0.05393401015228427
Epoch 5: Dev Accuracy: 0.9625733333333333
Epoch 5: Dev F_score: 0.1547726588376995
Epoch 7: Dev Accuracy: 0.9613733333333333
Epoch 7: Dev F_score: 0.15955903684363212
Epoch 9: Dev Accuracy: 0.9620933333333334
Epoch 9: Dev F_score: 0.14082804472650348
Epoch 11: Dev Accuracy

Our data would be scewed to just predict the most common genres so we add a simple function to counter it called simple loss weight. during training it would scale the loss in proportion to the number of synopsis associate with that genre. 

In [48]:
simple_loss_weight = LSTM.calculate_simple_loss_weights(genre_count, genre_list)
for lr in [0.01, 0.1]:
    print("lreaning rate:", lr)
    model_path = model_data_path + 'lstm_simple_loss_weight_lr_' + str(lr) + '.params'
    model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
    simple_loss_weight_01_model, losses, accuracies = LSTM.train_model(model, X_tr, Y_tr, simple_loss_weight, 
                                                                       simple_loss_weight, X_dev, Y_dev, 
                                                                       num_its= 20, 
                                                                       status_frequency=2, 
                                                                       optim_args = {'lr':lr}, 
                                                                       param_file = model_path)

lreaning rate: 0.01
Epoch 1: Dev Accuracy: 0.5939066666666667
Epoch 1: Dev F_score: 0.09989065224458432
Epoch 3: Dev Accuracy: 0.70764
Epoch 3: Dev F_score: 0.11737712836613935
Epoch 5: Dev Accuracy: 0.76608
Epoch 5: Dev F_score: 0.1317430466198159
Epoch 7: Dev Accuracy: 0.8139466666666667
Epoch 7: Dev F_score: 0.13736399604352129
Epoch 9: Dev Accuracy: 0.8880533333333334
Epoch 9: Dev F_score: 0.17524557956777997
Epoch 11: Dev Accuracy: 0.9127066666666667
Epoch 11: Dev F_score: 0.18922600619195046
Epoch 13: Dev Accuracy: 0.9139466666666667
Epoch 13: Dev F_score: 0.18960321446509293
Epoch 15: Dev Accuracy: 0.91404
Epoch 15: Dev F_score: 0.18936250471520177
Epoch 17: Dev Accuracy: 0.9141866666666667
Epoch 17: Dev F_score: 0.18901209677419353
Epoch 19: Dev Accuracy: 0.9144933333333334
Epoch 19: Dev F_score: 0.1889465030985203
lreaning rate: 0.1
Epoch 1: Dev Accuracy: 0.9038
Epoch 1: Dev F_score: 0.21173385775155687
Epoch 3: Dev Accuracy: 0.9485866666666667
Epoch 3: Dev F_score: 0.22195318

We're going to use a much more complex function to counter unbalanced in our data.

In [49]:
w_p, w_n = LSTM.calculate_complex_loss_weights(genre_count, genre_list, len(X_tr))
for lr in [0.01, 0.1]:
    model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
    model_path = model_data_path + 'lstm_complex_loss_weight_lr_' + str(lr) + '.params'
    complex_loss_weight_001_model, losses, accuracies = LSTM.train_model(model, X_tr, Y_tr, w_p, w_n,
                                                                        X_dev, Y_dev, num_its= (50 if lr == 0.1 else 20),
                                                                        status_frequency=2, 
                                                                        optim_args = {'lr':lr}, param_file = model_path)

Epoch 1: Dev Accuracy: 0.5897333333333333
Epoch 1: Dev F_score: 0.09807714855199906
Epoch 3: Dev Accuracy: 0.7157066666666667
Epoch 3: Dev F_score: 0.12764912854921856
Epoch 5: Dev Accuracy: 0.5834666666666667
Epoch 5: Dev F_score: 0.1110858183473708
Epoch 7: Dev Accuracy: 0.5240933333333333
Epoch 7: Dev F_score: 0.1066700037542235
Epoch 9: Dev Accuracy: 0.58888
Epoch 9: Dev F_score: 0.11279277205501526
Epoch 11: Dev Accuracy: 0.5150933333333333
Epoch 11: Dev F_score: 0.11457369625553877
Epoch 13: Dev Accuracy: 0.5772933333333333
Epoch 13: Dev F_score: 0.1318765574084723
Epoch 15: Dev Accuracy: 0.60804
Epoch 15: Dev F_score: 0.13986013986013984
Epoch 17: Dev Accuracy: 0.62088
Epoch 17: Dev F_score: 0.13721325403568396
Epoch 19: Dev Accuracy: 0.6090133333333333
Epoch 19: Dev F_score: 0.13899817957601737
Epoch 1: Dev Accuracy: 0.61816
Epoch 1: Dev F_score: 0.10768367919237241
Epoch 3: Dev Accuracy: 0.51596
Epoch 3: Dev F_score: 0.11358808448296911
Epoch 5: Dev Accuracy: 0.524333333333333

In [50]:
y_pred = np.zeros((Y_tr.shape[0],Y_tr.shape[1]))
index = 0
for Xtr, Ytr in zip(X_tr, Y_tr):             
    X_tr_var = Variable(torch.Tensor(Xtr)).long()
    # run forward on dev data
    Y_hat = complex_loss_weight_001_model(X_tr_var)

    # compute dev accuracy
    for i in range(Y_hat.size(dim=0)):
        if Y_hat[i] >= 0.8:
            Y_hat[i] = 1
        else:
            Y_hat[i] = 0
    y_pred[index] = Y_hat.tolist()
    index += 1
    # save
acc = eval.accuracy(y_pred, Y_tr)
f_score = eval.f_score(y_pred, Y_tr)
print('Training accuracy:', acc)
print('Training f-score:', f_score)

Training accuracy: 0.9909290973773476
Training f-score: 0.8866465406724288


In [51]:
y_pred = np.zeros((Y_dev.shape[0],Y_dev.shape[1]))
index = 0
for Xdv, Ydv in zip(X_dev, Y_dev):             
    X_dev_var = Variable(torch.Tensor(Xdv)).long()
    # run forward on dev data
    Y_hat = complex_loss_weight_001_model(X_dev_var)

    # compute dev accuracy
    for i in range(Y_hat.size(dim=0)):
        if Y_hat[i] >= 0.8:
            Y_hat[i] = 1
        else:
            Y_hat[i] = 0
    y_pred[index] = Y_hat.tolist()
    index += 1
    # save
acc = eval.accuracy(y_pred, Y_dev)
f_score = eval.f_score(y_pred, Y_dev)
print('Dev accuracy:', acc)
print('Dev f-score:', f_score)

Dev accuracy: 0.97156
Dev f-score: 0.6108374384236452


In [52]:
y_pred = np.zeros((Y_te.shape[0],Y_te.shape[1]))
index = 0
for Xte, Yte in zip(X_te, Y_te):             
    X_te_var = Variable(torch.Tensor(Xte)).long()
    # run forward on dev data
    Y_hat = complex_loss_weight_001_model(X_te_var)

    # compute dev accuracy
    for i in range(Y_hat.size(dim=0)):
        if Y_hat[i] >= 0.8:
            Y_hat[i] = 1
        else:
            Y_hat[i] = 0
    y_pred[index] = Y_hat.tolist()
    index += 1
    # save
acc = eval.accuracy(y_pred, Y_te)
f_score = eval.f_score(y_pred, Y_te)
print('Test accuracy:', acc)
print('Test f-score:', f_score)

Test accuracy: 0.9694533333333333
Test f-score: 0.594585029198372


Let's see some example

In [53]:
genre_list = sorted(genre_list)
for i in range(5):
    x = test_data.loc[i].at["synopsis"]
    for func in func_list:
        x = func(x)
    x = LSTM.prepare_sequence(x, word_to_index)
    Y_hat = complex_loss_weight_001_model(Variable(torch.Tensor(x)).long())

    result = []
    for index in range(Y_hat.size(dim=0)):
        if Y_hat[index] >= 0.8:
            result.append(genre_list[index])
    print('Input:', test_data.loc[i].at["synopsis"])
    print('Output:', result)

Input: The film centers on Anemone, a girl who lost her father in a battle in Toyko seven years prior to the film's story, leaving her with only her stuffed toy Gulliver, and the AI concierge Dominikids for emotional support. Now she is a key part of a strategy by the experimental unit "Acid" to combat the seventh Eureka, "Eureka Seven," an enemy of humanity that has killed 2.6 billion people. Driven to the brink, all of humanity entrusts its hope to Anemone as she dives deep into the interior of Eureka Seven.

(Source: ANN)
Output: ['Adventure', 'Comedy', 'Fantasy', 'Kids']
Input: The film centers on Anemone, a girl who lost her father in a battle in Toyko seven years prior to the film's story, leaving her with only her stuffed toy Gulliver, and the AI concierge Dominikids for emotional support. Now she is a key part of a strategy by the experimental unit "Acid" to combat the seventh Eureka, "Eureka Seven," an enemy of humanity that has killed 2.6 billion people. Driven to the brink, 

In [26]:
vocab_data_path = helper_data_path + 'vocab_with_stop_word.csv'
vocab_df_dict = {'word': list(vocab)}
vocab_df = pd.DataFrame(data=vocab_df_dict)
vocab_df.to_csv(vocab_data_path, index=False)