In [1]:
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable
from classifier import eval, naive_bayes as nb, naive_bayes_thres as nb_thres, preproc, logistic_regression as lg, LSTM
from data import data
from os.path import exists

In [2]:
exists('data/myanimelist.csv')

True

We write the data that we request from mynaimelist api and then we read it

In [3]:
data.write_data(100,10000)

(None, None)

In [4]:
dataset = pd.read_csv('data/myanimelist.csv')

We split the data into train, dev and test data and then read it

In [5]:
data.split_data(dataset, 1000, 1000)

In [6]:
train_data = pd.read_csv('data/train_data.csv')
test_data = pd.read_csv('data/test_data.csv')
dev_data = pd.read_csv('data/dev_data.csv')

# Naive Bayes

In [18]:
func_list = [preproc.clean_para, preproc.bag_of_words, preproc.remove_stop_words]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

In [14]:
vals = np.logspace(-3,2,11)
smoothing = nb.find_best_smoother(x_tr, y_tr, x_dev, y_dev, vals)
weights = nb.calculating_weights(x_tr, y_tr, smoothing)
genre_list = nb.get_label_count(y_tr)[1]

In [15]:
amount_list = nb.get_amount_list(y_tr)
y_pred = nb.predict_all(x_tr, weights, genre_list, amount_list)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_tr = preproc.one_hot_encoding_label(y_tr, genre_list)
acc = eval.accuracy(y_pred, y_tr)
print("Training accuracy :", acc)

f_score = eval.f_score(y_pred, y_tr)
print("F_score :", f_score)

Training accuracy : 0.9972135352031107
F_score : 0.9648815482148816


In [19]:
amount_list = nb.get_amount_list(y_dev)
y_pred = nb.predict_all(x_dev, weights, genre_list, amount_list)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_dev = preproc.one_hot_encoding_label(y_dev, genre_list)
acc = eval.accuracy(y_pred, y_dev)
print("Validation accuracy :", acc)

f_score = eval.f_score(y_pred, y_dev)
print("F_score :", f_score)

Validation accuracy : 0.9773333333333334
F_score : 0.712059620596206


In [20]:
amount_list = nb.get_amount_list(y_te)
y_pred = nb.predict_all(x_te, weights, genre_list, amount_list)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_te = preproc.one_hot_encoding_label(y_te, genre_list)
acc = eval.accuracy(y_pred, y_te)
print("Testing accuracy :", acc)

f_score = eval.f_score(y_pred, y_te)
print("F_score :", f_score)

Testing accuracy : 0.9773733333333333
F_score : 0.7165525304827125


In [21]:
nb.save_weights(weights)
weights = pd.read_csv('data/nb_weight.csv')

# Naive Bayes with threshold

In [25]:
func_list = [preproc.clean_para, preproc.bag_of_words, preproc.remove_stop_words]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

In [23]:
smoothers = np.logspace(-3,2,11)
thresholds = [0.1 * i for i in range(10)]

smoothing, threshold = nb_thres.threshold_find_best_hyperparameter(x_tr, y_tr, x_dev, y_dev, smoothers, thresholds, 10)
weights = nb.calculating_weights(x_tr, y_tr, smoothing)
print(smoothing, threshold)

count, total_number = nb_thres.total_word_count(x_tr, y_tr)

  if score[genre] - sent_prob >= np.log(threshold):


0.001 0.5


In [26]:
genre_list = nb.get_label_count(y_tr)[1]
sentence_probabilites = nb_thres.find_sentence_probabilites(x_tr, count, total_number, smoothing)
y_pred = nb_thres.threshold_predict_all(x_tr, sentence_probabilites, weights, genre_list, threshold)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_tr = preproc.one_hot_encoding_label(y_tr, genre_list)
acc = eval.accuracy(y_pred, y_tr)
print("Training accuracy :", acc)

f_score = eval.f_score(y_pred, y_tr)
print("F_score :", f_score)

Training accuracy : 0.9949284355092248
F_score : 0.9343865733307645


In [27]:
sentence_probabilites = nb_thres.find_sentence_probabilites(x_dev, count, total_number, smoothing)
y_pred = nb_thres.threshold_predict_all(x_dev, sentence_probabilites, weights, genre_list, threshold)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_dev = preproc.one_hot_encoding_label(y_dev, genre_list)
acc = eval.accuracy(y_pred, y_dev)
print("Validation accuracy :", acc)

f_score = eval.f_score(y_pred, y_dev)
print("F_score :", f_score)

Validation accuracy : 0.9764266666666667
F_score : 0.6153176675369887


In [28]:
sentence_probabilites = nb_thres.find_sentence_probabilites(x_te, count, total_number, smoothing)
y_pred = nb_thres.threshold_predict_all(x_te, sentence_probabilites, weights, genre_list, threshold)

y_pred = preproc.one_hot_encoding_label(y_pred, genre_list)
y_te = preproc.one_hot_encoding_label(y_te, genre_list)
acc = eval.accuracy(y_pred, y_te)
print("Testing accuracy :", acc)

f_score = eval.f_score(y_pred, y_te)
print("F_score :", f_score)

Testing accuracy : 0.9760133333333333
F_score : 0.6101841820151679


In [29]:
nb.save_weights(weights, 'weight_thres.csv')
weights = pd.read_csv('weight_thres.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'weight_thres.csv'

# Logistic Regression with Threshold

In [7]:
func_list = [preproc.clean_para, preproc.bag_of_words, preproc.remove_stop_words]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

In [8]:
vocab = nb.count_words(x_tr, y_tr)[1]
genre_list = nb.get_label_count(y_tr)[1]

model = torch.nn.Sequential(
            torch.nn.Linear(len(vocab), len(genre_list), bias=True),
        )
model.add_module('softmax',torch.nn.LogSoftmax(dim=1))
loss = torch.nn.NLLLoss()

In [9]:
X_tr_num_genres = lg.make_num_genres(y_tr)
X_tr = preproc.make_numpy_bag_of_word(x_tr, X_tr_num_genres, vocab)
X_tr_var = Variable(torch.from_numpy(X_tr.astype(np.float32)))

X_dev = preproc.make_numpy_bag_of_word(x_dev, [1 for i in range(len(x_dev))], vocab)
X_dev_var = Variable(torch.from_numpy(X_dev.astype(np.float32)))

X_te = preproc.make_numpy_bag_of_word(x_te, [1 for i in range(len(x_te))], vocab)
X_te_var = Variable(torch.from_numpy(X_te.astype(np.float32)))

In [10]:
Y_tr = lg.make_numpy_label(y_tr, genre_list)
Y_dev = preproc.one_hot_encoding_label(y_dev, genre_list)
Y_te = preproc.one_hot_encoding_label(y_te, genre_list)

Y_tr_var = Variable(torch.from_numpy(Y_tr))
Y_dev_var = Variable(torch.from_numpy(Y_dev))
Y_te_var = Variable(torch.from_numpy(Y_te))

In [11]:
model_trained, losses, accuracies = lg.train_model(loss,model,
                                                       X_tr_var,
                                                       Y_tr_var,
                                                       X_dv_var=X_dev_var,
                                                       Y_dv_var = Y_dev_var,
                                                       num_its=300,
                                                       threshold = np.log(0.2),
                                                       optim_args={'lr':1},
                                                       param_file = "lg_best.params"
                                                  )

Epoch 1: Dev Accuracy: 0.96064
Epoch 1: Dev F_score: 0
Epoch 11: Dev Accuracy: 0.9610266666666667
Epoch 11: Dev F_score: 0.02011397921555481
Epoch 21: Dev Accuracy: 0.96204
Epoch 21: Dev F_score: 0.07354376830458836
Epoch 31: Dev Accuracy: 0.96324
Epoch 31: Dev F_score: 0.1332914177931468
Epoch 41: Dev Accuracy: 0.96408
Epoch 41: Dev F_score: 0.17260442260442257
Epoch 51: Dev Accuracy: 0.9647733333333334
Epoch 51: Dev F_score: 0.2027761013880507
Epoch 61: Dev Accuracy: 0.96536
Epoch 61: Dev F_score: 0.22862232779097388
Epoch 71: Dev Accuracy: 0.96584
Epoch 71: Dev F_score: 0.2486803519061583
Epoch 81: Dev Accuracy: 0.9662266666666667
Epoch 81: Dev F_score: 0.2638767800058122
Epoch 91: Dev Accuracy: 0.9666133333333333
Epoch 91: Dev F_score: 0.27880184331797236


We'll try training the model with no duplicate data

In [26]:
model = torch.nn.Sequential(
            torch.nn.Linear(len(vocab), len(genre_list), bias=True),
        )
model.add_module('softmax',torch.nn.LogSoftmax(dim=1))
loss = torch.nn.NLLLoss()

In [27]:
func_list = [preproc.clean_para, preproc.bag_of_words, preproc.remove_stop_words]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

In [28]:
X_tr = preproc.make_numpy_bag_of_word(x_tr, [1 for i in range(len(x_tr))], vocab)
X_tr_var = Variable(torch.from_numpy(X_tr.astype(np.float32)))

X_dev = preproc.make_numpy_bag_of_word(x_dev, [1 for i in range(len(x_dev))], vocab)
X_dev_var = Variable(torch.from_numpy(X_dev.astype(np.float32)))

X_te = preproc.make_numpy_bag_of_word(x_te, [1 for i in range(len(x_te))], vocab)
X_te_var = Variable(torch.from_numpy(X_te.astype(np.float32)))

In [29]:
Y_tr = lg.make_numpy_label(np.transpose([y_tr[:,0]]), genre_list)
Y_dev = preproc.one_hot_encoding_label(y_dev, genre_list)
Y_te = preproc.one_hot_encoding_label(y_te, genre_list)

Y_tr_var = Variable(torch.from_numpy(Y_tr))
Y_dev_var = Variable(torch.from_numpy(Y_dev))
Y_te_var = Variable(torch.from_numpy(Y_te))

In [30]:
model_trained, losses, accuracies = lg.train_model(loss,model,
                                                       X_tr_var,
                                                       Y_tr_var,
                                                       X_dv_var=X_dev_var,
                                                       Y_dv_var = Y_dev_var,
                                                       num_its=200,
                                                       threshold = np.log(0.2),
                                                       optim_args={'lr':1},
                                                      param_file = "lg_1_best.params")

Epoch 1: Dev Accuracy: 0.96064
Epoch 1: Dev F_score: 0
Epoch 11: Dev Accuracy: 0.96064
Epoch 11: Dev F_score: 0.009395973154362415
Epoch 21: Dev Accuracy: 0.9606666666666667
Epoch 21: Dev F_score: 0.07057340894770006
Epoch 31: Dev Accuracy: 0.9607866666666667
Epoch 31: Dev F_score: 0.09812940815700706
Epoch 41: Dev Accuracy: 0.96072
Epoch 41: Dev F_score: 0.11158021712907117
Epoch 51: Dev Accuracy: 0.96088
Epoch 51: Dev F_score: 0.12417910447761196
Epoch 61: Dev Accuracy: 0.9611466666666667
Epoch 61: Dev F_score: 0.14041297935103242
Epoch 71: Dev Accuracy: 0.96116
Epoch 71: Dev F_score: 0.14749780509218616
Epoch 81: Dev Accuracy: 0.9611466666666667
Epoch 81: Dev F_score: 0.1533991865194654
Epoch 91: Dev Accuracy: 0.96144
Epoch 91: Dev F_score: 0.16991963260619977
Epoch 101: Dev Accuracy: 0.96156
Epoch 101: Dev F_score: 0.17605030008573877
Epoch 111: Dev Accuracy: 0.9618
Epoch 111: Dev F_score: 0.18584825234441602
Epoch 121: Dev Accuracy: 0.9620266666666667
Epoch 121: Dev F_score: 0.197

# LSTM model

In [8]:
func_list = [preproc.clean_para, preproc.sentence_to_list]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

In [9]:
vocab = nb.count_words(x_tr, y_tr)[1]
genre_count, genre_list = nb.get_label_count(y_tr)
word_to_index = LSTM.word_to_ix(vocab)

embedding = LSTM.load_glove_vectors(vocab)
loss_weight = LSTM.calculate_loss_weights(genre_count, genre_list)

model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
loss = torch.nn.BCELoss(reduction = "none")

In [10]:
X_tr = [LSTM.prepare_sequence(x, word_to_index) for x in x_tr]
# X_tr = biLSTM.create_x_numpy(X_tr, max_len).astype(int)
# X_tr_var = Variable(torch.from_numpy(X_tr))

X_dev = [LSTM.prepare_sequence(x, word_to_index) for x in x_dev]
# X_dev = biLSTM.create_x_numpy(X_dev, max_len).astype(int)
# X_dev_var = Variable(torch.from_numpy(X_dev))

X_te = [LSTM.prepare_sequence(x, word_to_index) for x in x_te]
# X_te = biLSTM.create_x_numpy(X_te, max_len).astype(int)
# X_te_var = Variable(torch.from_numpy(X_te.astype(int)))

In [11]:
Y_tr = preproc.one_hot_encoding_label(y_tr, genre_list).astype(np.float32)
Y_dev = preproc.one_hot_encoding_label(y_dev, genre_list).astype(np.float32)
Y_te = preproc.one_hot_encoding_label(y_te, genre_list).astype(np.float32)

Y_tr_var = Variable(torch.from_numpy(Y_tr))
Y_dev_var = Variable(torch.from_numpy(Y_dev))
Y_te_var = Variable(torch.from_numpy(Y_te))

In [11]:
print(embedding.shape)

(35439, 100)


In [12]:
for i in range(len(X_tr)):
    model(Variable(torch.Tensor(X_tr[i]).long()))

In [13]:
simple_loss_weight_001_model, losses, accuracies = LSTM.train_model(loss, model, X_tr, Y_tr, loss_weight, 
                                        X_dev, Y_dev, num_its=20, status_frequency=2, 
                                        optim_args = {'lr':0.01}, param_file = 'lstm.params')

Epoch 1: Dev Accuracy: 0.64788
Epoch 1: Dev F_score: 0.06718236727773656
Epoch 3: Dev Accuracy: 0.7478533333333334
Epoch 3: Dev F_score: 0.07989101347735124
Epoch 5: Dev Accuracy: 0.80204
Epoch 5: Dev F_score: 0.08176139526253942
Epoch 7: Dev Accuracy: 0.8369333333333333
Epoch 7: Dev F_score: 0.0794821616739425
Epoch 9: Dev Accuracy: 0.8820133333333333
Epoch 9: Dev F_score: 0.08461777180097238
Epoch 11: Dev Accuracy: 0.9215466666666666
Epoch 11: Dev F_score: 0.09809932556713673
Epoch 13: Dev Accuracy: 0.9338
Epoch 13: Dev F_score: 0.09874750408422579
Epoch 15: Dev Accuracy: 0.9384133333333333
Epoch 15: Dev F_score: 0.08841523583974738
Epoch 17: Dev Accuracy: 0.9398133333333333
Epoch 17: Dev F_score: 0.08363784003248073
Epoch 19: Dev Accuracy: 0.9432
Epoch 19: Dev F_score: 0.08426483233018056


In [14]:
model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
simple_loss_weight_01_model, losses, accuracies = LSTM.train_model(loss, model, X_tr, Y_tr, loss_weight, 
                                        X_dev, Y_dev, num_its=20, status_frequency=2, 
                                        optim_args = {'lr':0.1}, param_file = 'lstm.params')

Epoch 1: Dev Accuracy: 0.87384
Epoch 1: Dev F_score: 0.22455335190952302
Epoch 3: Dev Accuracy: 0.9417466666666666
Epoch 3: Dev F_score: 0.26633081444164564
Epoch 5: Dev Accuracy: 0.9418533333333333
Epoch 5: Dev F_score: 0.2661955241460542
Epoch 7: Dev Accuracy: 0.9574666666666667
Epoch 7: Dev F_score: 0.19322205361659078
Epoch 9: Dev Accuracy: 0.95748
Epoch 9: Dev F_score: 0.19327093346825197
Epoch 11: Dev Accuracy: 0.9574933333333333
Epoch 11: Dev F_score: 0.1933198380566802
Epoch 13: Dev Accuracy: 0.9574933333333333
Epoch 13: Dev F_score: 0.1933198380566802
Epoch 15: Dev Accuracy: 0.9574933333333333
Epoch 15: Dev F_score: 0.1933198380566802
Epoch 17: Dev Accuracy: 0.9574933333333333
Epoch 17: Dev F_score: 0.1933198380566802
Epoch 19: Dev Accuracy: 0.9574933333333333
Epoch 19: Dev F_score: 0.1933198380566802


In [15]:
model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
loss_weight = torch.tensor([1 for i in range(len(genre_list))])
no_loss_weight_model, losses, accuracies = LSTM.train_model(loss, model, X_tr, Y_tr,loss_weight, 
                                        X_dev, Y_dev, num_its=20, status_frequency=2, 
                                        optim_args = {'lr':0.01}, param_file = 'lstm_no_loss_weight.params')

Epoch 1: Dev Accuracy: 0.96064
Epoch 1: Dev F_score: 0
Epoch 3: Dev Accuracy: 0.96064
Epoch 3: Dev F_score: 0
Epoch 5: Dev Accuracy: 0.96064
Epoch 5: Dev F_score: 0
Epoch 7: Dev Accuracy: 0.96064
Epoch 7: Dev F_score: 0
Epoch 9: Dev Accuracy: 0.96064
Epoch 9: Dev F_score: 0
Epoch 11: Dev Accuracy: 0.9606533333333334
Epoch 11: Dev F_score: 0.0006772773450728074
Epoch 13: Dev Accuracy: 0.9606533333333334
Epoch 13: Dev F_score: 0.0006772773450728074
Epoch 15: Dev Accuracy: 0.9606533333333334
Epoch 15: Dev F_score: 0.0006772773450728074
Epoch 17: Dev Accuracy: 0.9606533333333334
Epoch 17: Dev F_score: 0.0006772773450728074
Epoch 19: Dev Accuracy: 0.9610266666666667
Epoch 19: Dev F_score: 0.020770519262981575


In [None]:
model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
loss_weight = torch.tensor([1 for i in range(len(genre_list))])
no_loss_weight_model, losses, accuracies = LSTM.train_model(loss, model, X_tr, Y_tr,loss_weight, 
                                        X_dev, Y_dev, num_its=20, status_frequency=2, 
                                        optim_args = {'lr':0.1}, param_file = 'lstm_no_loss_weight.params')

Epoch 1: Dev Accuracy: 0.96064
Epoch 1: Dev F_score: 0
Epoch 3: Dev Accuracy: 0.96036
Epoch 3: Dev F_score: 0.05887939221272554
Epoch 5: Dev Accuracy: 0.9606533333333334
Epoch 5: Dev F_score: 0.08097165991902834
Epoch 7: Dev Accuracy: 0.9612133333333334
Epoch 7: Dev F_score: 0.037073816617014234
Epoch 9: Dev Accuracy: 0.9637066666666667
Epoch 9: Dev F_score: 0.2599238716693855
Epoch 11: Dev Accuracy: 0.9651733333333333
Epoch 11: Dev F_score: 0.32401656314699795
Epoch 13: Dev Accuracy: 0.9655333333333334
Epoch 13: Dev F_score: 0.3434086868173737
Epoch 15: Dev Accuracy: 0.9662266666666667
Epoch 15: Dev F_score: 0.3722428748451053
Epoch 17: Dev Accuracy: 0.9671866666666666
Epoch 17: Dev F_score: 0.4085556356645037
Epoch 19: Dev Accuracy: 0.9674533333333334
Epoch 19: Dev F_score: 0.3980271270036992


In [11]:
def BCEloss_with_weight(output, target, w_p, w_n, genre_list):
#     loss = 
#     for i in range(output.size(dim=0)):
#         first_term = w_p[genre_list[i]] * target[i] * torch.log(output[i] + 1e-10)
#         second_term = w_n[genre_list[i]] * (1 - target[i]) + torch.log(1 - output[i] + 1e-10)
#         loss.append(first_term + second_term)
    loss_func = torch.nn.BCELoss(reduction = "none")
    first_term = target * w_p
    second_term = (1 - target) * w_n
    
    loss = loss_func(output, target)
    loss = (first_term + second_term) * loss
    return torch.mean(loss)

In [12]:
def calculate_loss_weights(genre_count, genre_list, num_data):
    genre_list = sorted(genre_list)
    positive_weights = [None] * len(genre_list)
    negative_weights = [None] * len(genre_list)
    
    i = 0
    for label in genre_list:
        positive_weights[i] = num_data / (2 * genre_count[label])
        negative_weights[i] = num_data / (2 * (num_data - genre_count[label]))
        i += 1
    return torch.tensor(positive_weights), torch.tensor(negative_weights)

In [13]:
def train_model(model, X_tr, Y_tr, w_p, w_n, genre_list, X_dv=None, Y_dv = None, num_its=50, status_frequency=10,
               optim_args = {'lr':0.1},
               param_file = 'best.params'):
    
    #initialize optimizer
    optimizer = optim.SGD(model.parameters(), **optim_args)
    
    losses=[]
    accuracies=[]
    
    for epoch in range(num_its):
        
        model.train()
        loss_value=0
        count1=0
        
        for X,Y in zip(X_tr,Y_tr):
            X_tr_var = Variable(torch.Tensor(X)).long()
            Y_tr_var = Variable(torch.from_numpy(Y))
            
            y_pred = model(X_tr_var)
            # set gradient to zero
            optimizer.zero_grad()
            
            output = BCEloss_with_weight(y_pred, Y_tr_var, w_p, w_n, genre_list)
            
            output.backward()
            optimizer.step()
            loss_value += output.item()
            count1+=1
            
            
        losses.append(loss_value/count1)
        
        # write parameters if this is the best epoch yet
        acc=0        
        if X_dv is not None and Y_dv is not None:
            acc=0
            index=0
            y_pred = np.zeros((Y_dv.shape[0],Y_dv.shape[1]))
            for Xdv, Ydv in zip(X_dv, Y_dv):
                
                X_dv_var = Variable(torch.Tensor(Xdv)).long()
                Y_dv_var = Variable(torch.from_numpy(Ydv))
                # run forward on dev data
                Y_hat = model(X_dv_var)
                
                # compute dev accuracy
                for i in range(Y_hat.size(dim=0)):
                    if Y_hat[i] >= 0.5:
                        Y_hat[i] = 1
                    else:
                        Y_hat[i] = 0
                y_pred[index] = Y_hat.tolist()
                index += 1
                # save
            acc = eval.accuracy(y_pred, Y_dv)
            f_score = eval.f_score(y_pred, Y_dv)
            if len(accuracies) == 0 or acc > max(accuracies):
                state = {'state_dict':model.state_dict(),
                         'epoch':len(accuracies)+1,
                         'accuracy':acc}
                torch.save(state,param_file)
            accuracies.append(acc)
        # print status message if desired
        if status_frequency > 0 and epoch % status_frequency == 0:
            print("Epoch "+str(epoch+1)+": Dev Accuracy: "+str(acc))
            print("Epoch "+str(epoch+1)+": Dev F_score: "+str(f_score))
    return model, losses, accuracies

In [14]:
import torch.optim as optim
 
model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
genre_list = sorted(genre_list)
w_p, w_n = calculate_loss_weights(genre_count, genre_list, len(X_tr))
complex_loss_weight_001_model, losses, accuracies = train_model(model, X_tr, Y_tr, w_p, w_n, genre_list,
                                        X_dev, Y_dev, num_its=20, status_frequency=2, 
                                        optim_args = {'lr':0.01}, param_file = 'lstm_complex_loss_weight.params')

Epoch 1: Dev Accuracy: 0.6150666666666667
Epoch 1: Dev F_score: 0.09453017187303976
Epoch 3: Dev Accuracy: 0.7319733333333334
Epoch 3: Dev F_score: 0.12887848847287225
Epoch 5: Dev Accuracy: 0.69116
Epoch 5: Dev F_score: 0.13061592163044702
Epoch 7: Dev Accuracy: 0.5692666666666667
Epoch 7: Dev F_score: 0.11631151352682112
Epoch 9: Dev Accuracy: 0.6109066666666667
Epoch 9: Dev F_score: 0.1299344066785927
Epoch 11: Dev Accuracy: 0.6203866666666666
Epoch 11: Dev F_score: 0.13674539886601375
Epoch 13: Dev Accuracy: 0.5720133333333334
Epoch 13: Dev F_score: 0.12724652655047716
Epoch 15: Dev Accuracy: 0.6259066666666666
Epoch 15: Dev F_score: 0.1412261638762205
Epoch 17: Dev Accuracy: 0.6493066666666667
Epoch 17: Dev F_score: 0.14537301793605406
Epoch 19: Dev Accuracy: 0.6231066666666667
Epoch 19: Dev F_score: 0.13294070734026564


In [14]:
import torch.optim as optim
 
model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
genre_list = sorted(genre_list)
w_p, w_n = calculate_loss_weights(genre_count, genre_list, len(X_tr))
complex_loss_weight_01_model, losses, accuracies = train_model(model, X_tr, Y_tr, w_p, w_n, genre_list,
                                        X_dev, Y_dev, num_its=50, status_frequency=2, 
                                        optim_args = {'lr':0.1}, param_file = 'lstm_complex_loss_weight.params')

Epoch 1: Dev Accuracy: 0.5843866666666667
Epoch 1: Dev F_score: 0.10543836992394892
Epoch 3: Dev Accuracy: 0.5592133333333333
Epoch 3: Dev F_score: 0.12252156602521568
Epoch 5: Dev Accuracy: 0.61288
Epoch 5: Dev F_score: 0.13537820131030376
Epoch 7: Dev Accuracy: 0.6896533333333333
Epoch 7: Dev F_score: 0.16459694207163877
Epoch 9: Dev Accuracy: 0.83664
Epoch 9: Dev F_score: 0.26103739445114593
Epoch 11: Dev Accuracy: 0.8728933333333333
Epoch 11: Dev F_score: 0.3091528371621132
Epoch 13: Dev Accuracy: 0.8956666666666667
Epoch 13: Dev F_score: 0.35261024240919997
Epoch 15: Dev Accuracy: 0.9205866666666667
Epoch 15: Dev F_score: 0.4064181781941399
Epoch 17: Dev Accuracy: 0.9283733333333334
Epoch 17: Dev F_score: 0.4291179596174282
Epoch 19: Dev Accuracy: 0.93516
Epoch 19: Dev F_score: 0.46187894212681196
Epoch 21: Dev Accuracy: 0.94608
Epoch 21: Dev F_score: 0.5013563501849568
Epoch 23: Dev Accuracy: 0.9471066666666667
Epoch 23: Dev F_score: 0.502071043052592
Epoch 25: Dev Accuracy: 0.95

In [15]:
import torch.optim as optim
 
model = LSTM.BiLSTM(len(word_to_index), len(genre_list), 100, 128, embeddings=embedding)
genre_list = sorted(genre_list)
w_p, w_n = calculate_loss_weights(genre_count, genre_list, len(X_tr))
complex_loss_weight_01_adam_model, losses, accuracies = train_model(model, X_tr, Y_tr, w_p, w_n, genre_list,
                                        X_dev, Y_dev, num_its=50, status_frequency=2, 
                                        optim_args = {'lr':0.1}, param_file = 'lstm_adam_complex_loss_weight.params')

Epoch 1: Dev Accuracy: 0.62884
Epoch 1: Dev F_score: 0.11061056263778397
Epoch 3: Dev Accuracy: 0.7149066666666667
Epoch 3: Dev F_score: 0.13649947500201923
Epoch 5: Dev Accuracy: 0.7619733333333333
Epoch 5: Dev F_score: 0.15313092979127135
Epoch 7: Dev Accuracy: 0.78772
Epoch 7: Dev F_score: 0.168138356235958
Epoch 9: Dev Accuracy: 0.8136133333333333
Epoch 9: Dev F_score: 0.18408918461448667
Epoch 11: Dev Accuracy: 0.8237733333333334
Epoch 11: Dev F_score: 0.19638839910013983
Epoch 13: Dev Accuracy: 0.8503466666666667
Epoch 13: Dev F_score: 0.21914567969945736
Epoch 15: Dev Accuracy: 0.8533333333333334
Epoch 15: Dev F_score: 0.2082913487836476
Epoch 17: Dev Accuracy: 0.8566133333333333
Epoch 17: Dev F_score: 0.2106576629477393
Epoch 19: Dev Accuracy: 0.86024
Epoch 19: Dev F_score: 0.22032133293662604
Epoch 21: Dev Accuracy: 0.8861866666666667
Epoch 21: Dev F_score: 0.2550183278058998
Epoch 23: Dev Accuracy: 0.8951333333333333
Epoch 23: Dev F_score: 0.2501668414529507
Epoch 25: Dev Acc

In [37]:
1 - torch.tensor([1,2])

tensor([ 0, -1])

In [13]:
print(len(word_to_index), len(genre_list), 100, 128)
from server import server

35439 75 100 128


ModuleNotFoundError: No module named 'server'

In [41]:
acc=0
count2=0
for Xdv, Ydv in zip(X_te, Y_te):

    X_dv_var = Variable(torch.Tensor(Xdv)).long()
    Y_dv_var = Variable(torch.from_numpy(Ydv))
    # run forward on dev data
    Y_hat = model(X_dv_var)

    # compute dev accuracy
    for i in range(Y_hat.size(dim=0)):
        if Y_hat[i] >= 0.5:
            Y_hat[i] = 1
        else:
            Y_hat[i] = 0
    acc += (Y_hat == Y_dv_var).float().sum()
    count2 += Y_hat.size(dim=0)
    # save
acc/=count2
print(acc)

tensor(0.9616)


In [47]:
genre_list = sorted(genre_list)
for i in range(10):
    output = model(Variable(torch.Tensor(X_te[i]).long()))
    for i in range(output.size(dim=0)):
        if output[i] >= 0.5:
            print(genre_list[i])
            
print(y_te[:10])

[['Drama' 'Historical' 'Romance' 'Slice of Life' nan nan nan nan nan nan
  nan]
 ['Action' 'Comedy' 'Fantasy' 'Shounen' nan nan nan nan nan nan nan]
 ['Comedy' nan nan nan nan nan nan nan nan nan nan]
 ['Drama' 'Sci-Fi' nan nan nan nan nan nan nan nan nan]
 ['Comedy' 'Slice of Life' nan nan nan nan nan nan nan nan nan]
 ['Action' 'Fantasy' 'Historical' 'Martial Arts' 'Supernatural' nan nan
  nan nan nan nan]
 ['Action' 'Adventure' 'Comedy' 'Fantasy' 'Kids' nan nan nan nan nan nan]
 ['Comedy' 'Romance' 'School' 'Shoujo' nan nan nan nan nan nan nan]
 ['Fantasy' 'Kids' 'Slice of Life' nan nan nan nan nan nan nan nan]
 ['Action' 'Adventure' 'Sci-Fi' nan nan nan nan nan nan nan nan]]


In [20]:
pred = torch.from_numpy(np.array([0.0,1.0,1.0,0.0]))
y = torch.from_numpy(np.array([0,1,1,1]))
(pred == y).float().sum()

tensor(3.)

In [13]:
print(X_tr.shape)
print(Y_tr.shape)

(8009, 387)
(8009, 75)


X [26622 26598 21407 18062 18754     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0 

In [47]:
output = model(Variable(torch.tensor(X_dev[375])))
genre_list = sorted(genre_list)
for i in range(output.size(dim=0)):
    if output[i] >= 0.5:
        print(genre_list[i])
print(y_dev[375])

Comedy
['Mecha' 'Parody' 'Slice of Life' nan nan nan nan nan nan nan nan]


NameError: name 'np' is not defined

# Logistic Regression with 1 label per synopsis

In [8]:
func_list = [preproc.clean_para, preproc.bag_of_words, preproc.remove_stop_words]
x_tr, y_tr = preproc.cleaning_data(train_data, func_list)
x_dev, y_dev = preproc.cleaning_data(dev_data, func_list)
x_te, y_te = preproc.cleaning_data(test_data, func_list)

In [9]:
vocab = nb.count_words(x_tr, y_tr)[1]
genre_list = nb.get_label_count(y_tr)[1]

model = torch.nn.Sequential(
            torch.nn.Linear(len(vocab), len(genre_list), bias=True),
        )
model.add_module('softmax',torch.nn.LogSoftmax(dim=1))
loss = torch.nn.NLLLoss()

In [10]:
X_tr = preproc.make_numpy_bag_of_word(x_tr, [1 for i in range(len(x_tr))], vocab)
X_tr_var = Variable(torch.from_numpy(X_tr.astype(np.float32)))

X_dev = preproc.make_numpy_bag_of_word(x_dev, [1 for i in range(len(x_dev))], vocab)
X_dev_var = Variable(torch.from_numpy(X_dev.astype(np.float32)))

X_te = preproc.make_numpy_bag_of_word(x_te, [1 for i in range(len(x_te))], vocab)
X_te_var = Variable(torch.from_numpy(X_te.astype(np.float32)))

In [11]:
Y_tr = lg.make_numpy_label([y_tr[:,0]], genre_list)
Y_dev = preproc.one_hot_encoding_label(y_dev, genre_list)
Y_te = preproc.one_hot_encoding_label(y_te, genre_list)

Y_tr_var = Variable(torch.from_numpy(Y_tr))
Y_dev_var = Variable(torch.from_numpy(Y_dev))
Y_te_var = Variable(torch.from_numpy(Y_te))

In [12]:
print(X_tr.shape, Y_tr.shape)

(8009, 34675) (8009,)


In [13]:
model_trained, losses, accuracies = lg.train_1_label_model(loss,model,
                                                       X_tr_var,
                                                       Y_tr_var,
                                                       X_dv_var=X_dev_var,
                                                       Y_dv_var = Y_dev_var,
                                                       num_its=200,
                                                       optim_args={'lr':0.01, 'momentum': 1})

Epoch 1: Dev Accuracy: 0.018
Epoch 11: Dev Accuracy: 0.34
Epoch 21: Dev Accuracy: 0.331
Epoch 31: Dev Accuracy: 0.378
Epoch 41: Dev Accuracy: 0.404
Epoch 51: Dev Accuracy: 0.42
Epoch 61: Dev Accuracy: 0.439
Epoch 71: Dev Accuracy: 0.449
Epoch 81: Dev Accuracy: 0.454
Epoch 91: Dev Accuracy: 0.455
Epoch 101: Dev Accuracy: 0.476
Epoch 111: Dev Accuracy: 0.505
Epoch 121: Dev Accuracy: 0.53
Epoch 131: Dev Accuracy: 0.528
Epoch 141: Dev Accuracy: 0.516
Epoch 151: Dev Accuracy: 0.53
Epoch 161: Dev Accuracy: 0.536
Epoch 171: Dev Accuracy: 0.529
Epoch 181: Dev Accuracy: 0.546
Epoch 191: Dev Accuracy: 0.548


In [22]:
Y_dev = lg.make_bi_numpy_label(y_dev, genre_list)
Y_dev_var = Variable(torch.from_numpy(Y_dev))
Y_hat = model.forward(X_dev_var).data
acc = lg.accuracy(Y_hat.data.numpy(),Y_dev_var.data.numpy(), np.log(0.4))
print(acc)

0.098


In [21]:
model = torch.nn.Sequential(
            torch.nn.Linear(len(vocab), len(genre_list), bias=True),
        )
model.add_module('softmax',torch.nn.LogSoftmax(dim=1))
loss = torch.nn.NLLLoss()


model_trained, losses, accuracies = lg.train_model(loss,model,
                                                   X_tr_var,
                                                   Y_tr_var,
                                                   X_dv_var=X_dev_var,
                                                   Y_dv_var = Y_dev_var,
                                                   num_its=200,
                                                   threshold=np.log(0.2),
                                                   optim_args={'lr':0.01, 'momentum': 1})

Epoch 1: Dev Accuracy: 0.0
Epoch 11: Dev Accuracy: 0.0


KeyboardInterrupt: 