In [1]:
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy, os, csv
from tqdm import tqdm
import statistics
from collections import defaultdict as dd
from sklearn.linear_model import LogisticRegression  
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score 
from sklearn.preprocessing import MinMaxScaler
from typing import List

In [2]:
#Construct an RNN network.
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import time
import math

In [3]:
#Note: Work in progress: do not run code yet!

In [4]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
#Read into a dataframe.

def load_data_set(path: str):
    """
    loads data set located at path and returns as pandas data frame
    """
    with open(path) as file:
        data = json.load(file)
    
    print(f"loaded {len(data)} instances")
    data = pd.json_normalize(data)  #Normalize semi-structured JSON data into a flat table.
    return data

In [6]:
# Feature transformations

def filter_authors(authors: List[int], prolifics=True):
    
    if prolifics:
        prolifics = filter(lambda x: x < 100, authors)
        return list(prolifics)
    
    else:
        coauthors = filter(lambda x: x>=100, authors)
        return list(coauthors)
    
    
def text_to_vector(text: List[int]):
    """
    Converts text to sparse matrix representation
    text: List of integers between 1, 4999
    """
    
    word_vec = np.zeros(5000, dtype=int)
    
    for word in text:
        word_vec[word] += 1 
    
    return word_vec

def statistics_feature(lst: List[int], length=True):
    if length:
        return len(lst)
    else:
        if len(lst) <= 1:
            return 0
        else:
            return statistics.stdev(lst)

def pre_processing(df, train=True):
    
    if train:
        df["target authors"] = df["authors"].apply(lambda x: filter_authors(x))
        df["coauthors"]      = df["authors"].apply(lambda x: filter_authors(x, prolifics=False))
        df = df.drop(["authors"], axis=1)
    
    ########### preprocessing for text
    df["combined text"] = df["abstract"]# + df["title"] #+ df["abstract"]
    df["word_mean"] = df["combined text"].apply(lambda x: np.mean(x))  
    df["word_count"] = df["combined text"].apply(lambda x: statistics_feature(x))
    df["word_spread"] = df["combined text"].apply(lambda x: statistics_feature(x, length=False))
    #df = df.drop(["combined text"], axis=1)
    ###########
    
    df["abstract"] = df["abstract"].apply(lambda x: text_to_vector(x))
    df["title"]    = df["title"].apply(lambda x: text_to_vector(x))
    df["text"]     = df["title"] + df["abstract"]
    text_df = pd.DataFrame(df.text.tolist(), index=df.index)
    
    #preprocessing for venue. We use minmax scaling as a matter of best-practice. 
    # as we require all rows to have integer values, we give blank venues a dummy value of 465
    scalar = MinMaxScaler()
    df.loc[df.venue == "", "venue"] = 465
    df["venue"] = scalar.fit_transform(df["venue"].to_numpy().reshape(-1, 1))
    ##Repeat this for the word_mean, count and spread
    df["word_mean"] = scalar.fit_transform(df["word_mean"].to_numpy().reshape(-1, 1))
    df["word_spread"] = scalar.fit_transform(df["word_spread"].to_numpy().reshape(-1, 1))
    df["word_count"] = scalar.fit_transform(df["word_count"].to_numpy().reshape(-1, 1))
    
    # prepocessing for coauthors
    # we use a discretised binning strategy, with n=10 bins by default. 
    df["coauthors"] = df["coauthors"].apply(lambda x: build_bins(x, n_bins=10))
    coauth_df = pd.DataFrame(df.coauthors.tolist(), index=df.index)
    
    # dropping irrelevant columns & concat with 5000-column text_df
    df = df.drop(["abstract", "title", "text", "year", "coauthors"], axis=1)
    #df = df.drop(["year", "coauthors"], axis=1) #So we use either the abstract, title or text for the encoding.
    
    df = pd.concat([df, text_df, coauth_df], axis=1)
    
    # and drop row identifier if test set
    if not train:
        df = df.drop(["identifier"], axis=1)

    return df
    

def build_bins(coauthors: List[int], n_bins=10):
    """
    takes a list of coauthors and returns 10-column data frame
    
    This might be some of the uggliest code I have ever written, though
    sklearn's discrete bins didn't really give what I wanted
    """
    width = np.ceil(21246/n_bins)
    bins  = np.zeros(n_bins)
    for author in coauthors:
        i = 0
        while not (max(0,(i-1))*width <= author <= i*width):
            i += 1
        bins[i-1] += 1
    return bins

In [7]:
#tst = [0, 37, 38, 16, 22, 17, 30, 19, 15, 18, 17, 0, 16, 17, 30, 16, 41, 15, 26, 26, 16, 33, 34, 18, 1, 34, 30, 0, 18, 0, 18, 16, 0, 15, 32, 15, 49, 15, 24, 17, 16, 34, 39, 0, 0, 46, 17, 19, 18, 17, 18, 16, 28, 15, 19, 20, 22, 17, 41, 0, 0, 25, 
#16, 21, 18, 15, 21, 22, 17, 30, 15, 19, 17, 27, 15, 17, 28, 0, 0, 44, 15, 21, 17, 21, 15, 19, 17, 18, 41, 19, 0, 38, 17, 
#18, 17, 24, 19, 15, 19, 15, 15, 36, 24, 18, 0, 0, 17, 16, 27, 21, 18, 25, 15, 0, 16, 0, 41, 15, 17, 32, 28, 0, 16, 16, 16, 
#15, 27, 27, 15, 17, 17, 28, 0]
#len(tst) #133
#print(statistics.mean([1,3,4,5,101]))
#int(statistics.mean([1,3,4,5,101]))
#7%5
#lst1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
#lst1[(len(lst1)-len(lst1)%5):]

In [8]:
#Construct tensor vectors for authors (based on Week 9 Workshop code)
all_categories = [0, 1]

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size) # changed input
        self.activation = nn.Tanh() # new
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.activation(self.i2h(combined)) # changed to use activation
        output = self.h2o(hidden) # changed input
        output = self.softmax(output)
        return output, hidden
    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

def simplify_num(lst, type_col = "abstract"):
    #Round to nearest 100.
    lst1 = []
    #for item in lst:
    for i in range(len(lst)):
        temp = int(round(lst[i], -2)/100)
        if temp == 50:
            lst[i] = 49
        else:
            lst[i] = temp
            
    #Depending on the type of the column make reductions.
    if type_col == "title":
        for item in lst:
            lst1.append(item)
        return lst1
    
    elif type_col != "title":
        lst_len = len(lst)
        for j in range(len(lst)):
            if (1+j) % 5 == 0: #if (1+j) % 5 == 0:
                lst1.append(int(statistics.mean(lst[j-4:j]))) 
                #lst1.append(int(statistics.mean(lst[j-4:j])))
                
        #Append average of any remaining values
        if lst_len % 5 != 0: #if lst_len % 5 != 0:
            lst1.append(int(statistics.mean(lst[(lst_len-(lst_len%5)):])))
            #lst1.append(int(statistics.mean(lst[(lst_len-(lst_len%5)):])))
        return lst1 
    
    return lst

######From the Week 9 Tutorial:               
def lineToTensor(lst_init): #, index_lst: int):
    n_letters = 50
    #lst_init = df.loc[index_lst, 'combined text']
    #print("lst before", lst_init)
    lst = simplify_num(lst_init)
    #print("lst", lst)
    tensor = torch.zeros(len(lst), 1, n_letters)
    for li, num in enumerate(lst):
        #print("li =", li, "num =", num)
        tensor[li][0][num] = 1
    return tensor

###### For testing the model.

def constructSample(df, index_lst: int):
    
    n_val = 50
    #print(df.head())
    #print(df.loc[index_lst, 'test_text'])
    lst_init = df.loc[index_lst, 'combined text']
    lst = simplify_num(lst_init)
    
    label = df.loc[index_lst, 'label']
    #print("Before initialisation")
    #Initiate the tensor.
    #for li, letter in 
    tensor = torch.zeros(len(lst), 1, n_val)
    #print("After initialisation")
    
    #Construct the category tensor.
    #print("all_categories", all_categories)
    #print("label", label)
    label_tensor = torch.tensor([all_categories.index(label)], dtype=torch.long)
    #if index_lst < 5:
    #    print(lst)
    #    print("label_tensor", label_tensor)
    for i, num in enumerate(lst):
        
        tensor[i][0][num] = 1
        
    return label, lst, label_tensor, tensor

In [9]:
#all_categories = [0,1]

def evaluate_RNN(rnn, line_tensor):
    ##Adapted from "NLP From Scratch".
    hidden = rnn.initHidden()
    
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
        
    return output

##use this function on User Input.
def predict_output(rnn, input_line, Id, n_predictions = 2):
    
    n_val = 50
    all_categories = [0,1]
    
    with torch.no_grad():
        #Obtain the tensor representation of the input.
        #using the lineToTensor function from the Week 9 Tutorial:
        output = evaluate_RNN(rnn, lineToTensor(input_line))#, Id))
        
        #Get top 2 categories (only 2 categories here):
        #Get the top N categories.
        #top_k, top_ind = output.topk(n_predictions, 1, True)
        
        top_1, top_ind = output.topk(1)
        category_index = top_ind[0].item()
        
        guess, guess_i = all_categories[category_index], category_index
        #for i in range(n_predictions):
        #    value = top_k[0][i].item()
        #    category_index = top_ind[0][i].item()
        #    predictions.append([value, all_categories[category_index]])
            
    return guess #predictions[0][1]

In [10]:
#Now train the data.
# training
'''
def train_RNN_classifier(category_tensor, lst_tensor):
    ###################use the tensor function here. Work in progress...
    ####Adapted from Week 9 Tutorial
    n_hidden = 32
    n_instances = len(df)
    n_val = 50
    learning_rate = 0.005
    no_catgories = 2
    
    rnn = RNNClassifier(n_val, n_hidden, no_catgories)
    criterion = nn.NLLLoss()
    hidden = rnn.initHidden()
    rnn.zero_grad()
    
    for i in range(lst_tensor.size()[0]):
        output, hidden = rnn(lst_tensor[i], hidden)
        
    loss = criterion(output, category_tensor)
    loss.backward()
    
    #Add parameters' gradients to their values, multiplied by the learning rate.
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
        
    return output, loss.item()


def run_training_function(author: int, df, n_instances=5000):
    
    # Set up copy and construct the label for RNN training.
    #print("Copy 1")
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)   
    df = df.drop(['target authors'], axis=1)
    #print("Copy 2")
    
    #Trains the RNN function: adapted from Week 9 Tutorial.
    
    print_every=2000
    correct = 0
    #set n_iters as 5000 first.
    n_instances = len(df)
    current_loss = 0
    
    for i in tqdm(range(n_instances)):
        
        category, lst, category_tensor, lst_tensor = constructSample(df, i)
        
        ##Then start the training process.
        
        output, loss = train_RNN_classifier(category_tensor, lst_tensor)
        current_loss += loss
        #output, h_n = nn.RNN(category_tensor)
        
        #Get the top category. The "CategoryFromOutput" function from the Week 9 lab is included here
        top_1, top_ind = output.topk(1)
        category_index = top_ind[0].item()
        #print("category_index", category_index)
        ##make a guess
        guess, guess_i = all_categories[category_index], category_index
            
        if guess == category: 
            correct += 1
            
        #Print every:
        if i % print_every == 0: 
            print('%d %s %s' % (i, guess, category))
    #Examine the accuracy.
    df = df.drop(['label'], axis=1)
    df = df.drop(['combined text'], axis = 1)
    print("Accuracy is: %.4f" %(correct/n_instances))
'''
            

def train_classifier(author: int, df: pd.DataFrame, debug=False):
    """
    Trains a classifier for author i. Assumes text-vectorisaiton has occured.
    
    Model Features:
    text vectorisation
    """
    # create copy and set up label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)
    
    df = df.drop(['target authors', 'text', 'abstract', 'title'], axis=1)  ##This provides the input for the category tensor!
    #############ideally now drop text, abstract and text 
    
    # split up positive and negative instances so as to ensure a balanced training set 
    # if we don't do this, we end up with a very imbalanced training set 
    # however, if we don't include enough negative samples, we tend to "overclassify". 
    # we can tune out performance with the below 'neg sample factor'
    
    neg_sample_factor = 10
    
    pos = df[df['label'] == 1] 
    neg = df[df['label'] == 0]
    
    n_pos_samples = pos.shape[0]
    n_tot_samples = df.shape[0]
    
    # takes a sample of the negative instances to train on
    neg = neg.sample(frac=neg_sample_factor*(n_pos_samples/n_tot_samples), random_state=51) 
    
    if debug:
        print(f"training on {pos.shape[0]} postitive instances")
        print(f"training on {neg.shape[0]} negative  instances")
    
    df = pd.concat([pos, neg])
    X_train = df.loc[:, df.columns != "label"]
    y_train = df["label"]
    
    if debug:
        print(f"training on {X_train.shape[0]} instances")
    
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    
    if debug:
        y_train_pred = clf.predict(X_train) 
        acc = accuracy_score(y_train, y_train_pred) 
        f1  = f1_score(y_train, y_train_pred)
        print(f"Accuracy: {acc}")
        print(f"f1 score: {f1}")

    return clf

In [11]:
########################################################################## validation output
'''
def train_RNN_classifier(category_tensor, lst_tensor, rnn):
    ###################use the tensor function here. Work in progress...
    ####Adapted from Week 9 Tutorial
    n_instances = len(df)
    learning_rate = 0.005
    hidden = rnn.initHidden()
    rnn.zero_grad()
    
    for i in range(lst_tensor.size()[0]):
        output, hidden = rnn(lst_tensor[i], hidden)
        
    loss = criterion(output, category_tensor)
    loss.backward()
    
    #Add parameters' gradients to their values, multiplied by the learning rate.
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
        
    return output, loss.item()
'''


def run_training_function(author: int, df, n_instances=1000):
    

    n_hidden = 32
    n_val = 50
    no_catgories = 2
    rnn = RNNClassifier(n_val, n_hidden, no_catgories)
    criterion = nn.NLLLoss()
    
    # Set up copy and construct the label for RNN training.
    #print(df.head())
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)  
    #df = df.drop(['target authors'], axis=1)
    
    #Trains the RNN function: adapted from Week 9 Tutorial.
    
    print_every=2000
    correct = 0
    #n_instances = len(df)
    current_loss = 0
    
    def train_RNN_classifier(category_tensor, lst_tensor):
        ###################use the tensor function here. Work in progress...
        ####Adapted from Week 9 Tutorial
        n_instances = len(df)
        learning_rate = 0.005
        hidden = rnn.initHidden()
        rnn.zero_grad()
    
        for i in range(lst_tensor.size()[0]):
            output, hidden = rnn(lst_tensor[i], hidden)
        
        loss = criterion(output, category_tensor)
        loss.backward()
    
        #Add parameters' gradients to their values, multiplied by the learning rate.
        for p in rnn.parameters():
            p.data.add_(p.grad.data, alpha=-learning_rate)
        
        return output, loss.item()
    
    for i in range(n_instances):
        
        category, lst, category_tensor, lst_tensor = constructSample(df, i)
        
        ##Then start the training process.
        
        output, loss = train_RNN_classifier(category_tensor, lst_tensor)
        current_loss += loss
        #output, h_n = nn.RNN(category_tensor)
        
        #Get the top category. The "CategoryFromOutput" function from the Week 9 lab is included here
        top_1, top_ind = output.topk(1)
        category_index = top_ind[0].item()
        #print("category_index", category_index)
        ##make a guess
        guess, guess_i = all_categories[category_index], category_index
            
        if guess == category: 
            correct += 1
            
        #Print every:
        #if i % print_every == 0: 
        #    print('%d %s %s' % (i, guess, category))
    #Examine the accuracy.
    df = df.drop(['label'], axis=1)
    #df = df.drop(['combined text'], axis = 1)
    #print("Accuracy is: %.4f" %(correct/n_instances))
    return rnn
    
########################################################################## validation output
def test_RNN_model(rnn_lst, test_df):
    #Repeat parts of the test function.
    
    if os.path.exists("predictionsRNN.csv"):
        os.remove("predictionsRNN.csv")
        print("removed previous RNN predictions")
    
    
    with open("predictionsRNN.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Id','Predicted']
        writer.writerow(header)
        
        
        X_test = test_df 
        n      = X_test.shape[0]
        #No "label" component, as this is the test dataset.
        for Id in tqdm(range(n)):
            #Obtain the relevant lst value under "combined text" combined text
            lst = test_df.loc[Id, "combined text"]
            row = [Id]
            authors = []
            #Construct the model.
            for author, rnn in enumerate(rnn_lst): 
                output = predict_output(rnn, lst, Id)
                if output == 1:
                    authors.append(author)
            if len(authors) == 0: row.append(-1)
            else: row += authors
            
            writer.writerow(row)

In [12]:
#Model validation
def validate_model(author: int, df: pd.DataFrame, classifier):
    # simple function to assess model performance
    
    # create copy and set up label
    df = df.copy(deep=True)
    df["label"] = df["target authors"].apply(lambda x: 1 if author in x else 0)   
    df = df.drop(['target authors'], axis=1)
    
    # split up positive and negative instances so as to ensure a balanced training set 
    # if we don't do this, we end up with a very imbalanced training set 
    pos = df[df['label'] == 1] 
    neg = df[df['label'] == 0]
    
    # takes a sample of the instances to test on
    pos = pos.sample(frac=(1/2))
    neg = neg.sample(frac=(1/10))
    
    # recombine 
    df = pd.concat([pos, neg])
    X_test = df.loc[:, df.columns != "label"]
    y_test = df["label"]
    
    # perform predictions (from text only)
    y_pred = classifier.predict(X_test)
    
    acc = accuracy_score(y_pred, y_test) 
    f1  = f1_score(y_pred, y_test)#, average="samples") 
    ##average="samples" is used by the leaderboard (from discussion question #184) but ValueError: Samplewise metrics are not available outside of multilabel classification. 
    #print(f"Model {author}")
    #print(f"Accuracy: {acc}")
    #print(f"f1 score: {f1}")
    return [acc, f1]

In [13]:
def write_validation_performance(file_name):
    """
    Function for writing validations to output file.
    WARNING: Deletes file_name.csv if already present in working directory.
    """
    if os.path.exists(file_name):
        os.remove(file_name)
        print("removed previous validations")
    
    with open(file_name, mode='w') as f0:
        write_valid = csv.writer(f0)
        header = ['Id', 'Accuracy', 'F1']
        write_valid.writerow(header)
        
        #Loop over each author.
        authors = np.arange(0, 100)
        models  = []
        sum_accuracy = 0
        sum_F1 = 0
        
        for i in tqdm(authors):
            model = train_classifier(i, df, debug=False)
            models.append(model)
            #Check for performance. Separate function.
            return_val = validate_model(i, df, model)
            sum_F1 += return_val[1]
            sum_accuracy += return_val[0]
            write_valid.writerow([i, return_val[0], return_val[1]])
            
        write_valid.writerow(["aver", sum_accuracy/len(authors), sum_F1/len(authors)])
        print(f"Accuracy: {sum_accuracy/len(authors)}")
        print(f"f1 score: {sum_F1/len(authors)}")

    return models


In [14]:
def make_predictions(test_df: pd.DataFrame, models):
    """
    function for writing predictions to output file. 
    WARNING: Deletes predictions.csv if present in working directory
    """
    if os.path.exists("predictions.csv"):
        os.remove("predictions.csv")
        print("removed previous predictions")
    
    
    with open("predictions.csv", mode='w') as f:    
        writer = csv.writer(f)
        
        header = ['Id','Predicted']
        writer.writerow(header)
        
        
        X_test = test_df 
        n      = X_test.shape[0]
        
        # loop over each training sample and write to necessary format
        for Id in tqdm(range(n)):
            x   = np.array(X_test.iloc[Id]).reshape(1, -1)
            row = [Id]
            authors = []
            #Now iterate through each of the models.
            for author, model in enumerate(models):
                if np.array(model.predict(x)).item() == 1:
                    authors.append(author)

            # to match the output requirement 
            if len(authors) == 0: row.append(-1)
            else: row += authors
            
            writer.writerow(row)
    return

In [15]:
#def make_RNN_predictions(test_df: pd.DataFrame, models):
    

In [16]:
#Read into a data frame
data_f = pd.read_json("train.json")

#Training and validating data
path = "train.json"
df = load_data_set(path)
df = pre_processing(df)

#file_name = "validation.csv"
#models = write_validation_performance(file_name)

#Building Predictions
#path = "test.json"
#df_test = load_data_set(path)
#df_test = pre_processing(df_test, train=False)

#make_predictions(df_test, models)

loaded 25793 instances


In [17]:
#Testing RNN:
#print("Testing RNN")
#author_id = 42 
#run_training_function(author_id, df)

#Now for the testing component
#make_RNN_predictions(test_df: pd.DataFrame, models)

In [18]:
#Now for the testing component.
path = "test.json"
df_test = load_data_set(path)
df_test = pre_processing(df_test, train=False)

rnn_lst = []
##Obtain the RNN objects:

#authors = np.arange(0, 100)
authors = np.arange(0, 1)
for i in tqdm(authors):
    rnn_lst.append(run_training_function(i, df))

test_RNN_model(rnn_lst, df_test)

loaded 800 instances


100%|██████████| 1/1 [00:08<00:00,  8.71s/it]


removed previous RNN predictions


100%|██████████| 800/800 [00:03<00:00, 256.75it/s]


In [19]:
df.head()

Unnamed: 0,venue,target authors,combined text,word_mean,word_count,word_spread,0,1,2,3,...,1.1,2.1,3.1,4,5,6,7,8,9,label
0,0.043011,"[42, 36]","[25, 19, 23, 15, 18, 19, 20, 29, 15, 19, 21, 1...",0.838297,0.026862,0.472364,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,0.004301,[45],"[0, 15, 17, 24, 15, 36, 22, 19, 16, 15, 15, 20...",0.634257,0.035458,0.598874,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,1.0,[],"[0, 15, 17, 24, 15, 26, 15, 15, 20, 20, 17, 22...",0.623812,0.028653,0.323598,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
3,0.008602,[97],"[0, 16, 15, 1, 17, 16, 0, 0, 34, 15, 27, 15, 1...",0.775345,0.032593,0.693761,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.019355,[2],"[0, 37, 38, 16, 22, 17, 30, 19, 15, 18, 17, 0,...",0.740513,0.043338,0.450081,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [20]:
#Original examples of the abstract include:

#[25, 19, 23, 15, 18, 19, 20, 29, 15, 19, 21, 15, 26, 42, 0, 26, 15, 18, 16, 29, 36, 19, 17, 20, 37, 15, 16, 15, 0, 
#44, 15, 15, 20, 20, 15, 0, 17, 15, 15, 16, 15, 16, 47, 0, 17, 42, 26, 16, 17, 16, 25, 15, 33, 16, 44, 22, 15, 16,
#38, 25, 15, 19, 17, 25, 46, 15, 16, 38, 25, 15, 34, 0, 0, 17, 16, 42, 16, 15, 33, 16, 44, 18, 47, 19, 21, 24, 0]
#label_tensor tensor([1])

#[0, 15, 17, 24, 15, 36, 22, 19, 16, 15, 15, 20, 17, 29, 15, 22, 26, 1, 20, 46, 0, 0, 0, 0, 15, 15, 0, 35, 22, 17, 
#0, 16, 1, 46, 22, 15, 22, 20, 19, 29, 15, 22, 0, 29, 20, 15, 25, 0, 37, 42, 15, 41, 40, 40, 15, 16, 15, 17, 0, 15, 
#17, 35, 22, 22, 15, 15, 15, 0, 15, 16, 22, 17, 22, 0, 0, 0, 16, 16, 15, 28, 15, 15, 0, 0, 35, 0, 20, 17, 48, 30, 15, 
#25, 15, 43, 0, 0, 0, 0, 15, 0, 0, 24, 15, 15, 48, 0, 21, 0, 21, 18, 0]
#label_tensor tensor([0])

#[0, 15, 17, 24, 15, 26, 15, 15, 20, 20, 17, 22, 18, 15, 16, 16, 0, 0, 16, 17, 0, 0, 17, 18, 23, 19, 16, 19, 15,   
#15, 25, 24, 15, 19, 15, 15, 38, 0, 22, 17, 15, 19, 20, 21, 15, 21, 17, 26, 0, 0, 15, 17, 16, 22, 17, 15, 18, 23,  
#17, 0, 20, 0, 21, 16, 16, 0, 0, 21, 39, 16, 36, 33, 49, 0, 15, 16, 36, 18, 16, 19, 16, 17, 0, 17, 18, 34, 17, 16,
#17, 19, 28, 0]
#label_tensor tensor([0])

#[0, 16, 15, 1, 17, 16, 0, 0, 34, 15, 27, 15, 18, 32, 15, 36, 15, 16, 20, 0, 25, 48, 17, 32, 17, 19, 24, 0, 19, 15,
#30, 27, 0, 0, 16, 15, 0, 42, 0, 0, 18, 43, 28, 41, 31, 17, 0, 31, 15, 27, 18, 32, 0, 0, 19, 20, 15, 29, 17, 15, 26, 
#43, 15, 19, 18, 0, 42, 0, 17, 31, 33, 44, 48, 48, 20, 0, 31, 26, 27, 15, 47, 35, 0, 0, 49, 16, 15, 0, 45, 26, 15, 34, 
#20, 15, 0, 30, 19, 15, 18, 15, 20, 22, 0]
#label_tensor tensor([0])

#[0, 37, 38, 16, 22, 17, 30, 19, 15, 18, 17, 0, 16, 17, 30, 16, 41, 15, 26, 26, 16, 33, 34, 18, 1, 34, 30, 0, 18, 0, 18, 
#16, 0, 15, 32, 15, 49, 15, 24, 17, 16, 34, 39, 0, 0, 46, 17, 19, 18, 17, 18, 16, 28, 15, 19, 20, 22, 17, 41, 0, 0, 25, 
#16, 21, 18, 15, 21, 22, 17, 30, 15, 19, 17, 27, 15, 17, 28, 0, 0, 44, 15, 21, 17, 21, 15, 19, 17, 18, 41, 19, 0, 38, 17, 
#18, 17, 24, 19, 15, 19, 15, 15, 36, 24, 18, 0, 0, 17, 16, 27, 21, 18, 25, 15, 0, 16, 0, 41, 15, 17, 32, 28, 0, 16, 16, 16, 
#15, 27, 27, 15, 17, 17, 28, 0]

In [21]:
#len(df)