In [122]:
import os
import torch
from torchvision.datasets.utils import download_url
from torchtext import data
from torchtext import datasets
import pandas as pd
import numpy as np

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline
import time
from datetime import datetime
import random
from sklearn.metrics import f1_score

In [123]:
import models

In [124]:
SEED = 1
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = False

### Load data

In [125]:
# OR load full data, if produced. 435 MB. NOT in Github currently. 
# You can produce this in about 30 min with preprocess_data.ipynb
reuters = pd.read_pickle('input/reuters_all.pkl')

In [126]:
print(len(reuters))
reuters[0:2]

299773


Unnamed: 0,codes,headline,text
0,"[C18, C181, CCAT]",Eureko is latest suitor for French insurer GAN.,"\nEureko, an alliance of six European financia..."
1,"[G15, GCAT]",Reuter EC Report Long-Term Diary for July 28 -...,\n****\nHIGHLIGHTS\n****\nLUXEMBOURG - Luxembo...


In [127]:
# read classcodes
classcodes= pd.read_csv('input/classcodes.csv')
print(len(classcodes))
#classcodes[0:12]

# add index field to DataFrame
classcodes = classcodes.reset_index()
# Create dictionary index/int to classcode and classcode to int
itocode = dict(zip(classcodes.index, classcodes.Code))
codetoi = dict(zip(classcodes.Code, classcodes.index))
def listToInt(mylist):
    return [codetoi[item] for item in mylist]

reuters['codes'] = [listToInt(codelist) for codelist in reuters.codes]
reuters[0:3]
# Multihot, for single list - one row
def multihot(tags):
    return [1 if tag in tags else 0 for tag in taglist]

# list of classes, 126 int: [0...125]
taglist = list(classcodes.index)
Y_hot = [multihot(claslist) for claslist in reuters.codes]
reuters['codes'] = Y_hot

126


In [128]:
print(len(reuters))
reuters[0:2]

299773


Unnamed: 0,codes,headline,text
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Eureko is latest suitor for French insurer GAN.,"\nEureko, an alliance of six European financia..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Reuter EC Report Long-Term Diary for July 28 -...,\n****\nHIGHLIGHTS\n****\nLUXEMBOURG - Luxembo...


### Using DataSet


Example in:
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/A%20-%20Using%20TorchText%20with%20Your%20Own%20Datasets.ipynb

In [129]:
#Define the Fields
TEXT = data.Field()
HEADLINE = data.Field()
LABELS = data.LabelField(sequential=False, use_vocab=False)

In [130]:
# Put data in random order
SEED = 1
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
idx = np.random.permutation(len(reuters))
reuters = reuters.iloc[idx]

In [131]:
# split it
size = len(reuters)
train_size = int(0.7*size)
test_size = int(0.85*size)
# val is rest

train = reuters[0: train_size]
val = reuters[train_size : test_size]
test = reuters[test_size : size]

train.to_json('input/train.json', orient='records', lines=True)
val.to_json('input/val.json', orient='records', lines=True)
test.to_json('input/test.json', orient='records', lines=True)

### For final training only: use all training data
dev/test sets are now part of same training data, so they will show too high score.
Stop training at same nr epochs found good with separete train/dev/set.

In [138]:
# For final training only, use all data
# split it
size = len(reuters)

# After finding model, higher training data, run same nr of epochs
#train_size = int(0.95*size)
train_size = size # use all data, will overfit dev set but its ok
test_size = int(0.97*size)
# val is rest

train = reuters[0: train_size]
#val = reuters[train_size : test_size]
val = reuters[test_size : size]
test = reuters[test_size : size] # same as val, not needed

train.to_json('input/train.json', orient='records', lines=True)
val.to_json('input/val.json', orient='records', lines=True)
test.to_json('input/test.json', orient='records', lines=True)

In [139]:
# Tell torchText which Fields to apply to which json elements
fields = {'headline': ('h', HEADLINE), 'text': ('t', TEXT), 'codes': ('l', LABELS)}
fields

{'headline': ('h', <torchtext.data.field.Field at 0x7f135bb027b8>),
 'text': ('t', <torchtext.data.field.Field at 0x7f135bb027f0>),
 'codes': ('l', <torchtext.data.field.LabelField at 0x7f135bb026d8>)}

In [140]:
# Create dataset (TabularDataset)
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = 'input',
                                        train = 'train.json',
                                        validation = 'val.json',
                                        test = 'test.json',
                                        format = 'json',
                                        fields = fields
)

In [141]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    device=device,
    sort_key= lambda x: len(x.t)
)

In [142]:
# GLOVE
TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.50d")
HEADLINE.build_vocab(train)
LABELS.build_vocab(train)

pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([25002, 50])


### Print some info

In [143]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
print('------------')
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABELS vocabulary: {len(LABELS.vocab)}")

Number of training examples: 299773
Number of validation examples: 8994
Number of testing examples: 8994
------------
Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABELS vocabulary: 3


In [144]:
print(TEXT.vocab.freqs.most_common(10))
print('-----')
print(TEXT.vocab.itos[:10])
print(LABELS.vocab.stoi)

[('the', 3112856), ('of', 1664693), ('to', 1661936), ('in', 1321409), ('a', 1238399), ('and', 1238227), ('on', 762926), ('said', 591160), ('for', 579301), ('The', 491337)]
-----
['<unk>', '<pad>', 'the', 'of', 'to', 'in', 'a', 'and', 'on', 'said']
defaultdict(<function _default_unk_index at 0x7f14b7dc5d08>, {'codes': 0, 'headline': 1, 'text': 2})


### F1 accuracy

The results will be ranked according to the highest micro-averaged F1 score. 
This will be calculated using the f1_score function found in scikit-learn, using a command like 
f1_score(y_true, y_pred, average='micro') where y_true is the matrix with the ground truth, and y_pred 
the predicted output. Both matrices are binary, a 1 in row i and column j means that the image/document
i contains the label j.

Scikit:  Micro-average in F1-score
 
'micro':
    Calculate metrics globally by counting the total true positives, false negatives and false positives.

In [146]:
# f1 score for BATCH
from sklearn.metrics import f1_score
def f1_accuracy(preds, y):
    """
    Returns f1 accuracy from sklearn
    """
    #round predictions to the closest integer
    #rounded_preds = torch.round(torch.sigmoid(preds))
    rounded_preds = torch.round(preds)
    
    preds_cpu = rounded_preds.cpu().data.numpy()
    y_cpu = y.cpu().data.numpy()
    f1 = f1_score(y_cpu, preds_cpu, average='micro')
    return f1 

In [147]:
def f1_own_accuracy(preds, y):
    '''for micro-average
    Returns counts of true_pos, false_pos and false_negative.
    For counting precision, recall and F1 globally
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    '''
    #round predictions to the closest integer
    #rounded_preds = torch.round(torch.sigmoid(preds))
    rounded_preds = torch.round(preds)
    
    preds = rounded_preds.cpu().data.numpy()
    y = y.cpu().data.numpy()
        
    # True positive
    tpos = np.sum(np.logical_and(preds == 1, y == 1))
 
    # True negative
    #tneg = np.sum(np.logical_and(preds == 0, y == 0))
 
    # False positive
    fpos = np.sum(np.logical_and(preds == 1, y == 0))
 
    # False negative
    fneg = np.sum(np.logical_and(preds == 0, y == 1))

    return tpos, fpos, fneg

In [148]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    #round predictions to the closest integer
    #rounded_preds = torch.round(torch.sigmoid(preds))
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [149]:
# F1 version
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    
    epoch_tpos = 0
    epoch_fpos = 0
    epoch_fneg = 0
    
    model.train()
    
    for batch in iterator:
        if batch.t.shape[1] != BATCH_SIZE:
              continue
        
        optimizer.zero_grad()
                
        predictions = model(torch.cat((batch.t, batch.h))).squeeze(1)
        
        loss = criterion(predictions, batch.l.float())
        
        tpos, fpos, fneg = f1_own_accuracy(predictions, batch.l.float())
        epoch_tpos += tpos
        epoch_fpos += fpos
        epoch_fneg += fneg
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        #epoch_acc += acc.item()
    
    # Counted f1-score is Micro-average version
    # avoid div by zero with epsilon. 
    # F1 for 0 - no labels is not defined, but here we give it score 1
    eps = 1e-7
    epoch_precision = epoch_tpos / (epoch_tpos + epoch_fpos +eps )
    epoch_recall = epoch_tpos / (epoch_tpos + epoch_fneg +eps)
    epoch_f1 = 2* (  ((epoch_precision * epoch_recall)+eps) / (epoch_precision + epoch_recall +2*eps))
    
    # zero true_positive can cause F1=1, (because only 2*(epsilon / 2epsilon) remains, leading to 1)
    # fix it to zero
    if (epoch_precision==0 and epoch_recall==0):
        epoch_f1 = 0
    
    return epoch_loss / len(iterator), epoch_precision, epoch_recall, epoch_f1

In [150]:
# F1 version
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    #epoch_acc = 0
    epoch_tpos = 0
    epoch_fpos = 0
    epoch_fneg = 0    
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            if batch.t.shape[1] != BATCH_SIZE:
                continue

            predictions = model(torch.cat((batch.t, batch.h))).squeeze(1)
            
            loss = criterion(predictions, batch.l.float())
            
            tpos, fpos, fneg = f1_own_accuracy(predictions, batch.l.float())
            epoch_tpos += tpos
            epoch_fpos += fpos
            epoch_fneg += fneg            

            epoch_loss += loss.item()
            #epoch_acc += acc.item()

    # avoid div by zero with epsilon
    eps = 1e-7
    epoch_precision = epoch_tpos / (epoch_tpos + epoch_fpos +eps)
    epoch_recall = epoch_tpos / (epoch_tpos + epoch_fneg +eps)
    epoch_f1 = 2* (  ((epoch_precision * epoch_recall)+eps) / (epoch_precision + epoch_recall +2*eps))          
    
    # zero true_positive can cause F1=1, (because only 2*(epsilon / 2epsilon) remains, leading to 1)
    # fix it to zero
    if (epoch_precision==0 and epoch_recall==0):
        epoch_f1 = 0
    
    return epoch_loss / len(iterator), epoch_precision, epoch_recall, epoch_f1

In [151]:
def predict(model, iterator):
        
    model.eval()
    preds = []
    
    with torch.no_grad():
    
        for batch in iterator:
            #if batch.t.shape[1] != BATCH_SIZE:
            #    continue

            predictions = model(torch.cat((batch.t, batch.h))).squeeze(1)
            predictions = torch.round(predictions)
            
            cpu_pred = predictions.cpu()
            result = cpu_pred.data.numpy()            
            # unpack the #batch nr of predictions and add individually
            for item in result:
                preds.append(item)
    
    return preds

### Load model

In [152]:
# Common
INPUT_DIM = len(TEXT.vocab) # 25002
EMBEDDING_DIM = 50
OUTPUT_DIM = 126

### Models to try

Add wanted models to list, then train them all

In [153]:
# place to put model definitions
try_models = []
try_descs = []
try_epochs = []

### Try for best accuracy

In [154]:
# CNN with high nr of filters
N_FILTERS = 300
FILTER_SIZES = [3,5,7]
DROPOUT = 0.5
N_EPOCHS = 6 # full data, try 6
#N_EPOCHS = 1 # temporary

model = models.CNN2(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

# 14 epochs gave f1 0.84
desc = "CNN 300x3,5,7 filters"

try_models.append(model)
try_descs.append(desc)
try_epochs.append(N_EPOCHS)

In [155]:
# COMMON
def init_models():
    ''' init global parameters'''
    global model
    global optimizer
    global criterion
    global train_losses
    global train_f1s
    global val_losses
    global val_f1s
    global times
    global SEED
    global torch
    
    model = model.to(device)
    model.embedding.weight.data.copy_(pretrained_embeddings)

    optimizer = optim.Adam(model.parameters())

    #criterion = nn.BCEWithLogitsLoss()
    criterion = nn.BCELoss()
    criterion = criterion.to(device)

    # Reset 
    train_losses=[]
    train_f1s=[]
    val_losses=[]
    val_f1s=[]
    times=[]

    # If want repeatability
    SEED = 1
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    
    # Faster
    torch.backends.cudnn.deterministic = False

In [66]:
#model

In [67]:
#optimizer # default Adam

In [68]:
#optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.99))

In [156]:
try_descs

['CNN 300x3,5,7 filters']

### Train all models

In [157]:
# Store results of training
results = pd.DataFrame()

for i in range(len(try_models)):
    
    model = try_models[i]
    desc = try_descs[i]
    epochs = try_epochs[i]
    print(f'Training model: {desc}')

    # init global parameters
    init_models()
    model_name = type(model).__name__

    #N_EPOCHS = 3
    N_EPOCHS = epochs

    for epoch in range(N_EPOCHS):

        start = time.time()
        train_loss, train_precision, train_recall, train_f1 = train(model, train_iterator, optimizer, criterion)
        end = time.time(); elapsed = end-start
    
        valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(model, valid_iterator, criterion)
    
        times.append(elapsed) 
        train_losses.append(train_loss); train_f1s.append(train_f1)
        val_losses.append(valid_loss); val_f1s.append(valid_f1)
    
        print(f'| Ep:{epoch+1:02} |Tr Loss:{train_loss:.3f} |Prec:{train_precision:.3f} |Rec:{train_recall:.3f} |f1:{train_f1:.3f} |Val Loss:{valid_loss:.3f} |prec:{valid_precision:.3f} |rec:{valid_recall:.3f} |f1:{valid_f1:.3f} |')        


    #Test F1-score
    test_loss, test_precision, test_recall, test_f1 = evaluate(model, test_iterator, criterion)
    print(f'| Test Loss: {test_loss:.3f} ||Prec:{test_precision:.3f} |Rec:{test_recall:.3f} |Test F1:{test_f1: .3f} |')        
        
    # Store results
    times_cumul = pd.Series(times).cumsum() # cumulative time    
    results = results.append({'Model': model_name,
                          'Desc' : desc,
                          'Test_f1': test_f1,
                          'Time': times_cumul,
                          'Train_loss': pd.DataFrame({'Train_loss':train_losses}), 
                          'Train_f1': pd.DataFrame({'Train_f1':train_f1s}), 
                          'Val_loss': pd.DataFrame({'Val_loss':val_losses}), 
                          'Val_f1': pd.DataFrame({'Val_f1':val_f1s}),
                                         }, ignore_index=True)

    print(" ")
    print(f" Total time used for training: {sum(times)} s ##########")
    print(" ")

Training model: CNN 300x3,5,7 filters
| Ep:01 |Tr Loss:0.035 |Prec:0.857 |Rec:0.662 |f1:0.747 |Val Loss:0.026 |prec:0.836 |rec:0.829 |f1:0.833 |
| Ep:02 |Tr Loss:0.025 |Prec:0.882 |Rec:0.762 |f1:0.818 |Val Loss:0.025 |prec:0.836 |rec:0.864 |f1:0.850 |
| Ep:03 |Tr Loss:0.024 |Prec:0.888 |Rec:0.783 |f1:0.832 |Val Loss:0.022 |prec:0.862 |rec:0.865 |f1:0.864 |
| Ep:04 |Tr Loss:0.022 |Prec:0.893 |Rec:0.795 |f1:0.841 |Val Loss:0.021 |prec:0.869 |rec:0.876 |f1:0.872 |
| Ep:05 |Tr Loss:0.021 |Prec:0.897 |Rec:0.803 |f1:0.847 |Val Loss:0.020 |prec:0.870 |rec:0.885 |f1:0.877 |
| Ep:06 |Tr Loss:0.021 |Prec:0.899 |Rec:0.811 |f1:0.853 |Val Loss:0.018 |prec:0.888 |rec:0.882 |f1:0.885 |
| Test Loss: 0.018 ||Prec:0.888 |Rec:0.882 |Test F1: 0.885 |
 
 Total time used for training: 2698.9966175556183 s ##########
 


In [104]:
results   

Unnamed: 0,Desc,Model,Test_f1,Time,Train_f1,Train_loss,Val_f1,Val_loss
0,"CNN 300x3,5,7 filters",CNN2,0.819805,0 409.784567 dtype: float64,Train_f1 0 0.740232,Train_loss 0 0.035656,Val_f1 0 0.816793,Val_loss 0 0.028992


### Save training results - score and time

In [158]:
datetime_string = datetime.now().strftime("%Y%m%d-%H%M")
print(datetime_string)
results.to_pickle('model_results/results_CNN_all_traindata'+datetime_string+'.pkl')

20190106-1350


### Save the model

For this version create the model object with same paramters as when training. Then load weights.
This version saves also gradients etc.

In [159]:
torch.save(model.state_dict(), 'models/model_DICT_CNN300_f1_all_traindata_0_853_train.pt')
#torch.save(model, 'filename.pt')

In [160]:
# Whole model
torch.save(model, 'models/model_WHOLE_CNN300_f1_all_traindata_0_853_train.pt')

### Optional - Load Model

In [54]:
# If Whole model
model = torch.load('models/model_WHOLE_CNN300_f1_all_traindata_0_853_train.pt')

In [None]:
#Only Dict / weights

In [None]:
# USE same creation as when model was trained
#model2 = RNN(input_dim=25002, embedding_dim=50, hidden_dim=256, output_dim=126)

In [28]:
model.load_state_dict(torch.load('model_DICT_CNN300_f1_all_traindata_0_853_train.pt'))                 

In [29]:
model.eval()

CNN2(
  (embedding): Embedding(25002, 50)
  (convs): ModuleList(
    (0): Conv2d(1, 300, kernel_size=(3, 50), stride=(1, 1))
    (1): Conv2d(1, 300, kernel_size=(5, 50), stride=(1, 1))
    (2): Conv2d(1, 300, kernel_size=(7, 50), stride=(1, 1))
  )
  (fc): Linear(in_features=900, out_features=126, bias=True)
  (dropout): Dropout(p=0.5)
)

In [48]:
criterion = nn.BCELoss()
criterion = criterion.to(device)
model.to(device)

CNN2(
  (embedding): Embedding(25002, 50)
  (convs): ModuleList(
    (0): Conv2d(1, 300, kernel_size=(3, 50), stride=(1, 1))
    (1): Conv2d(1, 300, kernel_size=(5, 50), stride=(1, 1))
    (2): Conv2d(1, 300, kernel_size=(7, 50), stride=(1, 1))
  )
  (fc): Linear(in_features=900, out_features=126, bias=True)
  (dropout): Dropout(p=0.5)
)

### Test it on test data

In [161]:
test_loss, test_precision, test_recall, test_f1 = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} ||Prec:{test_precision:.3f} |Rec:{test_recall:.3f} |Test F1:{test_f1: .3f} |')

| Test Loss: 0.018 ||Prec:0.888 |Rec:0.882 |Test F1: 0.885 |


### Optional -  Extra test using training sets test-data in place of new data, so we have known labels to confirm result

In [206]:
# New data dataset
new_dataset1 = data.TabularDataset(
                                        path = 'input/test.json',
                                        format = 'json',
                                        #fields = fields_predict
                                        fields = fields
)

In [107]:
# Print out example
ex = new_dataset[1]
ex.t[0:10]

['The',
 'Zimbabwe',
 'dollar',
 'was',
 'quoted',
 'steady',
 'on',
 'the',
 'U.S.',
 'dollar']

In [108]:
# Print out example
ex.h

['ZIMBABWE', 'DOLLAR', 'STARTS', 'MONDAY', 'STABLE', 'TO', 'WEAKER.']

In [207]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

new_iterator1 = data.Iterator(
    dataset=new_dataset1, 
    batch_size=BATCH_SIZE,
    device=device,
    sort_key= lambda x: len(x.t),
    train=False,
    sort=None
)

In [208]:
preds = predict(model, new_iterator1)

# moved inside predict()
#rounded_preds = torch.round(preds)   
#preds_cpu = rounded_preds.cpu().data.numpy()

In [209]:
preds = np.asarray(preds)

In [114]:
preds_pd = pd.DataFrame(preds)
preds_pd.to_csv('output/predictions_trainset_test.csv', header=False, index=False)

In [115]:
test = pd.read_json('input/test.json', orient='records', lines=True)

In [199]:
# Ground truth, from pandas DataFrame
# Each is a 126 item list in a DataFrame
# Trun into 2-dim np.array

# explode the list into columns
temp = test['codes'].apply(pd.Series)
# turn to np.array
y_true = np.asarray(temp)

In [200]:
from sklearn.metrics import f1_score

f1_score(y_true,
         preds,
         average='micro')

0.16082617373947755

In [210]:
test_loss, test_precision, test_recall, test_f1 = evaluate(model, new_iterator1, criterion)

print(f'| Test Loss: {test_loss:.3f} ||Prec:{test_precision:.3f} |Rec:{test_recall:.3f} |Test F1:{test_f1: .3f} |')

| Test Loss: 0.018 ||Prec:0.888 |Rec:0.882 |Test F1: 0.885 |


### Predict on new data

In [162]:
# Test it on final test data
new_data = reuters = pd.read_pickle('input/data_new.pkl')
new_data.to_json('input/new_data.json', orient='records', lines=True)
len(new_data)

33142

In [172]:
temp = pd.read_json('input/new_data.json', orient='records', lines=True)

In [173]:
temp.head()

Unnamed: 0,codes,headline,text
0,[],PRESS DIGEST - SOUTH AFRICA - APRIL 10.,\nThese are the leading stories in the South A...
1,[],OFFICIAL JOURNAL CONTENTS - OJ C 110 OF APRIL ...,\n*\n(Note - contents are displayed in reverse...
2,[],OFFICIAL JOURNAL CONTENTS - OJ L 94 OF APRIL 9...,\n*\n(Note - contents are displayed in reverse...
3,[],OFFICIAL JOURNAL CONTENTS - OJ C 55 OF FEBRUAR...,"\n*\nMinutes of the sitting of Wednesday, 29 J..."
4,[],Toronto stocks close easier in lackluster deal...,\nCHANGE\t\t\t\t CHANGE\nTSE\t 5790.11 ...


In [179]:
fields_predict = {'headline': ('h', HEADLINE), 'text': ('t', TEXT)} # 'codes': ('l', LABELS)
fields_predict

{'headline': ('h', <torchtext.data.field.Field at 0x7f135bb027b8>),
 'text': ('t', <torchtext.data.field.Field at 0x7f135bb027f0>)}

In [187]:
# New data dataset
new_dataset = data.TabularDataset(
                                        path = 'input/new_data.json',
                                        format = 'json',
                                        fields = fields_predict
)

In [188]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

new_iterator = data.Iterator(
    dataset=new_dataset, 
    batch_size=BATCH_SIZE,
    device=device,
    sort_key= lambda x: len(x.t),
    train=False,
    sort=None
)

In [189]:
preds = predict(model, new_iterator)

In [190]:
preds = np.asarray(preds)

In [191]:
preds_pd = pd.DataFrame(preds)
# floats to int
preds_pd = preds_pd.astype(int)    

In [192]:
#datetime_string = datetime.now().strftime("%Y%m%d-%H%M")
#print(datetime_string)
preds_pd.to_csv('output/predictions_newdata_fulltraindata_CNN.csv', header=False, index=False)

In [None]:
len(preds_pd)

In [91]:
preds_pd.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,116,117,118,119,120,121,122,123,124,125
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Write predictions to file

In [None]:
datetime_string = datetime.now().strftime("%Y%m%d-%H%M")
print(datetime_string)

In [57]:
preds_pd.to_csv('output/predictions'+datetime_string+'.csv', header=False, index=False)