In [1]:


from google.colab import drive
drive.mount('/content/drive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!pip install -U torch

Collecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/13/70/54e9fb010fe1547bc4774716f11ececb81ae5b306c05f090f4461ee13205/torch-1.5.0-cp36-cp36m-manylinux1_x86_64.whl (752.0MB)
[K     |████████████████████████████████| 752.0MB 22kB/s 
[31mERROR: torchvision 0.5.0 has requirement torch==1.4.0, but you'll have torch 1.5.0 which is incompatible.[0m
Installing collected packages: torch
  Found existing installation: torch 1.4.0
    Uninstalling torch-1.4.0:
      Successfully uninstalled torch-1.4.0
Successfully installed torch-1.5.0


In [3]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [0]:


import torch
import torchtext
from torchtext.data import Field, LabelField
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator
import spacy
import en_core_web_sm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from tqdm import tqdm, trange
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix



In [5]:


## Set seed of randomization and working device
manual_seed = 77
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
n_gpu = torch.cuda.device_count()
if n_gpu > 0:
    torch.cuda.manual_seed(manual_seed)



cpu


In [0]:
spacy_en = en_core_web_sm.load()
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [0]:


TEXT = Field(sequential=True, tokenize=tokenize_en, lower=True)
LABEL = Field(sequential=False, unk_token = None)



In [0]:
train, val, test = TabularDataset.splits(
               path="/content/drive/My Drive/Colab Notebooks/Block-6_NJS/financial_analysis/Milestone_3/finance_news_data/", # the root directory where the data lies
               train='finan_news_train-text.csv', validation="finan_news_dev-text.csv", test="finan_news_test-text.csv", # file names
               format='csv',
               skip_header=True, # if your tsv file has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=[('X', TEXT), ('y', LABEL)])

In [0]:
TEXT.build_vocab(train, min_freq=2)
LABEL.build_vocab(train)

In [10]:
print("Vocabulary size of TEXT:",len(TEXT.vocab.stoi))
print("Vocabulary size of LABEL:",len(LABEL.vocab.stoi))

Vocabulary size of TEXT: 1159
Vocabulary size of LABEL: 2


In [0]:


train_iter, val_iter, test_iter = BucketIterator.splits(
 (train, val, test), # we pass in the datasets we want the iterator to draw data from
 batch_sizes=(64,256,256),
 sort_key=lambda x: len(x.X), 
 sort=True,
# A key to use for sorting examples in order to batch together examples with similar lengths and minimize padding. 
 sort_within_batch=True
)



In [0]:

# To define a CNN class
class CNN_Text(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim, output_size, kernel_num, region_sizes, dropout):
        '''
        vocabulary_size: vocabulary size
        embedding_dim: word embedding size
        output_size: number of classes in prediction
        kernel_num: number of kernels (number of output channels of convolutional layers)
        region_sizes: height of kernels of convolutional layers
        dropout: dropout rate
        '''
        super(CNN_Text, self).__init__()
        # the size of input channel is 1.
        Ci = 1
        
        # word embedding layer
        self.embeddings = nn.Embedding(num_embeddings = vocabulary_size, embedding_dim = embedding_dim )
        
        # convolution with kernels
        self.convolution_layers = nn.ModuleList([nn.Conv2d(in_channels = Ci, out_channels = kernel_num, kernel_size = (K, embedding_dim)) for K in region_sizes])
        
        # a dropout layer
        self.dropout = nn.Dropout(dropout) 
        
        # fully connected layer
        self.fc = nn.Linear(len(kernel_sizes) * kernel_num, output_size)

    def forward(self, x):
        # input x  [sequence length, batch size]
        
        input_embeddings = self.embeddings(x)  
        # (batch size, word_sequence, embedding_dim) word embedding

        input_embeddings = input_embeddings.permute(1,0,2)
        input_embeddings = input_embeddings.unsqueeze(1)
        #  [batch size, number of channel is one, sequence length, embeeding size]

        # convolutional layers
        convolute_outputs = [F.relu(conv(input_embeddings)).squeeze(3) for conv in self.convolution_layers]  
        
        # to get the maximum value of filtered tensor
        max_pooling_outputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in convolute_outputs] 
        
        concat_list = torch.cat(max_pooling_outputs, 1) # concatenate representations
        
        drop_output = self.dropout(concat_list)  # add drop layer
        
        fc1_output = self.fc(drop_output)  # get the fc1 using a fully connected layer
        
        final_output = F.softmax(fc1_output,dim=1)
        
        return final_output



In [0]:


# Hyper Parameters

# the vocabulary size
vocabulary_size = len(TEXT.vocab.stoi) 

# Dimension of word embedding is 300. Namely, each word is expressed by a vector that has 300 dimensions.
embedding_dim = 300 

# region size as 2, 3, and 4
kernel_sizes = [2,3,4] 

# the number of kernel in each region size
kernels_num = 32  

# The dropout rate is set to be 0.5.
dropout = 0.5

# The output size of labels.
output_size = 2

# learning rate is set to be 0.01.
lr = 0.01        

# The number of iteration is set to be 5.
num_epoch = 5  

# employ class CNN_Text and assign to cnn
model = CNN_Text(vocabulary_size, embedding_dim, output_size, kernels_num, kernel_sizes, dropout).to(device)



In [14]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.1)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)


CNN_Text(
  (embeddings): Embedding(1159, 300)
  (convolution_layers): ModuleList(
    (0): Conv2d(1, 32, kernel_size=(2, 300), stride=(1, 1))
    (1): Conv2d(1, 32, kernel_size=(3, 300), stride=(1, 1))
    (2): Conv2d(1, 32, kernel_size=(4, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=96, out_features=2, bias=True)
)

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 434,390 trainable parameters


In [0]:
# Loss and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr)   # define a optimizer for backpropagation
criterion = nn.CrossEntropyLoss()   # define loss funtion

In [0]:
def train(model, iterator, optimizer, criterion):
    
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        batch_input, labels = batch.X, batch.y
        batch_input = batch_input.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        
        outputs = model(batch_input)

        loss = criterion(outputs, labels)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.cpu().item()

    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    all_pred=[]
    all_label = []
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            batch_input, labels = batch.X, batch.y
            batch_input = batch_input.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(batch_input)

            loss = criterion(outputs, labels)

            epoch_loss += loss.cpu().item()

            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists
            all_pred.extend(predicted)
            all_label.extend(labels.cpu())
    
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro') 
    return epoch_loss / len(iterator), accuracy, f1score

In [19]:


# Train the model
MAX_EPOCH = 15
total_step = len(train_iter)
loss_list = []
acc_list = []

for epoch in trange(MAX_EPOCH, desc="Epoch"):
    train_loss = train(model, train_iter, optimizer, criterion)  
    val_loss, val_acc, val_f1 = evaluate(model, val_iter, criterion)

    # Create checkpoint at end of each epoch
    state_dict_model = model.state_dict() 
    state = {
        'epoch': epoch,
        'state_dict': state_dict_model,
        'optimizer': optimizer.state_dict()
        }

    torch.save(state, "./drive/My Drive/Colab Notebooks/CNN_TEXT_"+str(epoch+1)+".pt")

    print('\n Epoch [{}/{}], Train Loss: {:.4f}, Validation Loss: {:.4f}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}'.format(epoch+1, MAX_EPOCH, train_loss, val_loss, val_acc, val_f1))



Epoch:   7%|▋         | 1/15 [00:01<00:25,  1.84s/it]


 Epoch [1/15], Train Loss: 0.7117, Validation Loss: 0.6466, Validation Accuracy: 0.6667, Validation F1: 0.4000


Epoch:  13%|█▎        | 2/15 [00:03<00:23,  1.78s/it]


 Epoch [2/15], Train Loss: 0.7001, Validation Loss: 0.6466, Validation Accuracy: 0.6667, Validation F1: 0.4000


Epoch:  20%|██        | 3/15 [00:05<00:21,  1.80s/it]


 Epoch [3/15], Train Loss: 0.6623, Validation Loss: 0.6465, Validation Accuracy: 0.6310, Validation F1: 0.5295


Epoch:  27%|██▋       | 4/15 [00:07<00:19,  1.79s/it]


 Epoch [4/15], Train Loss: 0.6266, Validation Loss: 0.6463, Validation Accuracy: 0.6667, Validation F1: 0.4000


Epoch:  33%|███▎      | 5/15 [00:08<00:17,  1.79s/it]


 Epoch [5/15], Train Loss: 0.5718, Validation Loss: 0.7049, Validation Accuracy: 0.5119, Validation F1: 0.5085


Epoch:  40%|████      | 6/15 [00:10<00:16,  1.78s/it]


 Epoch [6/15], Train Loss: 0.5882, Validation Loss: 0.6064, Validation Accuracy: 0.7143, Validation F1: 0.6583


Epoch:  47%|████▋     | 7/15 [00:12<00:14,  1.77s/it]


 Epoch [7/15], Train Loss: 0.4467, Validation Loss: 0.6230, Validation Accuracy: 0.6786, Validation F1: 0.6415


Epoch:  53%|█████▎    | 8/15 [00:14<00:12,  1.76s/it]


 Epoch [8/15], Train Loss: 0.3960, Validation Loss: 0.6278, Validation Accuracy: 0.6548, Validation F1: 0.5599


Epoch:  60%|██████    | 9/15 [00:15<00:10,  1.73s/it]


 Epoch [9/15], Train Loss: 0.3693, Validation Loss: 0.6343, Validation Accuracy: 0.6548, Validation F1: 0.5822


Epoch:  67%|██████▋   | 10/15 [00:17<00:08,  1.76s/it]


 Epoch [10/15], Train Loss: 0.3759, Validation Loss: 0.6677, Validation Accuracy: 0.6071, Validation F1: 0.5753


Epoch:  73%|███████▎  | 11/15 [00:19<00:07,  1.80s/it]


 Epoch [11/15], Train Loss: 0.3583, Validation Loss: 0.5939, Validation Accuracy: 0.7262, Validation F1: 0.6406


Epoch:  80%|████████  | 12/15 [00:21<00:05,  1.78s/it]


 Epoch [12/15], Train Loss: 0.3360, Validation Loss: 0.6052, Validation Accuracy: 0.6786, Validation F1: 0.6351


Epoch:  87%|████████▋ | 13/15 [00:22<00:03,  1.76s/it]


 Epoch [13/15], Train Loss: 0.3446, Validation Loss: 0.5913, Validation Accuracy: 0.7143, Validation F1: 0.6500


Epoch:  93%|█████████▎| 14/15 [00:24<00:01,  1.76s/it]


 Epoch [14/15], Train Loss: 0.3315, Validation Loss: 0.5870, Validation Accuracy: 0.7143, Validation F1: 0.6500


Epoch: 100%|██████████| 15/15 [00:26<00:00,  1.77s/it]


 Epoch [15/15], Train Loss: 0.3390, Validation Loss: 0.6063, Validation Accuracy: 0.6905, Validation F1: 0.6677





In [20]:
best_model = CNN_Text(vocabulary_size, embedding_dim, output_size, kernels_num, kernel_sizes, dropout).to(device)
best_model.load_state_dict(torch.load('/content/drive/My Drive/Colab Notebooks/CNN_TEXT_15.pt')['state_dict'])
test_loss, test_acc, test_f1 = evaluate(best_model, test_iter, criterion)
print('Final Test Scores:', 'Loss:',test_loss, '| Accuracy:',test_acc, "| F1 score:",test_f1)

Final Test Scores: Loss: 0.6688445806503296 | Accuracy: 0.611764705882353 | F1 score: 0.5838896306186026
