In [None]:
# import sys
# !{sys.executable} -m pip install tensorflow

In [2]:
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

from matplotlib import pyplot as plt
import seaborn as sns

# Combine data

In [7]:
df1 = pd.read_csv('./../labeledTweets/allLabeledTweets_clean.csv')
df1 = df1[['id', 'message', 'message_lowercase', 'label']]
df1.head()

Unnamed: 0,id,message,message_lowercase,label
0,1478404,Tiek vērtēti trīs potenciālie airBaltic invest...,tiek vērtēti trīs potenciālie airbaltic invest...,0
1,1478695,Augulis: #airBaltic “potenciālie pircēji ir no...,augulis: #airbaltic “potenciālie pircēji ir no...,0
2,1478812,airBaltic uzsāks lidojumus uz diviem jauniem g...,airbaltic uzsāks lidojumus uz diviem jauniem g...,0
3,1479295,Ministrs: Sarunas turpinās ar trīs potenciālaj...,ministrs: sarunas turpinās ar trīs potenciālaj...,0
4,1480097,@krisjaniskarins @Janis_Kazocins @EU2017EE Net...,@krisjaniskarins @janis_kazocins @eu2017ee net...,0


In [9]:
df2 = pd.read_csv('./../labeledTweets/p_n_n_tilde_lv_clean.csv')
df2 = df2[['id','message_lv_tilde', 'message_lowercase', 'label']]
df2 = df2.rename(columns={'message_lv_tilde': 'message'})
df2.head()

Unnamed: 0,id,message,message_lowercase,label
0,1.34e+18,@pilsonenjeff @lauferlaw @donwinslows pa reize...,@pilsonenjeff @lauferlaw @donwinslows pa reize...,2
1,1.33e+18,@tkdylan cilvēkiem ir aģentūra. Lins Vuds ir v...,@tkdylan cilvēkiem ir aģentūra. lins vuds ir v...,2
2,1.33e+18,"@foenixaew Es nojaušu, ka WWE lika viņam iznāk...","@foenixaew es nojaušu, ka wwe lika viņam iznāk...",2
3,1.32e+18,Maksvels droši vien pačurās mājā pirms mūsu nā...,maksvels droši vien pačurās mājā pirms mūsu nā...,2
4,1.32e+18,@msamson56 Esmu pārsteigts. KĀ cilvēki var atb...,@msamson56 esmu pārsteigts. kā cilvēki var atb...,2


In [10]:
print(df1.shape[0])
print(df2.shape[0])
df=pd.concat([df1, df2])
print(df.shape[0])

2025
174644
176669


# Train, validate split

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

In [12]:
df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

df.groupby(['label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,message,message_lowercase
label,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,train,66187,66187,66187
0,val,11809,11809,11809
1,train,44477,44477,44477
1,val,7927,7927,7927
2,train,38971,38971,38971
2,val,7298,7298,7298


In [13]:
df.head()

Unnamed: 0,id,message,message_lowercase,label,data_type
0,1478404.0,Tiek vērtēti trīs potenciālie airBaltic invest...,tiek vērtēti trīs potenciālie airbaltic invest...,0,train
1,1478695.0,Augulis: #airBaltic “potenciālie pircēji ir no...,augulis: #airbaltic “potenciālie pircēji ir no...,0,val
2,1478812.0,airBaltic uzsāks lidojumus uz diviem jauniem g...,airbaltic uzsāks lidojumus uz diviem jauniem g...,0,val
3,1479295.0,Ministrs: Sarunas turpinās ar trīs potenciālaj...,ministrs: sarunas turpinās ar trīs potenciālaj...,0,train
4,1480097.0,@krisjaniskarins @Janis_Kazocins @EU2017EE Net...,@krisjaniskarins @janis_kazocins @eu2017ee net...,0,train


In [12]:
# df.to_csv('./../labeledTweets/p_n_n_and_labeled_corpus_train_val.csv', index=False)

df = pd.read_csv('./../labeledTweets/p_n_n_and_labeled_corpus_train_val.csv')

# Tokenizer "bert-base-uncased"

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

### Find max length for tokenizer

In [None]:
token_lens = []
for txt in list(df[df.data_type=='train'].message_lowercase.values):
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))
    
sns.displot(token_lens)
plt.xlim([0, 250])
plt.xlabel('Token count')
plt.show()

In [14]:
max_length = 200

### Encode messages

In [15]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].message_lowercase.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length', 
    max_length=max_length, 
    return_tensors='pt',
    truncation=True
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].message_lowercase.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length', 
    max_length=max_length, 
    return_tensors='pt',
    truncation=True
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [48]:
len(dataset_train), len(dataset_val)

(149635, 27034)

In [52]:
torch.save(dataset_train, './datasetsLowercase/dataset_train.pt')
torch.save(dataset_val, './datasetsLowercase/dataset_val.pt')

In [3]:
dataset_train = torch.load('./datasetsLowercase/dataset_train.pt')
dataset_val = torch.load('./datasetsLowercase/dataset_val.pt')

In [4]:
len(dataset_train), len(dataset_val)

(149635, 27034)

# Model "bert-base-multilingual-cased"

In [5]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased",
                                                      num_labels=3,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [6]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

In [15]:
from transformers import get_linear_schedule_with_warmup

optimizer = torch.optim.AdamW(model.parameters(), lr= 0.01, eps=1e-8)
# optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

In [16]:
epochs = 3

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train)*epochs)

In [9]:
# Function to measure weighted F1

from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [10]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
model.to(device)
print(device)

cpu


In [11]:
# Function to evaluate model. Returns average validation loss, predictions, true values

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

# Train

In [37]:
torch.cuda.device_count()

1

In [12]:
torch.cuda.memory_allocated(0)

0

In [13]:
torch.cuda.memory_reserved(0)

0

In [None]:
torch.cuda.empty_cache()
total_memory = torch.cuda.get_device_properties(0).total_memory
torch.cuda.set_per_process_memory_fraction(0.5, 0)
application = int(total_memory * 0.499) - torch.cuda.max_memory_reserved()
tmp_tensor = torch.empty(application, dtype=torch.int8, device='cuda')
del tmp_tensor
torch.cuda.empty_cache()

In [18]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
#         progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'modelsLowercase/finetuned_BERT_lowercase_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/49879 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Evaluate

In [None]:
model.load_state_dict(torch.load('modelsLowercase/finetuned_BERT_lowercase_epoch_X.model', map_location=torch.device('X')))

_, predictions, true_vals = evaluate(dataloader_validation)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(true_vals, preds_flat))
pd.DataFrame(confusion_matrix(true_vals, preds_flat),
        index = [['actual', 'actual', 'actual'], ['neutral', 'positive', 'negative']],
        columns = [['predicted', 'predicted', 'predicted'], ['neutral', 'positive', 'negative']])