In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import XLNetTokenizer, XLNetModel, XLNetForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
df = pd.read_csv('data/sentiment_annotated_with_texts.csv')
df.head()

Unnamed: 0,published_at,ticker,true_sentiment,title,author,url,source,text,finbert_sentiment,finbert_sent_score
0,1/12/23 7:47,EURCHF,Positive,Euro to benefit from the ECBs pronounced hawki...,FXStreet Insights Team,https://www.fxstreet.com/news/euro-to-benefit-...,FX Street,The Euro was able to appreciate particularly s...,Positive,0.85
1,1/12/23 10:34,EURCHF,Positive,EURCHF Trend higher may remain in place – ING,FXStreet Insights Team,https://www.fxstreet.com/news/eur-chf-trend-hi...,FX Street,EUR/CHF yesterday broke above 1.00. Economists...,Positive,0.51
2,1/12/23 11:40,EURCHF,Neutral,Does a jump in EURCHF point to a break above 1...,FXStreet Insights Team,https://www.fxstreet.com/news/does-a-jump-in-e...,FX Street,EUR/CHF vaults parity for the first time since...,Neutral,0.37
3,1/12/23 15:32,EURCHF,Positive,EURCHF could extend its advance back to levels...,FXStreet Insights Team,https://www.fxstreet.com/news/eur-chf-could-ex...,FX Street,EUR/CHF climbs back above parity. Economists a...,Positive,0.64
4,1/13/23 11:37,EURCHF,Positive,EURCHF to head higher towards 10130 and projec...,FXStreet Insights Team,https://www.fxstreet.com/news/eur-chf-to-head-...,FX Street,EUR/CHF has broken out above the sideways rang...,Positive,0.83


In [5]:
# pre-process data
headlines = df['title'].tolist()

In [6]:
headlines

['Euro to benefit from the ECBs pronounced hawkish determination – Commerzbank',
 'EURCHF Trend higher may remain in place – ING',
 'Does a jump in EURCHF point to a break above 108 in EURUSD – SocGen',
 'EURCHF could extend its advance back to levels between 102 and 104 – MUFG',
 'EURCHF to head higher towards 10130 and projections of 1024010260 – SocGen',
 'EURCHF Room for the Euro to extend the move higher – MUFG',
 'USDCHF stalls its run higher at the 200 bar MA on the 4 hour chart and 38.2% retracement',
 'EURCHF reaches 38.2% of the 2 year range as run higher continues',
 'EURCHF Bias would be for stronger Franc but waiting for clearer SNB monetary policy stance – Credit Suisse',
 'New lows for the EURUSD. EURCHF down as well and tests its 200 day MA.',
 'EURCHF Still room to rise toward the 10500 area – MUFG',
 'EURCHF to remain well supported amid widening ECBSNB policy divergence – CIBC',
 'EUR enjoys advantage to USD CHF and GBP – TDS',
 'SNB Jordan: Price stability does not 

In [34]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

XLNetTokenizer(name_or_path='xlnet-large-cased', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<sep>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>', 'additional_special_tokens': ['<eop>', '<eod>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<cls>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<sep>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("<pad>", rs

In [35]:
# positive, negative, neutral
model = XLNetForSequenceClassification.from_pretrained("xlnet/xlnet-base-cased", num_labels=3)


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet/xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# Tokenize headlines
encoded_input = tokenizer(headlines, padding=True, truncation=True, return_tensors='pt', max_length=512)
encoded_input

{'input_ids': tensor([[    5,     5,     5,  ...,  4847,     4,     3],
        [    5,     5,     5,  ...,  5103,     4,     3],
        [    5,     5,     5,  ..., 19451,     4,     3],
        ...,
        [    5,     5,     5,  ...,    83,     4,     3],
        [    5,     5,     5,  ..., 26722,     4,     3],
        [    5,     5,     5,  ...,  2931,     4,     3]]), 'token_type_ids': tensor([[3, 3, 3,  ..., 0, 0, 2],
        [3, 3, 3,  ..., 0, 0, 2],
        [3, 3, 3,  ..., 0, 0, 2],
        ...,
        [3, 3, 3,  ..., 0, 0, 2],
        [3, 3, 3,  ..., 0, 0, 2],
        [3, 3, 3,  ..., 0, 0, 2]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])}

In [39]:
with torch.no_grad():
    outputs = model(**encoded_input)
    predictions = torch.argmax(outputs.logits, dim=-1)
predictions

tensor([1, 1, 1,  ..., 1, 0, 1])

In [46]:
sentiment_labels = ['Negative', 'Neutral', 'Positive']
predicted_sentiments = [sentiment_labels[pred] for pred in predictions]

In [47]:
df['xlnet_sentiment'] = predicted_sentiments

In [48]:
df.to_csv('data/sentiment_annotated_with_texts_XLnet.csv', index=False)

In [61]:
# preprocess ground truth labels
label_dict = {"Negative": 0, "Neutral": 1, "Positive": 2}
df['true_label'] = df['true_sentiment'].map(label_dict)
labels_tensor = torch.tensor(df['true_label'], dtype=torch.long)
labels_tensor

tensor([2, 2, 1,  ..., 1, 2, 0])

Accuracy

In [56]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)



In [58]:
def compute_accuracy(predictions, labels):
    pred_flat = predictions.flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [63]:
# Create train dataset
train_dataset = TensorDataset(encoded_input['input_ids'], encoded_input['attention_mask'], labels_tensor)

# Create the DataLoader
num_epochs = 2
batch_size = 16  # Adjust as necessary
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


In [66]:
for epoch in range(num_epochs):  # num_epochs is the number of epochs you want to train for
    model.train()
    total_loss, total_accuracy = 0, 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predictions = torch.argmax(logits, dim=-1)
        total_accuracy += compute_accuracy(predictions.cpu().numpy(), labels.cpu().numpy())

    avg_train_loss = total_loss / len(train_loader)
    avg_train_accuracy = total_accuracy / len(train_loader)
    print(f"Epoch {epoch} | Train Loss: {avg_train_loss:.4f} | Train Accuracy: {avg_train_accuracy:.4f}")


Epoch 0 | Train Loss: 1.1206 | Train Accuracy: 0.3652
Epoch 1 | Train Loss: 1.0382 | Train Accuracy: 0.4699
