In [1]:
# Detect Google Colab. On Colab, Google Drive is mounted so the data stored
# there can be read; anywhere else the import fails and a local folder is used.
try:
    from google.colab import drive
    IN_COLAB = True
    drive.mount('/content/drive')
except ModuleNotFoundError:
    IN_COLAB = False

# Example of reading texts from a csv file (kept commented out):
#text_column = "text"
#df_pred = pd.read_csv(file_name)
#pred_texts = df_pred[text_column].dropna().astype('str').tolist()

# Root folder of the input/output data for the current environment.
data_folder = 'drive/MyDrive/ColabData' if IN_COLAB else '../data'
data_folder

'../data'

In [2]:
# Install the transformers library
!pip install transformers



In [3]:
import torch
import pandas as pd
import numpy as np
import os
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

In [5]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def scale_tweet_list(percentage_per_chunk, save_every, tweets):
    """Return the leading chunk of ``tweets`` to score, together with its length.

    The chunk length is ``percentage_per_chunk`` percent of ``len(tweets)``,
    rounded down to a multiple of ``save_every``. If that rounds down to zero,
    one full ``save_every`` batch is requested instead. The length is then
    capped at ``len(tweets)`` so the returned count always equals
    ``len(scaled_tweets)``.

    Args:
        percentage_per_chunk: Share of the tweets to keep, in percent.
        save_every: Batch size the chunk length is rounded down to; must be
            non-zero (it is used as a modulus).
        tweets: Sliceable sequence of tweets.

    Returns:
        Tuple ``(scaled_tweets, last_tweet)`` where ``scaled_tweets`` is
        ``tweets[:last_tweet]``.
    """
    length_of_tweets = len(tweets)
    percent_of_length = int(length_of_tweets * percentage_per_chunk / 100)

    # Round down to a whole number of save_every-sized batches.
    last_tweet = percent_of_length - (percent_of_length % save_every)

    # Never return an empty chunk just because the percentage was too small.
    if last_tweet == 0:
        last_tweet = save_every

    # The slice below cannot go past the end of the list, so the reported
    # count must not either (e.g. fewer than save_every tweets available).
    last_tweet = min(last_tweet, length_of_tweets)

    scaled_tweets = tweets[:last_tweet]
    return scaled_tweets, last_tweet


def save_sentiments(ids, results, results_folder, date):
    """Append a batch of sentiment results to ``<results_folder>/<date>.csv``.

    ``results`` is converted with ``to_dict_of_lists`` into one column per
    key, and ``ids`` is added as an ``"ids"`` column. The rows are appended
    to the csv file without a header row and without the index, so repeated
    calls for the same date accumulate in one file.

    Args:
        ids: One identifier per entry of ``results``.
        results: Sequence accepted by ``to_dict_of_lists``.
        results_folder: Folder for the csv file; created if it is missing.
        date: File name stem of the csv file (``".csv"`` is appended).
    """
    # exist_ok avoids the race between checking for the folder and creating it.
    os.makedirs(results_folder, exist_ok=True)

    outputs = to_dict_of_lists(results)
    outputs["ids"] = ids

    df = pd.DataFrame(outputs)

    results_path = os.path.join(results_folder, date + ".csv")
    # Append mode with header=False: the file never contains a header row.
    df.to_csv(results_path, mode="a", header=False, index=False)


def to_dict_of_lists(LD):
    """Merge the first dict of every entry in ``LD`` into one dict of lists.

    Each entry of ``LD`` is indexed with ``[0]`` to obtain a dict; its values
    are appended, in iteration order, to the list stored under the same key
    in the result. Elements after the first one in an entry are ignored.
    """
    merged = {}
    for entry in LD:
        for key, value in entry[0].items():
            # setdefault creates the list the first time a key is seen.
            merged.setdefault(key, []).append(value)
    return merged


def get_path(model_name, data_folder, date="2017-01-01", crypto="bitcoin"):
    """Build the tweet source folder and the model-specific results folder.

    The model folder name is made of the first four dash-separated parts of
    the model name. If ``model_name`` has the form ``"<developer>/<name>"``,
    the part before the first ``/`` is appended as ``"_<developer>"``.

    Args:
        model_name: Model identifier, with or without a ``"<developer>/"`` prefix.
        data_folder: Root data folder.
        date: Not used by this function; kept so existing calls keep working.
        crypto: Prefix of the ``<crypto>_tweets/`` and ``<crypto>_scores/`` folders.

    Returns:
        Tuple ``(source_folder, results_folder)``.
    """
    raw_source_folder = f"{crypto}_tweets/"
    raw_results_folder = f"{crypto}_scores/"

    # Split off the developer prefix explicitly instead of relying on
    # exceptions (and an unbound local) to detect that it is missing.
    if '/' in model_name:
        model_developer, short_name = model_name.split('/', 1)
    else:
        model_developer, short_name = None, model_name

    model_folder = '-'.join(short_name.split('-')[:4])
    if model_developer is not None:
        model_folder = f"{model_folder}_{model_developer}"

    source_folder = os.path.join(data_folder, raw_source_folder)
    results_folder = os.path.join(data_folder, raw_results_folder, model_folder)

    return source_folder, results_folder

In [6]:
# Pick the model and the day to score, then derive the folders for that run.
model_name = "siebert/sentiment-roberta-large-english"
date = "2017-01-01"
source_folder, results_folder = get_path(model_name, data_folder, date=date)

# The day's file sits in a sub-folder named after the abbreviated month and
# two-digit year of `date`; source_folder already ends with a separator.
month = datetime.strptime(date, "%Y-%m-%d").strftime('%b %y')
path = f"{source_folder}{month}/MTurk_{date}.csv"
path

'../data/bitcoin_tweets/Jan 17/MTurk_2017-01-01.csv'

In [8]:
# Load the day's tweets. The path is given to pandas directly with an explicit
# UTF-8 encoding; a bare open(path) decodes with the platform's locale
# encoding, which can fail or garble non-ASCII tweet text.
df = pd.read_csv(path, encoding="utf-8")

ids, tweets = df["id"].values.tolist(), df["tweet"].values.tolist()
percentage_per_chunk = 1  # share of the tweets to score, in percent
save_every = 20           # the chunk length is rounded down to a multiple of this

# pred_texts is the leading slice of `tweets`. `ids` is NOT truncated here, so
# slice it to len(pred_texts) before pairing ids with predictions.
pred_texts, preds_length = scale_tweet_list(percentage_per_chunk, save_every, tweets)

In [10]:
# Load the tokenizer and the classification model for `model_name`, move the
# model to the selected device and wrap it in a Trainer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

trainer = Trainer(model=model)

In [11]:
# Tokenize all texts to score, with truncation and padding enabled so every
# encoded text has the same length.
tokenized_texts = tokenizer(pred_texts, truncation=True, padding=True)

In [13]:
# importing the required libraries
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Dataset wrapper around the tokenizer output
class SimpleDataset(Dataset):
    """Expose tokenizer output as an indexable dataset, one example per index."""

    def __init__(self, tokenized_texts):
        # Mapping of feature name -> per-example values; must contain "input_ids".
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        """Return the number of examples, i.e. the length of ``"input_ids"``."""
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        """Return ``{feature_name: value_at_idx}`` for the example at ``idx``."""
        example = {}
        for name, values in self.tokenized_texts.items():
            example[name] = values[idx]
        return example
  
pred_dataset = SimpleDataset(tokenized_texts)

# Run inference over the whole prediction set.
# Trainer.predict takes the Dataset itself (not individual batches): it does
# the batching and device placement internally and returns the outputs in
# dataset order, which keeps them aligned with pred_texts and ids.
predictions = trainer.predict(pred_dataset)

# Convert the logits into one [{"label": ..., "score": ...}] entry per text,
# the nested layout that save_sentiments / to_dict_of_lists consume.
# NOTE(review): the "label"/"score" keys are assumed to match the columns of
# the existing score files - confirm before mixing with older results.
logits = predictions.predictions
shifted_logits = logits - logits.max(axis=-1, keepdims=True)  # stable softmax
probabilities = np.exp(shifted_logits) / np.exp(shifted_logits).sum(axis=-1, keepdims=True)
sentiment_rows = [
    [{"label": model.config.id2label[int(row.argmax())], "score": float(row.max())}]
    for row in probabilities
]

# pred_texts is the leading slice of the loaded tweets, so the first
# len(sentiment_rows) ids belong to these predictions.
save_sentiments(ids[:len(sentiment_rows)], sentiment_rows, results_folder, date)


KeyboardInterrupt: 

In [None]:
# Turn the raw model outputs into class ids, readable labels and scores.
logits = predictions.predictions
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)

# Softmax over the last axis; the score is the largest probability per row.
exp_logits = np.exp(logits)
scores = (exp_logits / exp_logits.sum(-1, keepdims=True)).max(1)

In [None]:
# Assemble one row per text: the text, its predicted class id, label and score.
result_columns = ['text', 'pred', 'label', 'score']
result_rows = list(zip(pred_texts, preds, labels, scores))
df = pd.DataFrame(result_rows, columns=result_columns)
df.head()