In [4]:
# Mount Google Drive so CSV files stored there can be read from this Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

# Root folder on the mounted drive; get_path() resolves the tweet files relative to it.
data_folder = "/content/drive/MyDrive/ColabData/"

# Disabled example: load the texts to classify from a single CSV column.
# NOTE(review): `pd` is only imported in a later cell and `file_name` is not
# defined anywhere in the visible notebook, so this snippet would not run as written.
#text_column = "text"
#df_pred = pd.read_csv(file_name)
#pred_texts = df_pred[text_column].dropna().astype('str').tolist()

Mounted at /content/drive


In [1]:
# Install the transformers library
!pip install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 8.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 31.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 74.7 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 7.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 74.3 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [2]:
# Import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Index-able view over a mapping of field name -> per-example values."""

    def __init__(self, tokenized_texts):
        # Stored as-is; every field is expected to be indexable by example position.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        """Number of examples, measured on the ``input_ids`` field."""
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict holding that position of every field."""
        example = {}
        for field, column in self.tokenized_texts.items():
            example[field] = column[idx]
        return example

In [3]:
# Small hand-written sample of texts (real input can be imported from .csv, .xls etc.)
pred_texts = [
    'I like that',
    'That is annoying',
    'This is great!',
    'Wouldn´t recommend it.',
]

In [6]:
import os
from datetime import datetime


def get_path(model_name, date = "2017-01-01", crypto = "bitcoin",
             base_folder = None, with_results_folder = False):
    """Build the path of the daily tweet CSV for ``crypto`` on ``date``.

    Parameters
    ----------
    model_name : str
        Model identifier, optionally prefixed with its developer
        (``"developer/model-name"``). It only influences the results folder.
    date : str
        Day to load, formatted ``YYYY-MM-DD``.
    crypto : str
        Prefix of the ``<crypto>_tweets/`` and ``<crypto>_scores/`` folders.
    base_folder : str, optional
        Root data folder. Defaults to the notebook-level ``data_folder``.
    with_results_folder : bool, optional
        When true, return ``(path, results_folder)`` instead of only ``path``.

    Returns
    -------
    str or tuple of (str, str)
        ``<base>/<crypto>_tweets/<Mon YY>/MTurk_<date>.csv`` and, on request,
        ``<base>/<crypto>_scores/<model folder>``.

    Raises
    ------
    ValueError
        If ``date`` does not match ``%Y-%m-%d``.
    """
    if base_folder is None:
        base_folder = data_folder

    # "developer/model" -> ("developer", "model"); a name without a slash has
    # no developer part (handled explicitly instead of via a swallowed exception).
    model_developer, slash, short_name = model_name.partition('/')
    if not slash:
        model_developer, short_name = None, model_name

    # Keep at most the first four dash-separated tokens of the model name.
    model_folder = '-'.join(short_name.split('-')[:4])
    if model_developer is not None:
        model_folder = f"{model_folder}_{model_developer}"

    source_folder = os.path.join(base_folder, f"{crypto}_tweets/")
    results_folder = os.path.join(base_folder, f"{crypto}_scores/", model_folder)

    # Source files are grouped in month folders named like "Jan 17".
    month = datetime.strptime(date, "%Y-%m-%d").strftime('%b %y')
    path = source_folder + month + "/MTurk_" + date + ".csv"

    if with_results_folder:
        return path, results_folder
    return path

In [7]:
# Model identifier in "developer/model" form; it is passed to from_pretrained() in a later cell.
model_name = "siebert/sentiment-roberta-large-english"
# Day of tweets to score, formatted YYYY-MM-DD as get_path() expects.
date = "2017-01-01"
# Path of that day's tweet CSV below data_folder.
path = get_path(model_name, date=date)
path

'/content/drive/MyDrive/ColabData/bitcoin_tweets/Jan 17/MTurk_2017-01-01.csv'

In [8]:
def scale_tweet_list(percentage_per_chunk, save_every, tweets):
    """Trim ``tweets`` to a leading chunk whose size is a multiple of ``save_every``.

    The chunk size is ``percentage_per_chunk`` percent of ``len(tweets)``,
    truncated to an integer and then rounded down to a multiple of
    ``save_every``. If that leaves zero, ``save_every`` itself is used.

    Returns ``(chunk, size)``. ``size`` is the slice bound that was applied, so
    it can be larger than ``len(chunk)`` when ``tweets`` holds fewer than
    ``save_every`` items.
    """
    share = int(len(tweets) * percentage_per_chunk / 100)
    leftover = share % save_every

    # Fall back to a single save interval when the rounded share is empty.
    cutoff = (share - leftover) or save_every

    return tweets[:cutoff], cutoff

In [9]:
# Read the day's tweets. Passing the path lets pandas open the file itself
# (UTF-8 by default) instead of decoding with the platform's locale encoding,
# which is what a bare open(path) does.
df = pd.read_csv(path)

ids, tweets = df["id"].values.tolist(), df["tweet"].values.tolist()

# Keep the first 50% of the tweets, rounded down to a multiple of 2000
# (scale_tweet_list falls back to 2000 when that would be zero).
pred_texts, preds_length = scale_tweet_list(50, 2000, tweets)

In [10]:
import torch

# Use the first CUDA GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Load tokenizer and model, create trainer
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# Tokenizers have no .to() method (calling it raises AttributeError); only the
# model is placed on `device`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

trainer = Trainer(model=model, tokenizer=tokenizer)
trainer

Downloading:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# Tokenize the texts (padded and truncated) and wrap them as the prediction data set
tokenized_texts = tokenizer(
    pred_texts,
    padding=True,
    truncation=True,
)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
# Run predictions.
# Later cells read the result through `.predictions` and through index 0.
predictions = trainer.predict(pred_dataset)

In [None]:
def save_sentiments(ids, results, results_folder, date):
    """Append one batch of results to ``<results_folder>/<date>.csv``.

    ``results`` is converted with ``to_dict_of_lists`` and ``ids`` is attached
    under the ``"ids"`` key before the rows are written. Rows are appended
    without a header line and without the DataFrame index, so repeated calls
    for the same ``date`` extend the same file.

    NOTE(review): ``to_dict_of_lists`` is not defined in the visible notebook;
    presumably it returns a dict of equally long lists - confirm, and make sure
    ``ids`` has that same length.
    """
    # exist_ok=True creates the folder when needed without a separate
    # exists() check, so there is no window between checking and creating.
    os.makedirs(results_folder, exist_ok=True)

    outputs = to_dict_of_lists(results)
    outputs["ids"] = ids

    results_path = os.path.join(results_folder, date + ".csv")
    pd.DataFrame(outputs).to_csv(results_path, mode="a", header=False, index=False)

In [None]:
results = predictions

# `results_folder` is not defined at notebook level (it exists only inside
# get_path()), so build it here with the same naming scheme:
# <data_folder>/bitcoin_scores/<first four name tokens>_<developer>.
model_developer, slash, short_model_name = model_name.partition('/')
model_folder = '-'.join((short_model_name if slash else model_name).split('-')[:4])
if slash:
    model_folder = f"{model_folder}_{model_developer}"
results_folder = os.path.join(data_folder, "bitcoin_scores/", model_folder)

# Only the first len(pred_texts) tweets were scored, so pass just their ids.
save_sentiments(ids[:len(pred_texts)], results, results_folder, date)

In [None]:
# Transform predictions to labels
logits = predictions.predictions
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)

# Softmax over the class axis. Subtracting the row maximum first keeps np.exp
# from overflowing on large logits; the resulting probabilities are unchanged.
shifted_logits = logits - logits.max(-1, keepdims=True)
probabilities = np.exp(shifted_logits)
probabilities = probabilities / probabilities.sum(-1, keepdims=True)

# Score = probability of the predicted class.
scores = probabilities.max(-1)

In [None]:
# Assemble one row per text: the text, predicted class id, label, and score
df = pd.DataFrame(
    data=list(zip(pred_texts, preds, labels, scores)),
    columns=['text', 'pred', 'label', 'score'],
)
df.head()