Transfer learning using cryptobert and roberta

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig, TextClassificationPipeline
from transformers import pipeline
import numpy as np
import pandas as pd
from scipy.special import softmax
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip install torch



In [3]:
!pip install protobuf==3.20.0



## Step 1 - Importing a sample of some Bitcoin Tweet Data to begin analysing the model

In [4]:
# 20230309 - test data from twitter api, from 20220924 to 20221223 
data = pd.read_csv('~/Code/giadapi/crypto/data/raw/tweet_24-09-22_to_24-12-23.csv')

In [5]:
data

Unnamed: 0,datetime,username,text
0,2022-09-24 06:09:59+00:00,martindarkobtc,"White House Press Secretary is like, ""It's coo..."
1,2022-09-24 06:09:44+00:00,BigSeanHarris,Raise capital\nBuy #bitcoin
2,2022-09-24 06:09:43+00:00,hungjae_oh,We are programmed to react on fear. So if you ...
3,2022-09-24 06:09:18+00:00,gravityink1,The future is with #bitcoin.
4,2022-09-24 06:09:17+00:00,onCh41n,Volatility Continues as Interest Rates Rise #b...
...,...,...,...
39508,2022-12-23 22:00:01+00:00,HourlyBTCUpdate,Bitcoin: $16810.1\n💚 +5.44 last 1 Hour (+0.03%...
39509,2022-12-23 22:00:00+00:00,moret_io,"Now #Bitcoin is worth $16.8 thousand, #Ethereu..."
39510,2022-12-23 22:00:00+00:00,whalesradar_com,#WANUSDT #WAN \nSignal #2 \n\nLast Signal: 37...
39511,2022-12-23 22:00:00+00:00,murray_rothbot,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...


In [6]:
data['datetime']

0        2022-09-24 06:09:59+00:00
1        2022-09-24 06:09:44+00:00
2        2022-09-24 06:09:43+00:00
3        2022-09-24 06:09:18+00:00
4        2022-09-24 06:09:17+00:00
                   ...            
39508    2022-12-23 22:00:01+00:00
39509    2022-12-23 22:00:00+00:00
39510    2022-12-23 22:00:00+00:00
39511    2022-12-23 22:00:00+00:00
39512    2022-12-23 22:00:00+00:00
Name: datetime, Length: 39513, dtype: object

In [7]:
#only run it if the dataset is from Twitter API
# data['text'] = data[['tweet']]
# data['date'] = data[['created_at']]

#only run if the dataset is tweets_2021_reduced.csv
data['date'] = data['datetime']
for i in range(len(data)):
    data['date'][i] = data['datetime'][i][0:10]

In [8]:
data = data[['text', 'date']]

In [9]:
data

Unnamed: 0,text,date
0,"White House Press Secretary is like, ""It's coo...",2022-09-24
1,Raise capital\nBuy #bitcoin,2022-09-24
2,We are programmed to react on fear. So if you ...,2022-09-24
3,The future is with #bitcoin.,2022-09-24
4,Volatility Continues as Interest Rates Rise #b...,2022-09-24
...,...,...
39508,Bitcoin: $16810.1\n💚 +5.44 last 1 Hour (+0.03%...,2022-12-23
39509,"Now #Bitcoin is worth $16.8 thousand, #Ethereu...",2022-12-23
39510,#WANUSDT #WAN \nSignal #2 \n\nLast Signal: 37...,2022-12-23
39511,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,2022-12-23


In [10]:
data['date'].value_counts()

2022-11-08    450
2022-10-17    450
2022-10-20    450
2022-10-21    450
2022-12-12    450
             ... 
2022-10-02    383
2022-12-03    378
2022-12-18    377
2022-09-24    371
2022-09-25    344
Name: date, Length: 91, dtype: int64

## Step 2 - Cleaning the data

In [11]:
# I have changed this to remove more information

def preprocess(text):
    new_text = []
    text = str(text)
    text = text.replace("\n", " ")
    for t in text.split(" "):
        t = '' if t.startswith('@') and len(t) > 1 else t
        t = '' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [12]:
#Create a dummy data
data['process_text'] = data.text
data['negative_bert'] = data.text
data['neutral_bert'] = data.text
data['positive_bert'] = data.text

#use the preprocess to clean the data
data['process_text'] = data['text'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['process_text'] = data.text


In [13]:
data

Unnamed: 0,text,date,process_text,negative_bert,neutral_bert,positive_bert
0,"White House Press Secretary is like, ""It's coo...",2022-09-24,"White House Press Secretary is like, ""It's coo...","White House Press Secretary is like, ""It's coo...","White House Press Secretary is like, ""It's coo...","White House Press Secretary is like, ""It's coo..."
1,Raise capital\nBuy #bitcoin,2022-09-24,Raise capital Buy #bitcoin,Raise capital\nBuy #bitcoin,Raise capital\nBuy #bitcoin,Raise capital\nBuy #bitcoin
2,We are programmed to react on fear. So if you ...,2022-09-24,We are programmed to react on fear. So if you ...,We are programmed to react on fear. So if you ...,We are programmed to react on fear. So if you ...,We are programmed to react on fear. So if you ...
3,The future is with #bitcoin.,2022-09-24,The future is with #bitcoin.,The future is with #bitcoin.,The future is with #bitcoin.,The future is with #bitcoin.
4,Volatility Continues as Interest Rates Rise #b...,2022-09-24,Volatility Continues as Interest Rates Rise #b...,Volatility Continues as Interest Rates Rise #b...,Volatility Continues as Interest Rates Rise #b...,Volatility Continues as Interest Rates Rise #b...
...,...,...,...,...,...,...
39508,Bitcoin: $16810.1\n💚 +5.44 last 1 Hour (+0.03%...,2022-12-23,Bitcoin: $16810.1 💚 +5.44 last 1 Hour (+0.03%)...,Bitcoin: $16810.1\n💚 +5.44 last 1 Hour (+0.03%...,Bitcoin: $16810.1\n💚 +5.44 last 1 Hour (+0.03%...,Bitcoin: $16810.1\n💚 +5.44 last 1 Hour (+0.03%...
39509,"Now #Bitcoin is worth $16.8 thousand, #Ethereu...",2022-12-23,"Now #Bitcoin is worth $16.8 thousand, #Ethereu...","Now #Bitcoin is worth $16.8 thousand, #Ethereu...","Now #Bitcoin is worth $16.8 thousand, #Ethereu...","Now #Bitcoin is worth $16.8 thousand, #Ethereu..."
39510,#WANUSDT #WAN \nSignal #2 \n\nLast Signal: 37...,2022-12-23,#WANUSDT #WAN Signal #2 Last Signal: 379 m...,#WANUSDT #WAN \nSignal #2 \n\nLast Signal: 37...,#WANUSDT #WAN \nSignal #2 \n\nLast Signal: 37...,#WANUSDT #WAN \nSignal #2 \n\nLast Signal: 37...
39511,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,2022-12-23,#Bitcoin 📊 Recommended Fees Recommended Fees ...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...


## Step 3 - analyse the language and sentiments by pretrained model

In [14]:
!pyenv local crypto

In [15]:
#Model 2b - Use the bert model, with full scores
MODEL_bert = f"ElKulako/cryptobert"
tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_bert)
tokenizer_bert.model_max_length = 512 #solve the error: RuntimeError: The expanded size of the tensor (562) must match the existing size (514) at non-singleton dimension
config_bert = AutoConfig.from_pretrained(MODEL_bert)


# PT
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL_bert)
model_bert.config.max_position_embeddings = 512

In [16]:
def scores_bert(sample_text):
    encoded_input_bert = tokenizer_bert(sample_text, return_tensors='pt')
    output_bert = model_bert(**encoded_input_bert)
    scores_bert = output_bert[0][0].detach().numpy()
    scores_bert = softmax(scores_bert) #1st score is negative, 2nd score is netural, 3rd score is positive
    return scores_bert

In [17]:
data['text'] = data['process_text'].apply(scores_bert)

In [18]:
data

Unnamed: 0,text,date,process_text,negative_bert,neutral_bert,positive_bert
0,"[0.004218805, 0.3782313, 0.61754996]",2022-09-24,"White House Press Secretary is like, ""It's coo...","White House Press Secretary is like, ""It's coo...","White House Press Secretary is like, ""It's coo...","White House Press Secretary is like, ""It's coo..."
1,"[0.0007797184, 0.5238835, 0.47533682]",2022-09-24,Raise capital Buy #bitcoin,Raise capital\nBuy #bitcoin,Raise capital\nBuy #bitcoin,Raise capital\nBuy #bitcoin
2,"[0.00038310103, 0.58267516, 0.41694173]",2022-09-24,We are programmed to react on fear. So if you ...,We are programmed to react on fear. So if you ...,We are programmed to react on fear. So if you ...,We are programmed to react on fear. So if you ...
3,"[0.0009656231, 0.39355516, 0.6054792]",2022-09-24,The future is with #bitcoin.,The future is with #bitcoin.,The future is with #bitcoin.,The future is with #bitcoin.
4,"[0.9251631, 0.07036751, 0.004469425]",2022-09-24,Volatility Continues as Interest Rates Rise #b...,Volatility Continues as Interest Rates Rise #b...,Volatility Continues as Interest Rates Rise #b...,Volatility Continues as Interest Rates Rise #b...
...,...,...,...,...,...,...
39508,"[0.0003781108, 0.7676998, 0.2319221]",2022-12-23,Bitcoin: $16810.1 💚 +5.44 last 1 Hour (+0.03%)...,Bitcoin: $16810.1\n💚 +5.44 last 1 Hour (+0.03%...,Bitcoin: $16810.1\n💚 +5.44 last 1 Hour (+0.03%...,Bitcoin: $16810.1\n💚 +5.44 last 1 Hour (+0.03%...
39509,"[0.0070498134, 0.83938944, 0.15356068]",2022-12-23,"Now #Bitcoin is worth $16.8 thousand, #Ethereu...","Now #Bitcoin is worth $16.8 thousand, #Ethereu...","Now #Bitcoin is worth $16.8 thousand, #Ethereu...","Now #Bitcoin is worth $16.8 thousand, #Ethereu..."
39510,"[0.00036276868, 0.9405311, 0.059106126]",2022-12-23,#WANUSDT #WAN Signal #2 Last Signal: 379 m...,#WANUSDT #WAN \nSignal #2 \n\nLast Signal: 37...,#WANUSDT #WAN \nSignal #2 \n\nLast Signal: 37...,#WANUSDT #WAN \nSignal #2 \n\nLast Signal: 37...
39511,"[0.029729657, 0.71850157, 0.25176874]",2022-12-23,#Bitcoin 📊 Recommended Fees Recommended Fees ...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...


In [19]:
for i in range(len(data)):    
    data['negative_bert'][i] = data['text'][i][0]
    data['neutral_bert'][i] = data['text'][i][1]
    data['positive_bert'][i] = data['text'][i][2]

In [20]:
data = data[['date', 'process_text', 'negative_bert', 'neutral_bert','positive_bert']]

In [21]:
data

Unnamed: 0,date,process_text,negative_bert,neutral_bert,positive_bert
0,2022-09-24,"White House Press Secretary is like, ""It's coo...",0.004219,0.378231,0.61755
1,2022-09-24,Raise capital Buy #bitcoin,0.00078,0.523884,0.475337
2,2022-09-24,We are programmed to react on fear. So if you ...,0.000383,0.582675,0.416942
3,2022-09-24,The future is with #bitcoin.,0.000966,0.393555,0.605479
4,2022-09-24,Volatility Continues as Interest Rates Rise #b...,0.925163,0.070368,0.004469
...,...,...,...,...,...
39508,2022-12-23,Bitcoin: $16810.1 💚 +5.44 last 1 Hour (+0.03%)...,0.000378,0.7677,0.231922
39509,2022-12-23,"Now #Bitcoin is worth $16.8 thousand, #Ethereu...",0.00705,0.839389,0.153561
39510,2022-12-23,#WANUSDT #WAN Signal #2 Last Signal: 379 m...,0.000363,0.940531,0.059106
39511,2022-12-23,#Bitcoin 📊 Recommended Fees Recommended Fees ...,0.02973,0.718502,0.251769


## Step 4: Count the total number of labels/scores (positive, negative vs neutral) by date

In [22]:
grouped_data = pd.DataFrame(data.groupby(['date'])[['negative_bert', 'neutral_bert', 'positive_bert']].sum().fillna(0).reset_index())
grouped_data = grouped_data.rename_axis("", axis="columns")

In [23]:
grouped_data

Unnamed: 0,date,negative_bert,neutral_bert,positive_bert
0,2022-09-24,34.404747,206.772110,129.823090
1,2022-09-25,35.125938,203.181046,105.693001
2,2022-09-26,29.715998,232.964111,159.319733
3,2022-09-27,43.433617,239.275162,143.291321
4,2022-09-28,31.568678,254.573959,155.857361
...,...,...,...,...
86,2022-12-19,46.421448,258.037628,142.540878
87,2022-12-20,35.761200,234.286316,154.952499
88,2022-12-21,36.978165,228.291870,152.730026
89,2022-12-22,48.039040,229.943283,154.017715


## Step 5 - Download the data

In [24]:
start_date = grouped_data['date'][0]
# start_date_str = datetime.datetime.strftime(start_date, "%Y-%m-%d")
start_date

'2022-09-24'

In [25]:
end_date = grouped_data['date'][len(grouped_data)-1]
# end_date_str = datetime.datetime.strftime(end_date, "%Y-%m-%d")
end_date

'2022-12-23'

In [26]:
file_name = f"{start_date}_{end_date}_twitter_comments.csv"
data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")

In [27]:
file_name = f"{start_date}_{end_date}_twitter_transferlearning.csv"
grouped_data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")