Transfer learning using cryptobert and roberta

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig, TextClassificationPipeline
from transformers import pipeline
import numpy as np
import pandas as pd
from scipy.special import softmax
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip install torch



In [3]:
!pip install protobuf==3.20.0



## Step 1 - Importing a sample of some Bitcoin Tweet Data to begin analysing the model

In [4]:
# 20230308 - test data from twitter api, from 20220624 to 20220812 
data = pd.read_csv('~/Code/giadapi/crypto/data/raw/tweet_24-6-22_to_23-9-22.csv')

# 20230307 - test data from kaggle
# data = pd.read_csv('~/code/giadapi/crypto/data/raw/bitcoin_tweets1000000.csv', nrows = 1000)
# "\\wsl.localhost\Ubuntu\home\peter\code\giadapi\crypto\kaggle-tweets.zip"

In [5]:
data

Unnamed: 0,datetime,username,text
0,2022-06-24 06:09:58+00:00,WaranInvestment,I'm looking for these levels today 👀 \n\n#Bitc...
1,2022-06-24 06:09:46+00:00,titterboy2,I am claiming my free Lightning sats from @_bi...
2,2022-06-24 06:09:34+00:00,Adityaroypspk,#Saga mobile phone is out now with #DApp store...
3,2022-06-24 06:09:21+00:00,LivePriceCrypto,#Bitcoin Last Price $20924 #BTC 🚀\nDaily Indic...
4,2022-06-24 06:09:09+00:00,SkepticBitcoin,@BitcoinNewslet1 Particularly given the libert...
...,...,...,...
39706,2022-09-23 22:00:02+00:00,topcryptostats,"Bitcoin - BTC\nPrice: $18,993.64\nChange in 1h..."
39707,2022-09-23 22:00:01+00:00,ndeficompany,Do you know how much one millibitcoin is worth...
39708,2022-09-23 22:00:01+00:00,drisk_io,How To Profit From #FUD In Crypto!\n\n#crypto ...
39709,2022-09-23 22:00:00+00:00,murray_rothbot,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...


In [6]:
data['datetime']

0        2022-06-24 06:09:58+00:00
1        2022-06-24 06:09:46+00:00
2        2022-06-24 06:09:34+00:00
3        2022-06-24 06:09:21+00:00
4        2022-06-24 06:09:09+00:00
                   ...            
39706    2022-09-23 22:00:02+00:00
39707    2022-09-23 22:00:01+00:00
39708    2022-09-23 22:00:01+00:00
39709    2022-09-23 22:00:00+00:00
39710    2022-09-23 22:00:00+00:00
Name: datetime, Length: 39711, dtype: object

In [7]:
#only run it if the dataset is from Twitter API
# data['text'] = data[['tweet']]
# data['date'] = data[['created_at']]

#only run if the dataset is tweets_2021_reduced.csv
data['date'] = data['datetime']
for i in range(len(data)):
    data['date'][i] = data['datetime'][i][0:10]

In [8]:
data = data[['text', 'date']]

In [9]:
data

Unnamed: 0,text,date
0,I'm looking for these levels today 👀 \n\n#Bitc...,2022-06-24
1,I am claiming my free Lightning sats from @_bi...,2022-06-24
2,#Saga mobile phone is out now with #DApp store...,2022-06-24
3,#Bitcoin Last Price $20924 #BTC 🚀\nDaily Indic...,2022-06-24
4,@BitcoinNewslet1 Particularly given the libert...,2022-06-24
...,...,...
39706,"Bitcoin - BTC\nPrice: $18,993.64\nChange in 1h...",2022-09-23
39707,Do you know how much one millibitcoin is worth...,2022-09-23
39708,How To Profit From #FUD In Crypto!\n\n#crypto ...,2022-09-23
39709,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,2022-09-23


## Step 2 - Cleaning the data

In [10]:
# I have changed this to remove more information

def preprocess(text):
    new_text = []
    text = str(text)
    text = text.replace("\n", " ")
    for t in text.split(" "):
        t = '' if t.startswith('@') and len(t) > 1 else t
        t = '' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [11]:
#Create a dummy data
data['process_text'] = data.text
data['negative_bert'] = data.text
data['neutral_bert'] = data.text
data['positive_bert'] = data.text

#use the preprocess_2 to clean the data
data['process_text'] = data['text'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['process_text'] = data.text


In [12]:
data

Unnamed: 0,text,date,process_text,negative_bert,neutral_bert,positive_bert
0,I'm looking for these levels today 👀 \n\n#Bitc...,2022-06-24,I'm looking for these levels today 👀 #Bitcoi...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...
1,I am claiming my free Lightning sats from @_bi...,2022-06-24,I am claiming my free Lightning sats from zes...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...
2,#Saga mobile phone is out now with #DApp store...,2022-06-24,#Saga mobile phone is out now with #DApp store...,#Saga mobile phone is out now with #DApp store...,#Saga mobile phone is out now with #DApp store...,#Saga mobile phone is out now with #DApp store...
3,#Bitcoin Last Price $20924 #BTC 🚀\nDaily Indic...,2022-06-24,#Bitcoin Last Price $20924 #BTC 🚀 Daily Indica...,#Bitcoin Last Price $20924 #BTC 🚀\nDaily Indic...,#Bitcoin Last Price $20924 #BTC 🚀\nDaily Indic...,#Bitcoin Last Price $20924 #BTC 🚀\nDaily Indic...
4,@BitcoinNewslet1 Particularly given the libert...,2022-06-24,Particularly given the libertarian underpinni...,@BitcoinNewslet1 Particularly given the libert...,@BitcoinNewslet1 Particularly given the libert...,@BitcoinNewslet1 Particularly given the libert...
...,...,...,...,...,...,...
39706,"Bitcoin - BTC\nPrice: $18,993.64\nChange in 1h...",2022-09-23,"Bitcoin - BTC Price: $18,993.64 Change in 1h: ...","Bitcoin - BTC\nPrice: $18,993.64\nChange in 1h...","Bitcoin - BTC\nPrice: $18,993.64\nChange in 1h...","Bitcoin - BTC\nPrice: $18,993.64\nChange in 1h..."
39707,Do you know how much one millibitcoin is worth...,2022-09-23,Do you know how much one millibitcoin is worth...,Do you know how much one millibitcoin is worth...,Do you know how much one millibitcoin is worth...,Do you know how much one millibitcoin is worth...
39708,How To Profit From #FUD In Crypto!\n\n#crypto ...,2022-09-23,How To Profit From #FUD In Crypto! #crypto #b...,How To Profit From #FUD In Crypto!\n\n#crypto ...,How To Profit From #FUD In Crypto!\n\n#crypto ...,How To Profit From #FUD In Crypto!\n\n#crypto ...
39709,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,2022-09-23,#Bitcoin 📊 Recommended Fees Recommended Fees ...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...


## Step 3 - analyse the language and sentiments by pretrained model

In [13]:
!pyenv local crypto

In [15]:
#Model 2b - Use the bert model, with full scores
MODEL_bert = f"ElKulako/cryptobert"
tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_bert)
tokenizer_bert.model_max_length = 512 #solve the error: RuntimeError: The expanded size of the tensor (562) must match the existing size (514) at non-singleton dimension
config_bert = AutoConfig.from_pretrained(MODEL_bert)


# PT
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL_bert)
model_bert.config.max_position_embeddings = 512

In [16]:
def scores_bert(sample_text):
    encoded_input_bert = tokenizer_bert(sample_text, return_tensors='pt')
    output_bert = model_bert(**encoded_input_bert)
    scores_bert = output_bert[0][0].detach().numpy()
    scores_bert = softmax(scores_bert) #1st score is negative, 2nd score is netural, 3rd score is positive
    return scores_bert

In [17]:
data['text'] = data['process_text'].apply(scores_bert)

In [18]:
data

Unnamed: 0,text,date,process_text,negative_bert,neutral_bert,positive_bert
0,"[0.00023513255, 0.67456746, 0.32519737]",2022-06-24,I'm looking for these levels today 👀 #Bitcoi...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...
1,"[0.0046280683, 0.72594976, 0.26942214]",2022-06-24,I am claiming my free Lightning sats from zes...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...
2,"[9.394332e-05, 0.31564373, 0.68426234]",2022-06-24,#Saga mobile phone is out now with #DApp store...,#Saga mobile phone is out now with #DApp store...,#Saga mobile phone is out now with #DApp store...,#Saga mobile phone is out now with #DApp store...
3,"[0.013022703, 0.9428376, 0.04413965]",2022-06-24,#Bitcoin Last Price $20924 #BTC 🚀 Daily Indica...,#Bitcoin Last Price $20924 #BTC 🚀\nDaily Indic...,#Bitcoin Last Price $20924 #BTC 🚀\nDaily Indic...,#Bitcoin Last Price $20924 #BTC 🚀\nDaily Indic...
4,"[0.005179823, 0.6816461, 0.31317407]",2022-06-24,Particularly given the libertarian underpinni...,@BitcoinNewslet1 Particularly given the libert...,@BitcoinNewslet1 Particularly given the libert...,@BitcoinNewslet1 Particularly given the libert...
...,...,...,...,...,...,...
39706,"[0.0030673994, 0.80368894, 0.19324368]",2022-09-23,"Bitcoin - BTC Price: $18,993.64 Change in 1h: ...","Bitcoin - BTC\nPrice: $18,993.64\nChange in 1h...","Bitcoin - BTC\nPrice: $18,993.64\nChange in 1h...","Bitcoin - BTC\nPrice: $18,993.64\nChange in 1h..."
39707,"[0.00010119539, 0.6120971, 0.38780177]",2022-09-23,Do you know how much one millibitcoin is worth...,Do you know how much one millibitcoin is worth...,Do you know how much one millibitcoin is worth...,Do you know how much one millibitcoin is worth...
39708,"[0.0013919583, 0.72930276, 0.2693052]",2022-09-23,How To Profit From #FUD In Crypto! #crypto #b...,How To Profit From #FUD In Crypto!\n\n#crypto ...,How To Profit From #FUD In Crypto!\n\n#crypto ...,How To Profit From #FUD In Crypto!\n\n#crypto ...
39709,"[0.019517228, 0.75018203, 0.2303007]",2022-09-23,#Bitcoin 📊 Recommended Fees Recommended Fees ...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...,#Bitcoin 📊 Recommended Fees\n\nRecommended Fee...


In [19]:
for i in range(len(data)):    
    data['negative_bert'][i] = data['text'][i][0]
    data['neutral_bert'][i] = data['text'][i][1]
    data['positive_bert'][i] = data['text'][i][2]

In [20]:
data = data[['date', 'process_text', 'negative_bert', 'neutral_bert','positive_bert']]

In [21]:
data

Unnamed: 0,date,process_text,negative_bert,neutral_bert,positive_bert
0,2022-06-24,I'm looking for these levels today 👀 #Bitcoi...,0.000235,0.674567,0.325197
1,2022-06-24,I am claiming my free Lightning sats from zes...,0.004628,0.72595,0.269422
2,2022-06-24,#Saga mobile phone is out now with #DApp store...,0.000094,0.315644,0.684262
3,2022-06-24,#Bitcoin Last Price $20924 #BTC 🚀 Daily Indica...,0.013023,0.942838,0.04414
4,2022-06-24,Particularly given the libertarian underpinni...,0.00518,0.681646,0.313174
...,...,...,...,...,...
39706,2022-09-23,"Bitcoin - BTC Price: $18,993.64 Change in 1h: ...",0.003067,0.803689,0.193244
39707,2022-09-23,Do you know how much one millibitcoin is worth...,0.000101,0.612097,0.387802
39708,2022-09-23,How To Profit From #FUD In Crypto! #crypto #b...,0.001392,0.729303,0.269305
39709,2022-09-23,#Bitcoin 📊 Recommended Fees Recommended Fees ...,0.019517,0.750182,0.230301


## Step 4: Count the total number of labels/scores (positive, negative vs neutral) by date

In [22]:
grouped_data = pd.DataFrame(data.groupby(['date'])[['negative_bert', 'neutral_bert', 'positive_bert']].sum().fillna(0).reset_index())
grouped_data = grouped_data.rename_axis("", axis="columns")

In [23]:
grouped_data

Unnamed: 0,date,negative_bert,neutral_bert,positive_bert
0,2022-06-24,38.890251,250.165253,143.944550
1,2022-06-25,37.844418,218.009384,143.146118
2,2022-06-26,40.502777,232.348297,119.148918
3,2022-06-27,53.988686,250.235413,142.775772
4,2022-06-28,33.441006,257.452911,144.106033
...,...,...,...,...
87,2022-09-19,46.799217,223.474930,147.725769
88,2022-09-20,39.988178,246.227585,150.784210
89,2022-09-21,38.674816,254.772293,145.553055
90,2022-09-22,33.622601,222.790024,136.587341


## Step 5 - Download the data

In [25]:
start_date = grouped_data['date'][0]
# start_date_str = datetime.datetime.strftime(start_date, "%Y-%m-%d")
start_date

'2022-06-24'

In [26]:
end_date = grouped_data['date'][len(grouped_data)-1]
# end_date_str = datetime.datetime.strftime(end_date, "%Y-%m-%d")
end_date

'2022-09-23'

In [27]:
file_name = f"{start_date}_{end_date}_twitter_comments.csv"
data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")

In [28]:
file_name = f"{start_date}_{end_date}_twitter_transferlearning.csv"
grouped_data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")