Transfer learning using cryptobert and roberta

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig, TextClassificationPipeline
from transformers import pipeline
import numpy as np
import pandas as pd
from scipy.special import softmax
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip install torch



In [3]:
!pip install protobuf==3.20.0



## Step 1 - Importing a sample of some Bitcoin Tweet Data to begin analysing the model

In [42]:
# 20230309 - test data from twitter api, from 20221224 to 20230308 
data = pd.read_csv('~/Code/giadapi/crypto/09-03-23_to_15-03-23.csv', lineterminator='\n')

In [43]:
data = data.rename(columns={'text\r':'text'})

In [44]:
data['datetime']

0       2023-03-09 06:09:06+00:00
1       2023-03-09 06:08:45+00:00
2       2023-03-09 06:08:33+00:00
3       2023-03-09 06:08:27+00:00
4       2023-03-09 06:08:27+00:00
                  ...            
2514    2023-03-14 22:00:33+00:00
2515    2023-03-14 22:00:30+00:00
2516    2023-03-14 22:00:29+00:00
2517    2023-03-14 22:00:29+00:00
2518    2023-03-14 22:00:25+00:00
Name: datetime, Length: 2519, dtype: object

In [45]:
data['datetime'] = pd.to_datetime(data['datetime'])
data['date'] = data['datetime'].dt.date

In [46]:
data

Unnamed: 0,datetime,username,text,date
0,2023-03-09 06:09:06+00:00,investor_shyla1,I believe there is an 80% chance #Bitcoin hit...,2023-03-09
1,2023-03-09 06:08:45+00:00,AHzCrypto,"I was Max bullish on #Bitcoin at the bottom, ...",2023-03-09
2,2023-03-09 06:08:33+00:00,weeblueghost,"- \nBTC price: $21,722 / £18,151 \n\n46.03 Nak...",2023-03-09
3,2023-03-09 06:08:27+00:00,Available4Sale,📢\n\nPremium domain name that can be used to o...,2023-03-09
4,2023-03-09 06:08:27+00:00,Bitcoin8News,"The $BTC price is at $21,732.23 right now.\n🔴 ...",2023-03-09
...,...,...,...,...
2514,2023-03-14 22:00:33+00:00,ashleygrilo12,They are about to turn the money printers on a...,2023-03-14
2515,2023-03-14 22:00:30+00:00,BtcPulse,BTC hourly update\n$24658.30 | -0.54%📉\n...,2023-03-14
2516,2023-03-14 22:00:29+00:00,Bcubeai,#Bitcoin rallied 10% in one day. The next ones...,2023-03-14
2517,2023-03-14 22:00:29+00:00,EvertryLtd,Bitcoin price alert 2023-03-14 22:00:28 GMT\nB...,2023-03-14


## Step 2 - Cleaning the data

In [47]:
# I have changed this to remove more information

def preprocess(text):
    new_text = []
    text = str(text)
    text = text.replace("\n", " ")
    for t in text.split(" "):
        t = '' if t.startswith('@') and len(t) > 1 else t
        t = '' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [48]:
#Create a dummy data
data['process_text'] = data.text
data['negative_bert'] = data.text
data['neutral_bert'] = data.text
data['positive_bert'] = data.text

#use the preprocess_2 to clean the data
data['process_text'] = data['text'].apply(preprocess)

In [49]:
data

Unnamed: 0,datetime,username,text,date,process_text,negative_bert,neutral_bert,positive_bert
0,2023-03-09 06:09:06+00:00,investor_shyla1,I believe there is an 80% chance #Bitcoin hit...,2023-03-09,I believe there is an 80% chance #Bitcoin hit...,I believe there is an 80% chance #Bitcoin hit...,I believe there is an 80% chance #Bitcoin hit...,I believe there is an 80% chance #Bitcoin hit...
1,2023-03-09 06:08:45+00:00,AHzCrypto,"I was Max bullish on #Bitcoin at the bottom, ...",2023-03-09,"I was Max bullish on #Bitcoin at the bottom, ...","I was Max bullish on #Bitcoin at the bottom, ...","I was Max bullish on #Bitcoin at the bottom, ...","I was Max bullish on #Bitcoin at the bottom, ..."
2,2023-03-09 06:08:33+00:00,weeblueghost,"- \nBTC price: $21,722 / £18,151 \n\n46.03 Nak...",2023-03-09,"- BTC price: $21,722 / £18,151 46.03 Naks p...","- \nBTC price: $21,722 / £18,151 \n\n46.03 Nak...","- \nBTC price: $21,722 / £18,151 \n\n46.03 Nak...","- \nBTC price: $21,722 / £18,151 \n\n46.03 Nak..."
3,2023-03-09 06:08:27+00:00,Available4Sale,📢\n\nPremium domain name that can be used to o...,2023-03-09,📢 Premium domain name that can be used to off...,📢\n\nPremium domain name that can be used to o...,📢\n\nPremium domain name that can be used to o...,📢\n\nPremium domain name that can be used to o...
4,2023-03-09 06:08:27+00:00,Bitcoin8News,"The $BTC price is at $21,732.23 right now.\n🔴 ...",2023-03-09,"The $BTC price is at $21,732.23 right now. 🔴 C...","The $BTC price is at $21,732.23 right now.\n🔴 ...","The $BTC price is at $21,732.23 right now.\n🔴 ...","The $BTC price is at $21,732.23 right now.\n🔴 ..."
...,...,...,...,...,...,...,...,...
2514,2023-03-14 22:00:33+00:00,ashleygrilo12,They are about to turn the money printers on a...,2023-03-14,They are about to turn the money printers on a...,They are about to turn the money printers on a...,They are about to turn the money printers on a...,They are about to turn the money printers on a...
2515,2023-03-14 22:00:30+00:00,BtcPulse,BTC hourly update\n$24658.30 | -0.54%📉\n...,2023-03-14,BTC hourly update $24658.30 | -0.54%📉 $B...,BTC hourly update\n$24658.30 | -0.54%📉\n...,BTC hourly update\n$24658.30 | -0.54%📉\n...,BTC hourly update\n$24658.30 | -0.54%📉\n...
2516,2023-03-14 22:00:29+00:00,Bcubeai,#Bitcoin rallied 10% in one day. The next ones...,2023-03-14,#Bitcoin rallied 10% in one day. The next ones...,#Bitcoin rallied 10% in one day. The next ones...,#Bitcoin rallied 10% in one day. The next ones...,#Bitcoin rallied 10% in one day. The next ones...
2517,2023-03-14 22:00:29+00:00,EvertryLtd,Bitcoin price alert 2023-03-14 22:00:28 GMT\nB...,2023-03-14,Bitcoin price alert 2023-03-14 22:00:28 GMT BT...,Bitcoin price alert 2023-03-14 22:00:28 GMT\nB...,Bitcoin price alert 2023-03-14 22:00:28 GMT\nB...,Bitcoin price alert 2023-03-14 22:00:28 GMT\nB...


## Step 3 - analyse the language and sentiments by pretrained model

In [50]:
!pyenv local crypto

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [51]:
#Model 2b - Use the bert model, with full scores
MODEL_bert = f"ElKulako/cryptobert"
tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_bert)
tokenizer_bert.model_max_length = 512 #solve the error: RuntimeError: The expanded size of the tensor (562) must match the existing size (514) at non-singleton dimension
config_bert = AutoConfig.from_pretrained(MODEL_bert)


# PT
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL_bert)
model_bert.config.max_position_embeddings = 512

In [52]:
def scores_bert(sample_text):
    encoded_input_bert = tokenizer_bert(sample_text, return_tensors='pt')
    output_bert = model_bert(**encoded_input_bert)
    scores_bert = output_bert[0][0].detach().numpy()
    scores_bert = softmax(scores_bert) #1st score is negative, 2nd score is netural, 3rd score is positive
    return scores_bert

In [53]:
data['text'] = data['process_text'].apply(scores_bert)

In [54]:
data

Unnamed: 0,datetime,username,text,date,process_text,negative_bert,neutral_bert,positive_bert
0,2023-03-09 06:09:06+00:00,investor_shyla1,"[0.034752283, 0.54298496, 0.4222628]",2023-03-09,I believe there is an 80% chance #Bitcoin hit...,I believe there is an 80% chance #Bitcoin hit...,I believe there is an 80% chance #Bitcoin hit...,I believe there is an 80% chance #Bitcoin hit...
1,2023-03-09 06:08:45+00:00,AHzCrypto,"[0.005958033, 0.66700155, 0.32704043]",2023-03-09,"I was Max bullish on #Bitcoin at the bottom, ...","I was Max bullish on #Bitcoin at the bottom, ...","I was Max bullish on #Bitcoin at the bottom, ...","I was Max bullish on #Bitcoin at the bottom, ..."
2,2023-03-09 06:08:33+00:00,weeblueghost,"[0.012368703, 0.8358021, 0.15182915]",2023-03-09,"- BTC price: $21,722 / £18,151 46.03 Naks p...","- \nBTC price: $21,722 / £18,151 \n\n46.03 Nak...","- \nBTC price: $21,722 / £18,151 \n\n46.03 Nak...","- \nBTC price: $21,722 / £18,151 \n\n46.03 Nak..."
3,2023-03-09 06:08:27+00:00,Available4Sale,"[0.00035810127, 0.7142423, 0.2853996]",2023-03-09,📢 Premium domain name that can be used to off...,📢\n\nPremium domain name that can be used to o...,📢\n\nPremium domain name that can be used to o...,📢\n\nPremium domain name that can be used to o...
4,2023-03-09 06:08:27+00:00,Bitcoin8News,"[0.7245476, 0.19003384, 0.08541857]",2023-03-09,"The $BTC price is at $21,732.23 right now. 🔴 C...","The $BTC price is at $21,732.23 right now.\n🔴 ...","The $BTC price is at $21,732.23 right now.\n🔴 ...","The $BTC price is at $21,732.23 right now.\n🔴 ..."
...,...,...,...,...,...,...,...,...
2514,2023-03-14 22:00:33+00:00,ashleygrilo12,"[0.0005702601, 0.3126462, 0.6867835]",2023-03-14,They are about to turn the money printers on a...,They are about to turn the money printers on a...,They are about to turn the money printers on a...,They are about to turn the money printers on a...
2515,2023-03-14 22:00:30+00:00,BtcPulse,"[0.36892492, 0.52352697, 0.1075481]",2023-03-14,BTC hourly update $24658.30 | -0.54%📉 $B...,BTC hourly update\n$24658.30 | -0.54%📉\n...,BTC hourly update\n$24658.30 | -0.54%📉\n...,BTC hourly update\n$24658.30 | -0.54%📉\n...
2516,2023-03-14 22:00:29+00:00,Bcubeai,"[0.00038301575, 0.45493403, 0.5446829]",2023-03-14,#Bitcoin rallied 10% in one day. The next ones...,#Bitcoin rallied 10% in one day. The next ones...,#Bitcoin rallied 10% in one day. The next ones...,#Bitcoin rallied 10% in one day. The next ones...
2517,2023-03-14 22:00:29+00:00,EvertryLtd,"[0.005006583, 0.31588405, 0.67910933]",2023-03-14,Bitcoin price alert 2023-03-14 22:00:28 GMT BT...,Bitcoin price alert 2023-03-14 22:00:28 GMT\nB...,Bitcoin price alert 2023-03-14 22:00:28 GMT\nB...,Bitcoin price alert 2023-03-14 22:00:28 GMT\nB...


In [55]:
for i in range(len(data)):    
    data['negative_bert'][i] = data['text'][i][0]
    data['neutral_bert'][i] = data['text'][i][1]
    data['positive_bert'][i] = data['text'][i][2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['negative_bert'][i] = data['text'][i][0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['neutral_bert'][i] = data['text'][i][1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['positive_bert'][i] = data['text'][i][2]


In [57]:
new_df = data.copy()

In [59]:
new_df.drop("text",axis=1,inplace=True)
new_df

Unnamed: 0,datetime,username,date,process_text,negative_bert,neutral_bert,positive_bert
0,2023-03-09 06:09:06+00:00,investor_shyla1,2023-03-09,I believe there is an 80% chance #Bitcoin hit...,0.034752,0.542985,0.422263
1,2023-03-09 06:08:45+00:00,AHzCrypto,2023-03-09,"I was Max bullish on #Bitcoin at the bottom, ...",0.005958,0.667002,0.32704
2,2023-03-09 06:08:33+00:00,weeblueghost,2023-03-09,"- BTC price: $21,722 / £18,151 46.03 Naks p...",0.012369,0.835802,0.151829
3,2023-03-09 06:08:27+00:00,Available4Sale,2023-03-09,📢 Premium domain name that can be used to off...,0.000358,0.714242,0.2854
4,2023-03-09 06:08:27+00:00,Bitcoin8News,2023-03-09,"The $BTC price is at $21,732.23 right now. 🔴 C...",0.724548,0.190034,0.085419
...,...,...,...,...,...,...,...
2514,2023-03-14 22:00:33+00:00,ashleygrilo12,2023-03-14,They are about to turn the money printers on a...,0.00057,0.312646,0.686783
2515,2023-03-14 22:00:30+00:00,BtcPulse,2023-03-14,BTC hourly update $24658.30 | -0.54%📉 $B...,0.368925,0.523527,0.107548
2516,2023-03-14 22:00:29+00:00,Bcubeai,2023-03-14,#Bitcoin rallied 10% in one day. The next ones...,0.000383,0.454934,0.544683
2517,2023-03-14 22:00:29+00:00,EvertryLtd,2023-03-14,Bitcoin price alert 2023-03-14 22:00:28 GMT BT...,0.005007,0.315884,0.679109


In [60]:
new_df = new_df.rename(columns={'process_text':'text'})
new_df

Unnamed: 0,datetime,username,date,text,negative_bert,neutral_bert,positive_bert
0,2023-03-09 06:09:06+00:00,investor_shyla1,2023-03-09,I believe there is an 80% chance #Bitcoin hit...,0.034752,0.542985,0.422263
1,2023-03-09 06:08:45+00:00,AHzCrypto,2023-03-09,"I was Max bullish on #Bitcoin at the bottom, ...",0.005958,0.667002,0.32704
2,2023-03-09 06:08:33+00:00,weeblueghost,2023-03-09,"- BTC price: $21,722 / £18,151 46.03 Naks p...",0.012369,0.835802,0.151829
3,2023-03-09 06:08:27+00:00,Available4Sale,2023-03-09,📢 Premium domain name that can be used to off...,0.000358,0.714242,0.2854
4,2023-03-09 06:08:27+00:00,Bitcoin8News,2023-03-09,"The $BTC price is at $21,732.23 right now. 🔴 C...",0.724548,0.190034,0.085419
...,...,...,...,...,...,...,...
2514,2023-03-14 22:00:33+00:00,ashleygrilo12,2023-03-14,They are about to turn the money printers on a...,0.00057,0.312646,0.686783
2515,2023-03-14 22:00:30+00:00,BtcPulse,2023-03-14,BTC hourly update $24658.30 | -0.54%📉 $B...,0.368925,0.523527,0.107548
2516,2023-03-14 22:00:29+00:00,Bcubeai,2023-03-14,#Bitcoin rallied 10% in one day. The next ones...,0.000383,0.454934,0.544683
2517,2023-03-14 22:00:29+00:00,EvertryLtd,2023-03-14,Bitcoin price alert 2023-03-14 22:00:28 GMT BT...,0.005007,0.315884,0.679109


In [62]:
def remove_spam(data):
    dup_tweets = data[data['text'].duplicated()]
    blacklist = []
    for name in dup_tweets['username']:
        blacklist.append(name)
    blacklist = list(set(blacklist))
    def to_remove(x):
        if x in blacklist:
            return True
        else:
            return False
    data['to_remove'] = data['username'].apply(to_remove)
    mask = data['to_remove']
    data2 = data[~mask]
    return data2

In [67]:
final_df = remove_spam(new_df)
final_df
final_df[["datetime", "username", "text", "date", "negative_bert", "neutral_bert", "positive_bert"]].to_csv("last_")

In [61]:
file_name = f"{'2023-03-09'}_{'2023-03-14'}_twitter_cleaned.csv"
new_df.to_csv(f"~/code/giadapi/crypto/{file_name}")

## Step 4: Count the total number of labels/scores (positive, negative vs neutral) by date

In [36]:
grouped_data = pd.DataFrame(data.groupby(['date'])[['negative_bert', 'neutral_bert', 'positive_bert']].sum().fillna(0).reset_index())
grouped_data = grouped_data.rename_axis("", axis="columns")

In [37]:
grouped_data

Unnamed: 0,date,negative_bert,neutral_bert,positive_bert
0,2023-03-09,47.956829,197.499222,131.544037
1,2023-03-10,54.680729,223.092133,171.227005
2,2023-03-11,51.055069,205.959274,169.985703
3,2023-03-12,56.119164,184.109558,148.771332
4,2023-03-13,56.506569,215.113846,165.379562
5,2023-03-14,44.567947,221.880035,173.552078


## Step 5 - Download the data

In [27]:
start_date = grouped_data['date'][0]
# start_date_str = datetime.datetime.strftime(start_date, "%Y-%m-%d")
start_date

'2022-12-24'

In [28]:
end_date = grouped_data['date'][len(grouped_data)-1]
# end_date_str = datetime.datetime.strftime(end_date, "%Y-%m-%d")
end_date

'2023-03-09'

In [29]:
file_name = f"{start_date}_{end_date}_twitter_comments.csv"
data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")

In [30]:
file_name = f"{start_date}_{end_date}_twitter_transferlearning.csv"
grouped_data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")