Transfer learning using cryptobert and roberta

In [53]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig, TextClassificationPipeline
from transformers import pipeline
import numpy as np
import pandas as pd
from scipy.special import softmax
import datetime

In [2]:
!pip install torch



In [3]:
!pip install protobuf==3.20.0



## Step 1 - Importing a sample of some Bitcoin Tweet Data to begin analysing the model

In [4]:
# 20230308 - test data from twitter api, from 20220624 to 20220812 
data = pd.read_csv('~/Code/giadapi/crypto/data/tweets_20220624_20220812.csv')

# 20230307 - test data from kaggle
# data = pd.read_csv('~/code/giadapi/crypto/data/raw/bitcoin_tweets1000000.csv', nrows = 1000)
# "\\wsl.localhost\Ubuntu\home\peter\code\giadapi\crypto\kaggle-tweets.zip"

In [5]:
data

Unnamed: 0,author id,created_at,geo,id,lang,like_count,quote_count,reply_count,retweet_count,tweet
0,1538620097975267328,2022-06-24 06:09:58+00:00,,1540215638651797504,en,2,0,1,0,I'm looking for these levels today 👀 \n\n#Bitc...
1,3017689166,2022-06-24 06:09:55+00:00,,1540215626815647746,en,0,1,0,0,create twitter tasks and pay with #bitcoin #et...
2,1439292574645334020,2022-06-24 06:09:52+00:00,,1540215613754589184,en,1,0,0,0,Top 10 Cyptocurrencies in the World by Twitter...
3,1155038365881782272,2022-06-24 06:09:46+00:00,,1540215588638920704,en,0,0,0,0,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...
4,1098733905849417728,2022-06-24 06:09:46+00:00,,1540215586998722561,en,0,0,0,0,I am claiming my free Lightning sats from @_bi...
...,...,...,...,...,...,...,...,...,...,...
4900,1448763322812276737,2022-08-12 14:08:18+00:00,,1558093020536442882,en,0,0,0,0,👋 A new block was found on the #Bitcoin networ...
4901,77002234,2022-08-12 14:08:15+00:00,,1558093008112943107,en,0,0,1,0,Byzantine Generals are back on campus. #byzant...
4902,1514825067800449025,2022-08-12 14:08:05+00:00,,1558092966727921664,en,0,0,0,0,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...
4903,1082726807919415296,2022-08-12 14:08:02+00:00,,1558092951246741507,en,0,0,1,0,Trade-Ideas Trading Room is opening now - join...


In [6]:
#only run it if the dataset is from Twitter API
data['text'] = data[['tweet']]
data['date'] = data[['created_at']]

In [7]:
data = data[['text', 'date']]

In [8]:
data['text'][0]

"I'm looking for these levels today 👀 \n\n#Bitcoin #Ethereum #Solana #MATIC #Cryptocurency #ETH #BTC #TradingView https://t.co/L1SMaXRQiY"

## Step 2 - Cleaning the data

In [9]:
# Preprocess text (username and link placeholders) - this is suggested from the example in 
# https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment
def preprocess_1(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

preprocess_1(data['text'][0])

"I'm looking for these levels today 👀 \n\n#Bitcoin #Ethereum #Solana #MATIC #Cryptocurency #ETH #BTC #TradingView http"

In [10]:
# I have changed this to remove more information

def preprocess_2(text):
    new_text = []
    for t in text.split(" "):
        t = '' if t.startswith('@') and len(t) > 1 else t
        t = '' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

sample_text = preprocess_2(data['text'][0])
sample_text

"I'm looking for these levels today 👀 \n\n#Bitcoin #Ethereum #Solana #MATIC #Cryptocurency #ETH #BTC #TradingView "

In [11]:
#use the preprocess_2 to clean the data

data['process_text'] = data.text
data['label_xlm'] = data.text
# data['score_xlm'] = data.text
data['negative_xlm'] = data.text
data['neutral_xlm'] = data.text
data['positive_xlm'] = data.text

data['label_bert'] = data.text
# data['score_bert'] = data.text
data['negative_bert'] = data.text
data['neutral_bert'] = data.text
data['positive_bert'] = data.text
for i in range(len(data)):
    data['process_text'][i] = preprocess_2(data['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['process_text'] = data.text
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label_xlm'] = data.text
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['negative_xlm'] = data.text
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

## Step 3 - analyse the language and sentiments by pretrained model

In [12]:
!pyenv local crypto

In [13]:
#Model 1a - Use the xlm model - https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment?text=go+up

model_path_xlm = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task_xlm = pipeline("sentiment-analysis", model=model_path_xlm, tokenizer=model_path_xlm)

In [14]:
test_sent = sentiment_task_xlm(sample_text)
print(test_sent[0]['score'])
print(test_sent[0]['label'])

0.8055609464645386
neutral


In [15]:
#Model 1b - Use the xlm model, with full score
MODEL_xlm = f"{model_path_xlm}"
tokenizer_xlm = AutoTokenizer.from_pretrained(MODEL_xlm)
config_xlm = AutoConfig.from_pretrained(MODEL_xlm)

# PT
model_xlm = AutoModelForSequenceClassification.from_pretrained(MODEL_xlm)
# model_xlm.save_pretrained(MODEL_xlm)

In [16]:
def scores_xlm(x):
    encoded_input_xlm = tokenizer_xlm(x, return_tensors='pt')
    output_xlm = model_xlm(**encoded_input_xlm)
    scores_xlm = output_xlm[0][0].detach().numpy()
    scores_xlm = softmax(scores_xlm) #1st score is negative, 2nd score is netural, 3rd score is positive
    return scores_xlm

In [17]:
score = scores_xlm(sample_text)
score

array([0.03531584, 0.80556095, 0.15912327], dtype=float32)

In [18]:
#Model 2a - Use the CryptoBERT model - https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment?text=go+up

model_path_bert = "ElKulako/cryptobert"
sentiment_task_bert = pipeline("sentiment-analysis", model=model_path_bert, tokenizer=model_path_bert)

In [19]:
test_sent = sentiment_task_bert(sample_text)
print(test_sent[0]['score'])
print(test_sent[0]['label'])

0.5417445302009583
Neutral


In [20]:
#Model 2b - Use the bert model, with full scores
MODEL_bert = f"{model_path_bert}"
tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_bert)
config_bert = AutoConfig.from_pretrained(MODEL_bert)

# PT
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL_bert)
# model_bert.save_pretrained(MODEL_bert)

In [21]:
def scores_bert(sample_text):
    encoded_input_bert = tokenizer_bert(sample_text, return_tensors='pt')
    output_bert = model_bert(**encoded_input_bert)
    scores_bert = output_bert[0][0].detach().numpy()
    scores_bert = softmax(scores_bert) #1st score is negative, 2nd score is netural, 3rd score is positive
    return scores_bert

In [22]:
score = scores_bert(sample_text)
score

array([3.1575037e-04, 5.4174453e-01, 4.5793974e-01], dtype=float32)

In [23]:
data

Unnamed: 0,text,date,process_text,label_xlm,negative_xlm,neutral_xlm,positive_xlm,label_bert,negative_bert,neutral_bert,positive_bert
0,I'm looking for these levels today 👀 \n\n#Bitc...,2022-06-24 06:09:58+00:00,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...,I'm looking for these levels today 👀 \n\n#Bitc...
1,create twitter tasks and pay with #bitcoin #et...,2022-06-24 06:09:55+00:00,create twitter tasks and pay with #bitcoin #et...,create twitter tasks and pay with #bitcoin #et...,create twitter tasks and pay with #bitcoin #et...,create twitter tasks and pay with #bitcoin #et...,create twitter tasks and pay with #bitcoin #et...,create twitter tasks and pay with #bitcoin #et...,create twitter tasks and pay with #bitcoin #et...,create twitter tasks and pay with #bitcoin #et...,create twitter tasks and pay with #bitcoin #et...
2,Top 10 Cyptocurrencies in the World by Twitter...,2022-06-24 06:09:52+00:00,Top 10 Cyptocurrencies in the World by Twitter...,Top 10 Cyptocurrencies in the World by Twitter...,Top 10 Cyptocurrencies in the World by Twitter...,Top 10 Cyptocurrencies in the World by Twitter...,Top 10 Cyptocurrencies in the World by Twitter...,Top 10 Cyptocurrencies in the World by Twitter...,Top 10 Cyptocurrencies in the World by Twitter...,Top 10 Cyptocurrencies in the World by Twitter...,Top 10 Cyptocurrencies in the World by Twitter...
3,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,2022-06-24 06:09:46+00:00,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...
4,I am claiming my free Lightning sats from @_bi...,2022-06-24 06:09:46+00:00,I am claiming my free Lightning sats from zes...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...,I am claiming my free Lightning sats from @_bi...
...,...,...,...,...,...,...,...,...,...,...,...
4900,👋 A new block was found on the #Bitcoin networ...,2022-08-12 14:08:18+00:00,👋 A new block was found on the #Bitcoin networ...,👋 A new block was found on the #Bitcoin networ...,👋 A new block was found on the #Bitcoin networ...,👋 A new block was found on the #Bitcoin networ...,👋 A new block was found on the #Bitcoin networ...,👋 A new block was found on the #Bitcoin networ...,👋 A new block was found on the #Bitcoin networ...,👋 A new block was found on the #Bitcoin networ...,👋 A new block was found on the #Bitcoin networ...
4901,Byzantine Generals are back on campus. #byzant...,2022-08-12 14:08:15+00:00,Byzantine Generals are back on campus. #byzant...,Byzantine Generals are back on campus. #byzant...,Byzantine Generals are back on campus. #byzant...,Byzantine Generals are back on campus. #byzant...,Byzantine Generals are back on campus. #byzant...,Byzantine Generals are back on campus. #byzant...,Byzantine Generals are back on campus. #byzant...,Byzantine Generals are back on campus. #byzant...,Byzantine Generals are back on campus. #byzant...
4902,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,2022-08-12 14:08:05+00:00,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...
4903,Trade-Ideas Trading Room is opening now - join...,2022-08-12 14:08:02+00:00,Trade-Ideas Trading Room is opening now - join...,Trade-Ideas Trading Room is opening now - join...,Trade-Ideas Trading Room is opening now - join...,Trade-Ideas Trading Room is opening now - join...,Trade-Ideas Trading Room is opening now - join...,Trade-Ideas Trading Room is opening now - join...,Trade-Ideas Trading Room is opening now - join...,Trade-Ideas Trading Room is opening now - join...,Trade-Ideas Trading Room is opening now - join...


In [24]:
for i in range(len(data)):
    sentiment_xlm = sentiment_task_xlm(data['process_text'][i])
    data['label_xlm'][i] = sentiment_xlm[0]['label']
#     data['score_xlm'][i] = sentiment_xlm[0]['score']
    score_xlm = scores_xlm(data['process_text'][i])
    data['negative_xlm'][i] = score_xlm[0]
    data['neutral_xlm'][i] = score_xlm[1]
    data['positive_xlm'][i] = score_xlm[2]
    
    sentiment_bert = sentiment_task_bert(data['process_text'][i])
    data['label_bert'][i] = sentiment_bert[0]['label']
#     data['score_bert'][i] = sentiment_bert[0]['score']
    score_bert = scores_bert(data['process_text'][i])
    data['negative_bert'][i] = score_bert[0]
    data['neutral_bert'][i] = score_bert[1]
    data['positive_bert'][i] = score_bert[2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label_xlm'][i] = sentiment_xlm[0]['label']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['negative_xlm'][i] = score_xlm[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['neutral_xlm'][i] = score_xlm[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['positive_xlm'][i] = score_xlm[2]
A v

In [25]:
data

Unnamed: 0,text,date,process_text,label_xlm,negative_xlm,neutral_xlm,positive_xlm,label_bert,negative_bert,neutral_bert,positive_bert
0,I'm looking for these levels today 👀 \n\n#Bitc...,2022-06-24 06:09:58+00:00,I'm looking for these levels today 👀 \n\n#Bitc...,neutral,0.035316,0.805561,0.159123,Neutral,0.000316,0.541745,0.45794
1,create twitter tasks and pay with #bitcoin #et...,2022-06-24 06:09:55+00:00,create twitter tasks and pay with #bitcoin #et...,neutral,0.032351,0.829668,0.137981,Bullish,0.000407,0.434556,0.565037
2,Top 10 Cyptocurrencies in the World by Twitter...,2022-06-24 06:09:52+00:00,Top 10 Cyptocurrencies in the World by Twitter...,neutral,0.179284,0.696499,0.124217,Neutral,0.000707,0.863628,0.135665
3,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,2022-06-24 06:09:46+00:00,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,positive,0.028341,0.124245,0.847414,Neutral,0.000268,0.510206,0.489527
4,I am claiming my free Lightning sats from @_bi...,2022-06-24 06:09:46+00:00,I am claiming my free Lightning sats from zes...,neutral,0.05642,0.780599,0.16298,Neutral,0.001384,0.640739,0.357877
...,...,...,...,...,...,...,...,...,...,...,...
4900,👋 A new block was found on the #Bitcoin networ...,2022-08-12 14:08:18+00:00,👋 A new block was found on the #Bitcoin networ...,neutral,0.044105,0.782324,0.173571,Bullish,0.005152,0.434237,0.560612
4901,Byzantine Generals are back on campus. #byzant...,2022-08-12 14:08:15+00:00,Byzantine Generals are back on campus. #byzant...,neutral,0.052524,0.595462,0.352014,Neutral,0.000143,0.65811,0.341748
4902,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,2022-08-12 14:08:05+00:00,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,neutral,0.067207,0.86844,0.064353,Neutral,0.022624,0.603121,0.374255
4903,Trade-Ideas Trading Room is opening now - join...,2022-08-12 14:08:02+00:00,Trade-Ideas Trading Room is opening now - join...,neutral,0.016118,0.79939,0.184492,Neutral,0.000213,0.806042,0.193745


## Step 4 - compare the model performance

In [26]:
def standardise(x):
    if (x == 'Neutral'):
        return 'neutral'
    elif (x == 'Bullish'):
        return 'positive'
    else:
        return 'negative'
data["label_bert"] = data["label_bert"].apply(standardise)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["label_bert"] = data["label_bert"].apply(standardise)


In [27]:
data

Unnamed: 0,text,date,process_text,label_xlm,negative_xlm,neutral_xlm,positive_xlm,label_bert,negative_bert,neutral_bert,positive_bert
0,I'm looking for these levels today 👀 \n\n#Bitc...,2022-06-24 06:09:58+00:00,I'm looking for these levels today 👀 \n\n#Bitc...,neutral,0.035316,0.805561,0.159123,neutral,0.000316,0.541745,0.45794
1,create twitter tasks and pay with #bitcoin #et...,2022-06-24 06:09:55+00:00,create twitter tasks and pay with #bitcoin #et...,neutral,0.032351,0.829668,0.137981,positive,0.000407,0.434556,0.565037
2,Top 10 Cyptocurrencies in the World by Twitter...,2022-06-24 06:09:52+00:00,Top 10 Cyptocurrencies in the World by Twitter...,neutral,0.179284,0.696499,0.124217,neutral,0.000707,0.863628,0.135665
3,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,2022-06-24 06:09:46+00:00,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,positive,0.028341,0.124245,0.847414,neutral,0.000268,0.510206,0.489527
4,I am claiming my free Lightning sats from @_bi...,2022-06-24 06:09:46+00:00,I am claiming my free Lightning sats from zes...,neutral,0.05642,0.780599,0.16298,neutral,0.001384,0.640739,0.357877
...,...,...,...,...,...,...,...,...,...,...,...
4900,👋 A new block was found on the #Bitcoin networ...,2022-08-12 14:08:18+00:00,👋 A new block was found on the #Bitcoin networ...,neutral,0.044105,0.782324,0.173571,positive,0.005152,0.434237,0.560612
4901,Byzantine Generals are back on campus. #byzant...,2022-08-12 14:08:15+00:00,Byzantine Generals are back on campus. #byzant...,neutral,0.052524,0.595462,0.352014,neutral,0.000143,0.65811,0.341748
4902,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,2022-08-12 14:08:05+00:00,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,neutral,0.067207,0.86844,0.064353,neutral,0.022624,0.603121,0.374255
4903,Trade-Ideas Trading Room is opening now - join...,2022-08-12 14:08:02+00:00,Trade-Ideas Trading Room is opening now - join...,neutral,0.016118,0.79939,0.184492,neutral,0.000213,0.806042,0.193745


In [28]:
data['equality'] = data['label_xlm'] == data['label_bert']

In [29]:
data

Unnamed: 0,text,date,process_text,label_xlm,negative_xlm,neutral_xlm,positive_xlm,label_bert,negative_bert,neutral_bert,positive_bert,equality
0,I'm looking for these levels today 👀 \n\n#Bitc...,2022-06-24 06:09:58+00:00,I'm looking for these levels today 👀 \n\n#Bitc...,neutral,0.035316,0.805561,0.159123,neutral,0.000316,0.541745,0.45794,True
1,create twitter tasks and pay with #bitcoin #et...,2022-06-24 06:09:55+00:00,create twitter tasks and pay with #bitcoin #et...,neutral,0.032351,0.829668,0.137981,positive,0.000407,0.434556,0.565037,False
2,Top 10 Cyptocurrencies in the World by Twitter...,2022-06-24 06:09:52+00:00,Top 10 Cyptocurrencies in the World by Twitter...,neutral,0.179284,0.696499,0.124217,neutral,0.000707,0.863628,0.135665,True
3,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,2022-06-24 06:09:46+00:00,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,positive,0.028341,0.124245,0.847414,neutral,0.000268,0.510206,0.489527,False
4,I am claiming my free Lightning sats from @_bi...,2022-06-24 06:09:46+00:00,I am claiming my free Lightning sats from zes...,neutral,0.05642,0.780599,0.16298,neutral,0.001384,0.640739,0.357877,True
...,...,...,...,...,...,...,...,...,...,...,...,...
4900,👋 A new block was found on the #Bitcoin networ...,2022-08-12 14:08:18+00:00,👋 A new block was found on the #Bitcoin networ...,neutral,0.044105,0.782324,0.173571,positive,0.005152,0.434237,0.560612,False
4901,Byzantine Generals are back on campus. #byzant...,2022-08-12 14:08:15+00:00,Byzantine Generals are back on campus. #byzant...,neutral,0.052524,0.595462,0.352014,neutral,0.000143,0.65811,0.341748,True
4902,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,2022-08-12 14:08:05+00:00,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,neutral,0.067207,0.86844,0.064353,neutral,0.022624,0.603121,0.374255,True
4903,Trade-Ideas Trading Room is opening now - join...,2022-08-12 14:08:02+00:00,Trade-Ideas Trading Room is opening now - join...,neutral,0.016118,0.79939,0.184492,neutral,0.000213,0.806042,0.193745,True


In [30]:
data['label_xlm'].value_counts()

neutral     3614
positive     789
negative     502
Name: label_xlm, dtype: int64

In [31]:
data['label_bert'].value_counts()

neutral     2916
positive    1674
negative     315
Name: label_bert, dtype: int64

In [32]:
data['equality'].value_counts()

True     2693
False    2212
Name: equality, dtype: int64

## Step 5: Count the total number of labels/scores (positive, negative vs neutral) by date

In [34]:
data['date'] = pd.to_datetime(data.date)
data['new_date'] = data['date'].dt.date

In [35]:
data

Unnamed: 0,text,date,process_text,label_xlm,negative_xlm,neutral_xlm,positive_xlm,label_bert,negative_bert,neutral_bert,positive_bert,equality,new_date
0,I'm looking for these levels today 👀 \n\n#Bitc...,2022-06-24 06:09:58+00:00,I'm looking for these levels today 👀 \n\n#Bitc...,neutral,0.035316,0.805561,0.159123,neutral,0.000316,0.541745,0.45794,True,2022-06-24
1,create twitter tasks and pay with #bitcoin #et...,2022-06-24 06:09:55+00:00,create twitter tasks and pay with #bitcoin #et...,neutral,0.032351,0.829668,0.137981,positive,0.000407,0.434556,0.565037,False,2022-06-24
2,Top 10 Cyptocurrencies in the World by Twitter...,2022-06-24 06:09:52+00:00,Top 10 Cyptocurrencies in the World by Twitter...,neutral,0.179284,0.696499,0.124217,neutral,0.000707,0.863628,0.135665,True,2022-06-24
3,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,2022-06-24 06:09:46+00:00,#Airdrop #Airdrops #Airdropinspector #BSC #Cyb...,positive,0.028341,0.124245,0.847414,neutral,0.000268,0.510206,0.489527,False,2022-06-24
4,I am claiming my free Lightning sats from @_bi...,2022-06-24 06:09:46+00:00,I am claiming my free Lightning sats from zes...,neutral,0.05642,0.780599,0.16298,neutral,0.001384,0.640739,0.357877,True,2022-06-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4900,👋 A new block was found on the #Bitcoin networ...,2022-08-12 14:08:18+00:00,👋 A new block was found on the #Bitcoin networ...,neutral,0.044105,0.782324,0.173571,positive,0.005152,0.434237,0.560612,False,2022-08-12
4901,Byzantine Generals are back on campus. #byzant...,2022-08-12 14:08:15+00:00,Byzantine Generals are back on campus. #byzant...,neutral,0.052524,0.595462,0.352014,neutral,0.000143,0.65811,0.341748,True,2022-08-12
4902,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,2022-08-12 14:08:05+00:00,📉 $BTC / $USDT Shorted(Sell) $23773.50\n[14:08...,neutral,0.067207,0.86844,0.064353,neutral,0.022624,0.603121,0.374255,True,2022-08-12
4903,Trade-Ideas Trading Room is opening now - join...,2022-08-12 14:08:02+00:00,Trade-Ideas Trading Room is opening now - join...,neutral,0.016118,0.79939,0.184492,neutral,0.000213,0.806042,0.193745,True,2022-08-12


In [37]:
grouped_data = pd.DataFrame(data.groupby(['new_date'])['label_xlm'].value_counts().unstack().fillna(0).reset_index())
grouped_data = grouped_data.rename_axis("", axis="columns")

grouped_data_bert = pd.DataFrame(data.groupby(['new_date'])['label_bert'].value_counts().unstack().fillna(0).reset_index())
grouped_score = pd.DataFrame(data.groupby(['new_date'])[['negative_xlm', 'neutral_xlm', 'positive_xlm','negative_bert', 'neutral_bert', 'positive_bert']].sum().fillna(0).reset_index())

grouped_data[['date', 'count_negative_xlm', 'count_neutral_xlm', 'count_positive_xlm']] = grouped_data[['new_date', 'negative', 'neutral', 'positive']]
grouped_data[['count_negative_bert', 'count_neutral_bert', 'count_positive_bert']] = grouped_data_bert[['negative', 'neutral', 'positive']]
grouped_data[['negative_xlm', 'neutral_xlm', 'positive_xlm','negative_bert', 'neutral_bert', 'positive_bert']] = grouped_score[['negative_xlm', 'neutral_xlm', 'positive_xlm','negative_bert', 'neutral_bert', 'positive_bert']]

grouped_data = grouped_data.drop(["new_date", 'negative', 'neutral', 'positive'], axis='columns')

In [38]:
grouped_data

Unnamed: 0,date,count_negative_xlm,count_neutral_xlm,count_positive_xlm,count_negative_bert,count_neutral_bert,count_positive_bert,negative_xlm,neutral_xlm,positive_xlm,negative_bert,neutral_bert,positive_bert
0,2022-06-24,6,57,15,3,51,24,10.757743,48.626522,18.615738,4.84323,43.021858,30.134912
1,2022-06-25,8,73,10,5,55,31,12.39642,57.172752,21.430817,4.944597,47.671741,38.383656
2,2022-06-26,6,63,16,5,38,42,9.410669,51.040672,24.548651,3.567415,42.689507,38.743065
3,2022-06-27,17,68,14,14,59,26,20.643763,55.461174,22.895071,12.516508,50.333969,36.149529
4,2022-06-28,10,68,23,11,44,46,16.376936,58.804665,25.818392,9.588241,48.004971,43.40678
5,2022-06-29,9,57,12,12,45,21,12.726689,47.485916,17.787397,8.710379,41.640835,27.648781
6,2022-06-30,13,65,17,10,50,35,18.186152,52.810375,24.003471,8.194161,49.909798,36.896053
7,2022-07-01,10,65,24,10,63,26,15.544599,56.053684,27.401722,8.10988,55.050465,35.839645
8,2022-07-02,11,82,9,14,64,24,15.372499,67.569374,19.058142,11.698927,54.279682,36.021385
9,2022-07-03,3,60,13,3,54,19,9.674458,48.017109,18.308447,3.834375,44.462799,27.702829


## Step 6 - Download the data

In [54]:
start_date = grouped_data['date'][0]
start_date_str = datetime.datetime.strftime(start_date, "%Y-%m-%d")
start_date_str

'2022-06-24'

In [55]:
end_date = grouped_data['date'][len(grouped_data)-1]
end_date_str = datetime.datetime.strftime(end_date, "%Y-%m-%d")
end_date_str

'2022-08-12'

In [56]:
file_name = f"{start_date_str}_{end_date_str}_twitter_comments.csv"
data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")

In [57]:
file_name = f"{start_date_str}_{end_date_str}_twitter_transferlearning.csv"
grouped_data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")