Transfer learning using cryptobert and roberta

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig, TextClassificationPipeline
from transformers import pipeline
import numpy as np
import pandas as pd
from scipy.special import softmax
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip install torch



In [3]:
!pip install protobuf==3.20.0



## Step 1 - Importing a sample of some Bitcoin Tweet Data to begin analysing the model

In [4]:
# 20230308 - test data from twitter api, from 20220624 to 20220812 
data = pd.read_csv('~/Code/giadapi/crypto/data/raw/tweets_2022_reduced.csv')

# 20230307 - test data from kaggle
# data = pd.read_csv('~/code/giadapi/crypto/data/raw/bitcoin_tweets1000000.csv', nrows = 1000)
# "\\wsl.localhost\Ubuntu\home\peter\code\giadapi\crypto\kaggle-tweets.zip"

In [5]:
data

Unnamed: 0,datetime,date,username,text,has_#bitcoin
0,2022-01-01 22:09:49+00:00,2022-01-01,ahmad00816,Bitcoin 1-month social activity:\n\nGalaxy Sco...,True
1,2022-01-01 22:09:40+00:00,2022-01-01,davidgokhshtein,@PeterSchiff You accept #Bitcoin.,True
2,2022-01-01 22:09:34+00:00,2022-01-01,CircleCryptic,Someone gunna send #bitcoin then? $btc,True
3,2022-01-01 22:09:23+00:00,2022-01-01,fcwilson,"""Welcome, I like to invest in companies, with...",True
4,2022-01-01 22:09:17+00:00,2022-01-01,beat_brunner,@LianLia71340337 @mikealfred @saylor I think 4...,True
...,...,...,...,...,...
93337,2022-06-30 06:00:01+00:00,2022-06-30,SocietySatoshi,Good morning #crypto Twitter🌤️\n\nHow are you ...,True
93338,2022-06-30 06:00:00+00:00,2022-06-30,AltalixLTD,"Since its peak, Bitcoin’s entire market valuat...",True
93339,2022-06-30 06:00:00+00:00,2022-06-30,BasitCrypto98,People who purchased #Bitcoin at the ATH are n...,True
93340,2022-06-30 06:00:00+00:00,2022-06-30,CryptingUp,"Bitcoin price $19,976.70 - Decreased by -1.03%...",True


In [6]:
#only run it if the dataset is from Twitter API
# data['text'] = data[['tweet']]
# data['date'] = data[['created_at']]

In [7]:
data = data[['text', 'date']]

In [8]:
data

Unnamed: 0,text,date
0,Bitcoin 1-month social activity:\n\nGalaxy Sco...,2022-01-01
1,@PeterSchiff You accept #Bitcoin.,2022-01-01
2,Someone gunna send #bitcoin then? $btc,2022-01-01
3,"""Welcome, I like to invest in companies, with...",2022-01-01
4,@LianLia71340337 @mikealfred @saylor I think 4...,2022-01-01
...,...,...
93337,Good morning #crypto Twitter🌤️\n\nHow are you ...,2022-06-30
93338,"Since its peak, Bitcoin’s entire market valuat...",2022-06-30
93339,People who purchased #Bitcoin at the ATH are n...,2022-06-30
93340,"Bitcoin price $19,976.70 - Decreased by -1.03%...",2022-06-30


## Step 2 - Cleaning the data

In [9]:
# I have changed this to remove more information

def preprocess_2(text):
    new_text = []
    text = str(text)
    text = text.replace("\n", " ")
    for t in text.split(" "):
        t = '' if t.startswith('@') and len(t) > 1 else t
        t = '' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [10]:
#Create a dummy data
data['process_text'] = data.text
data['negative_bert'] = data.text
data['neutral_bert'] = data.text
data['positive_bert'] = data.text

#use the preprocess_2 to clean the data
data['process_text'] = data['text'].apply(preprocess_2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['process_text'] = data.text
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['negative_bert'] = data.text


In [11]:
data

Unnamed: 0,text,date,process_text,negative_bert,neutral_bert,positive_bert
0,Bitcoin 1-month social activity:\n\nGalaxy Sco...,2022-01-01,Bitcoin 1-month social activity: Galaxy Score...,Bitcoin 1-month social activity:\n\nGalaxy Sco...,Bitcoin 1-month social activity:\n\nGalaxy Sco...,Bitcoin 1-month social activity:\n\nGalaxy Sco...
1,@PeterSchiff You accept #Bitcoin.,2022-01-01,You accept #Bitcoin.,@PeterSchiff You accept #Bitcoin.,@PeterSchiff You accept #Bitcoin.,@PeterSchiff You accept #Bitcoin.
2,Someone gunna send #bitcoin then? $btc,2022-01-01,Someone gunna send #bitcoin then? $btc,Someone gunna send #bitcoin then? $btc,Someone gunna send #bitcoin then? $btc,Someone gunna send #bitcoin then? $btc
3,"""Welcome, I like to invest in companies, with...",2022-01-01,"""Welcome, I like to invest in companies, with...","""Welcome, I like to invest in companies, with...","""Welcome, I like to invest in companies, with...","""Welcome, I like to invest in companies, with..."
4,@LianLia71340337 @mikealfred @saylor I think 4...,2022-01-01,I think 47-48k is bottom for #bitcoin in 20...,@LianLia71340337 @mikealfred @saylor I think 4...,@LianLia71340337 @mikealfred @saylor I think 4...,@LianLia71340337 @mikealfred @saylor I think 4...
...,...,...,...,...,...,...
93337,Good morning #crypto Twitter🌤️\n\nHow are you ...,2022-06-30,Good morning #crypto Twitter🌤️ How are you fe...,Good morning #crypto Twitter🌤️\n\nHow are you ...,Good morning #crypto Twitter🌤️\n\nHow are you ...,Good morning #crypto Twitter🌤️\n\nHow are you ...
93338,"Since its peak, Bitcoin’s entire market valuat...",2022-06-30,"Since its peak, Bitcoin’s entire market valuat...","Since its peak, Bitcoin’s entire market valuat...","Since its peak, Bitcoin’s entire market valuat...","Since its peak, Bitcoin’s entire market valuat..."
93339,People who purchased #Bitcoin at the ATH are n...,2022-06-30,People who purchased #Bitcoin at the ATH are n...,People who purchased #Bitcoin at the ATH are n...,People who purchased #Bitcoin at the ATH are n...,People who purchased #Bitcoin at the ATH are n...
93340,"Bitcoin price $19,976.70 - Decreased by -1.03%...",2022-06-30,"Bitcoin price $19,976.70 - Decreased by -1.03%...","Bitcoin price $19,976.70 - Decreased by -1.03%...","Bitcoin price $19,976.70 - Decreased by -1.03%...","Bitcoin price $19,976.70 - Decreased by -1.03%..."


## Step 3 - analyse the language and sentiments by pretrained model

In [12]:
!pyenv local crypto

In [13]:
#Model 2b - Use the bert model, with full scores
MODEL_bert = f"ElKulako/cryptobert"
tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_bert)
config_bert = AutoConfig.from_pretrained(MODEL_bert)

# PT
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL_bert)

In [14]:
def scores_bert(sample_text):
    encoded_input_bert = tokenizer_bert(sample_text, return_tensors='pt')
    output_bert = model_bert(**encoded_input_bert)
    scores_bert = output_bert[0][0].detach().numpy()
    scores_bert = softmax(scores_bert) #1st score is negative, 2nd score is netural, 3rd score is positive
    return scores_bert

In [None]:
data['text'] = data['process_text'].apply(scores_bert)

In [None]:
for i in range(len(data)):    
    data['negative_bert'][i] = data['text'][i][0]
    data['neutral_bert'][i] = data['text'][i][1]
    data['positive_bert'][i] = data['text'][i][2]

In [None]:
data = data[['date', 'process_text', 'negative_bert', 'neutral_bert','positive_bert']]

In [None]:
data

## Step 5: Count the total number of labels/scores (positive, negative vs neutral) by date

In [None]:
grouped_data = pd.DataFrame(data.groupby(['date'])[['negative_bert', 'neutral_bert', 'positive_bert']].sum().fillna(0).reset_index())
grouped_data = grouped_data.rename_axis("", axis="columns")

In [None]:
grouped_data

## Step 6 - Download the data

In [None]:
start_date = grouped_data['date'][0]
start_date_str = datetime.datetime.strftime(start_date, "%Y-%m-%d")
start_date_str

In [None]:
end_date = grouped_data['date'][len(grouped_data)-1]
end_date_str = datetime.datetime.strftime(end_date, "%Y-%m-%d")
end_date_str

In [None]:
file_name = f"{start_date_str}_{end_date_str}_twitter_comments.csv"
data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")

In [None]:
file_name = f"{start_date_str}_{end_date_str}_twitter_transferlearning.csv"
grouped_data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")