# Imports

In [1]:
import torch
import pandas as pd
import os
from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import ast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
ROOT = '../'
sys.path.append(ROOT)  # Make the folder one level up importable (it holds `shared` and `config`)

# Import the project modules; the star-import binds every public name that
# `config` exposes into this namespace
from shared.constants import sentiment_mapping
from config import *

# Re-execute both modules — presumably so that edits made to them on disk are
# picked up without restarting the kernel
from importlib import reload
reload(sys.modules['shared.constants'])
reload(sys.modules['config'])

# Import again so the names bound here refer to the reloaded module contents
from shared.constants import sentiment_mapping
from config import *


# Cryptobert analysis

In [None]:
def cointelegraph_sentiment_analysis(cointelegraph):
    """Annotate Cointelegraph articles with title, leadtext and body sentiment.

    The three text columns are cast to ``str`` and classified in one call to
    the module-level ``pipe``; every predicted label is translated through
    ``sentiment_mapping``. Six columns are added, in this order:
    ``title_sentiment``, ``title_sentiment_score``, ``leadtext_sentiment``,
    ``leadtext_sentiment_score``, ``body_sentiment``, ``body_sentiment_score``.

    The frame passed in is modified in place and is also returned.
    """
    for column in ('title', 'leadtext', 'body'):
        cointelegraph[column] = cointelegraph[column].astype(str)

    # Flatten to [title_0, leadtext_0, body_0, title_1, ...] so that the whole
    # chunk goes through the pipeline in a single call
    interleaved = [
        piece
        for triple in zip(cointelegraph['title'], cointelegraph['leadtext'], cointelegraph['body'])
        for piece in triple
    ]
    predictions = pipe(interleaved)

    # Every third prediction, starting at the given offset, belongs to one field
    outputs = (
        (0, 'title_sentiment', 'title_sentiment_score'),
        (1, 'leadtext_sentiment', 'leadtext_sentiment_score'),
        (2, 'body_sentiment', 'body_sentiment_score'),
    )
    for offset, label_column, score_column in outputs:
        field_predictions = predictions[offset::3]
        cointelegraph.loc[:, label_column] = [sentiment_mapping[p['label']] for p in field_predictions]
        cointelegraph.loc[:, score_column] = [p['score'] for p in field_predictions]

    return cointelegraph

In [None]:
def submissions_sentiment_analysis(submissions):
    """Annotate Reddit submissions with the sentiment of their title and text.

    Both text columns are cast to ``str`` and classified in one call to the
    module-level ``pipe``; every predicted label is translated through
    ``sentiment_mapping``. Four columns are added, in this order:
    ``title_sentiment``, ``title_sentiment_score``, ``text_sentiment``,
    ``text_sentiment_score``.

    The frame passed in is modified in place and is also returned.
    """
    for column in ('title', 'text'):
        submissions[column] = submissions[column].astype(str)

    # Flatten to [title_0, text_0, title_1, text_1, ...] so that the whole
    # chunk goes through the pipeline in a single call
    interleaved = [
        piece
        for pair in zip(submissions['title'], submissions['text'])
        for piece in pair
    ]
    predictions = pipe(interleaved)

    # Even positions hold title predictions, odd positions text predictions
    outputs = (
        (0, 'title_sentiment', 'title_sentiment_score'),
        (1, 'text_sentiment', 'text_sentiment_score'),
    )
    for offset, label_column, score_column in outputs:
        field_predictions = predictions[offset::2]
        submissions.loc[:, label_column] = [sentiment_mapping[p['label']] for p in field_predictions]
        submissions.loc[:, score_column] = [p['score'] for p in field_predictions]

    return submissions

In [None]:
def comments_sentiment_analysis(comments):
    """Annotate Reddit comments with the sentiment of their body.

    The ``body`` column is cast to ``str`` and classified in one call to the
    module-level ``pipe``; every predicted label is translated through
    ``sentiment_mapping``. Two columns are added, in this order:
    ``body_sentiment`` and ``body_sentiment_score``.

    The frame passed in is modified in place and is also returned.
    """
    comments['body'] = comments['body'].astype(str)

    # One text per row, so predictions line up one-to-one with the rows
    bodies = comments['body'].tolist()
    predictions = pipe(bodies)

    comments.loc[:, 'body_sentiment'] = [sentiment_mapping[p['label']] for p in predictions]
    comments.loc[:, 'body_sentiment_score'] = [p['score'] for p in predictions]

    return comments

In [None]:
# Run parameters (TO FILL!): dataset_name selects which dataset the cells
# below process; path is only used when building the "not supported" message
path = ""
dataset_name = ""

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

In [None]:
# Model initialization: load the CryptoBERT tokenizer and 3-label classifier
model_name = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 3)

# Pipeline device ordinal: 0 is the first CUDA GPU, -1 is the CPU. Derive it
# from CUDA availability instead of hard-coding 0, which fails on CPU-only
# machines.
pipeline_device = 0 if torch.cuda.is_available() else -1

# Every input is truncated/padded to exactly 128 tokens
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, max_length=128, truncation=True, padding = 'max_length', device=pipeline_device)

# Set the chunk size (number of rows to read at a time)
chunk_size = 1000 # rows at a time 

In [None]:
# Create the output CSV containing only the header row: the input columns
# followed by the sentiment columns that the selected dataset gains.

# Sentiment columns per supported dataset, in the order they are appended
header_sentiment_columns = {
    'cointelegraph_with_body': (
        'title_sentiment', 'title_sentiment_score',
        'leadtext_sentiment', 'leadtext_sentiment_score',
        'body_sentiment', 'body_sentiment_score',
    ),
    "submissions": (
        'title_sentiment', 'title_sentiment_score',
        'text_sentiment', 'text_sentiment_score',
    ),
    "comments": (
        'body_sentiment', 'body_sentiment_score',
    ),
}

if dataset_name not in header_sentiment_columns:
    raise ValueError(f"Dataset {path+'/'+dataset_name} not supported")

# Folder that holds both the input file and the annotated output file
if dataset_name == 'cointelegraph_with_body':
    dataset_location = (ROOT, NEWS_DATASET_PATH)
elif dataset_name == "submissions":
    dataset_location = (ROOT, SOCIAL_DATASET_PATH, "reddit/submissions")
else:
    dataset_location = (ROOT, SOCIAL_DATASET_PATH, "reddit/comments")

dataset_path = os.path.join(*dataset_location, dataset_name + ".csv")
output_file = os.path.join(*dataset_location, dataset_name + "_with_sentiment.csv")

# nrows=0 reads just the column names, giving an empty frame to extend
tmp = pd.read_csv(dataset_path, nrows=0)
for sentiment_column in header_sentiment_columns[dataset_name]:
    tmp[sentiment_column] = ""

# Write the header
tmp.to_csv(output_file, index=False)

In [None]:
# Stream the dataset through the classifier one chunk at a time and append
# each annotated chunk (without header) to the output file created above.
chunk_analyzers = {
    'cointelegraph_with_body': cointelegraph_sentiment_analysis,
    "submissions": submissions_sentiment_analysis,
    "comments": comments_sentiment_analysis,
}

# NOTE: total assumes roughly 100000 rows; it only affects the progress bar
for chunk in tqdm(pd.read_csv(dataset_path, chunksize=chunk_size), total=100000//chunk_size):
    analyze_chunk = chunk_analyzers.get(dataset_name)
    if analyze_chunk is None:
        raise ValueError(f"Dataset {path+'/'+dataset_name} not supported")

    # Sentiment analysis, then append the result to the output file
    chunk = analyze_chunk(chunk)
    chunk.to_csv(output_file, mode='a', header=False, index=False)

    # Drop the reference to the processed chunk before reading the next one
    del chunk

# Overall Cryptobert sentiment

In [3]:
# Load the merged datasets; both files share the MERGED_DATASET_PATH prefix
# and differ only in their "_daily.csv" / "_hourly.csv" suffix
daily_csv = os.path.join(ROOT, MERGED_DATASET_PATH + "_daily.csv")
hourly_csv = os.path.join(ROOT, MERGED_DATASET_PATH + "_hourly.csv")
merged_daily = pd.read_csv(daily_csv)
merged_hourly = pd.read_csv(hourly_csv)

In [4]:
merged_daily

Unnamed: 0,timestamp,open,close,high,low,volume,blocks-size,avg-block-size,n-transactions-total,n-transactions-per-block,...,fng_value_classification,fng_sentiment,cbbi_value,cbbi_sentiment,cointelegraph,reddit,avg_current_price,avg_next_price,pct_price_change,trend
0,2019-01-01,3832.628624,3835.163503,3846.200000,3822.4,265.354812,198101.219080,0.801779,369240247.0,1575.335570,...,0,negative,0.10,negative,"[[21957, 'fred-wilson-crypto-no-safe-haven-in-...","[['u/seriouslynotagayguy', 'Nick Mullen ""3000 ...",3834.098032,3957.675000,3.223104,up
1,2019-01-02,3963.000000,3948.900000,3980.200000,3938.6,734.112374,198221.159602,0.947861,369476938.0,1799.311258,...,1,negative,0.11,negative,"[[21975, 'good-ol-fud-the-brightest-media-buri...","[['u/landovalenz', '1 BTC', 13, '2019-01-02 00...",3957.675000,4034.561083,1.942708,same
2,2019-01-03,4049.000000,4016.400000,4056.644331,4016.2,517.702657,198364.321505,0.966222,369749053.0,1877.522581,...,1,negative,0.11,negative,"[[21997, 'proof-of-keys-event-aims-to-challeng...","[['u/renato_shira', 'The case for ""We have see...",4034.561083,3911.877967,-3.040805,down
3,2019-01-04,3923.200000,3910.011868,3924.300000,3890.0,419.631402,198514.023883,0.959199,370039173.0,1891.087248,...,2,neutral,0.11,negative,"[[22014, 'bitcoin-is-my-first-love-coinbase-ce...","[['u/Stuck_In_the_Matrix', 'Pushshift needs to...",3911.877967,3977.450000,1.676229,same
4,2019-01-05,3954.900000,3989.700000,4010.400000,3954.8,1245.876395,198656.957000,0.812946,370320307.0,1626.944785,...,1,negative,0.11,negative,"[[22029, 'coinbase-reportedly-bans-personal-ac...","[['u/Nilecrile', 'Hardforks are actually a gre...",3977.450000,3921.400000,-1.409194,same
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1881,2024-02-25,51629.000000,51572.000000,51645.000000,51526.0,127.809955,552018.319651,1.847371,969225553.0,2553.664634,...,3,positive,0.67,positive,"[[125095, 'grayscale-bitcoin-etf-records-lowes...","[['u/Diversified1977', 'Best exchange to buy b...",51593.000000,51702.500000,0.212238,same
1882,2024-02-26,51725.000000,51667.000000,51779.000000,51639.0,37.108193,552321.509635,1.732765,969645493.0,2678.075188,...,3,positive,0.67,positive,"[[125173, 'new-nine-spot-bitcoin-etf-volumes-r...","[['u/Subushie', 'The United States health care...",51702.500000,54462.500000,5.338233,up
1883,2024-02-27,54402.000000,54485.000000,54579.000000,54384.0,70.053708,552551.827136,1.664896,970001601.0,3004.635659,...,4,positive,0.69,positive,"[[125236, 'recognize-jurisdictions-believe-in-...","[['u/monilolita', 'Can someone please tell me ...",54462.500000,56960.000000,4.585724,up
1884,2024-02-28,57013.000000,56967.000000,57055.000000,56805.0,50.400155,552766.338478,1.806319,970388777.0,2777.181208,...,4,positive,0.74,positive,"[[125314, 'bitcoin-climber-orange-flag-everest...","[['u/2020rattler', 'Alt coin season theories',...",56960.000000,61785.750000,8.472173,up


In [5]:
merged_hourly

Unnamed: 0,timestamp,open,close,high,low,volume,blocks-size,avg-block-size,n-transactions-total,n-transactions-per-block,...,cbbi_value,cbbi_sentiment,timestamp_begin_merged,timestamp_end_merged,cointelegraph,reddit,avg_current_price,avg_next_price,pct_price_change,trend
0,2019-01-01 00:00:00,3832.628624,3835.163503,3846.2,3822.4,265.354812,198101.219080,0.801779,369240247.0,1575.335570,...,0.10,negative,2019-01-01 00:00:00,2019-01-01 01:00:00,[],[],3834.098032,3830.823709,-0.085400,same
1,2019-01-01 01:00:00,3835.048699,3829.146138,3840.1,3819.0,375.969689,198101.219080,0.801779,369240247.0,1575.335570,...,0.10,negative,2019-01-01 01:00:00,2019-01-01 02:00:00,[],"[['u/seriouslynotagayguy', 'Nick Mullen ""3000 ...",3830.823709,3822.350000,-0.221198,same
2,2019-01-01 02:00:00,3827.600000,3817.400000,3834.2,3810.2,425.147469,198101.219080,0.801779,369240247.0,1575.335570,...,0.10,negative,2019-01-01 02:00:00,2019-01-01 03:00:00,[],"[['u/pedxs', 'HITBTC restricting my account. A...",3822.350000,3823.825000,0.038589,same
3,2019-01-01 03:00:00,3817.300000,3828.400000,3832.3,3817.3,176.998692,198101.219080,0.801779,369240247.0,1575.335570,...,0.10,negative,2019-01-01 03:00:00,2019-01-01 04:00:00,[],[],3823.825000,3830.700000,0.179794,same
4,2019-01-01 04:00:00,3828.400000,3825.000000,3844.9,3824.5,307.675489,198101.219080,0.801779,369240247.0,1575.335570,...,0.10,negative,2019-01-01 04:00:00,2019-01-01 05:00:00,[],"[['u/6fGfGsz70s2dX4R4jeHg', ""I've put 5% of ea...",3830.700000,3828.400000,-0.060041,same
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45236,2024-02-28 20:00:00,60870.000000,60280.000000,60993.0,59800.0,286.742040,552766.338478,1.806319,970388777.0,2777.181208,...,0.74,positive,2024-02-28 20:00:00,2024-02-28 21:00:00,"[[125314, 'bitcoin-climber-orange-flag-everest...","[['u/matt3526', 'Wanting to move my crypto som...",60485.750000,60231.500000,-0.420347,same
45237,2024-02-28 21:00:00,60264.000000,60505.000000,60535.0,59622.0,174.752420,552766.338478,1.806319,970388777.0,2777.181208,...,0.74,positive,2024-02-28 21:00:00,2024-02-28 22:00:00,[],"[['u/FunnyGamer97', 'I miss when bull markets ...",60231.500000,60850.500000,1.027701,same
45238,2024-02-28 22:00:00,60488.000000,61186.000000,61296.0,60432.0,190.472745,552766.338478,1.806319,970388777.0,2777.181208,...,0.74,positive,2024-02-28 22:00:00,2024-02-28 23:00:00,[],"[['u/obsdude', 'Just sold my vehicle for btc',...",60850.500000,61832.750000,1.614202,same
45239,2024-02-28 23:00:00,61195.000000,62372.000000,62593.0,61171.0,199.761855,552766.338478,1.806319,970388777.0,2777.181208,...,0.74,positive,2024-02-28 23:00:00,2024-02-29 00:00:00,[],"[['u/Gemini_Gianna', 'Announcing The Successfu...",61832.750000,61785.750000,-0.076011,same


In [6]:
# Build one result frame per merged dataset, sharing that dataset's index so
# the rows line up when the column is copied back later
daily_sentiment, hourly_sentiment = (
    pd.DataFrame(index=merged.index) for merged in (merged_daily, merged_hourly)
)

# Start with an empty "cryptobert_overall_sentiment" column in both frames
for sentiment_frame in (daily_sentiment, hourly_sentiment):
    sentiment_frame['cryptobert_overall_sentiment'] = None

In [7]:
daily_sentiment

Unnamed: 0,cryptobert_overall_sentiment
0,
1,
2,
3,
4,
...,...
1881,
1882,
1883,
1884,


In [8]:
hourly_sentiment

Unnamed: 0,cryptobert_overall_sentiment
0,
1,
2,
3,
4,
...,...
45236,
45237,
45238,
45239,


In [9]:
# Majority vote per day: count every sentiment label attached to that day's
# news items, Reddit submissions and their comments, then keep the most
# frequent label.
for row_label, day in tqdm(merged_daily.iterrows(), total=merged_daily.shape[0]):
    # Key order matters: max() below returns the first key among equal counts
    counts = dict.fromkeys(('positive', 'neutral', 'negative'), 0)

    # News items: (label, score) pairs for title 7/8, leadtext 9/10, body 11/12
    for article in ast.literal_eval(day['cointelegraph']):
        article_pairs = [(article[pos], article[pos + 1]) for pos in (7, 9, 11)]
        for label, _score in article_pairs:
            counts[label] += 1

    # Submissions: (label, score) pairs for title 8/9 and text 10/11
    for post in ast.literal_eval(day['reddit']):
        post_pairs = [(post[pos], post[pos + 1]) for pos in (8, 10)]
        for label, _score in post_pairs:
            counts[label] += 1

        # Comments sit at position 12; the literal string '[]' means "none"
        replies = post[12]
        if replies == '[]':
            replies = []
        for reply in replies:
            # Body sentiment: (reply[6], reply[7])
            reply_label, _reply_score = reply[6], reply[7]
            counts[reply_label] += 1

    # The most frequent label becomes the day's overall sentiment
    daily_sentiment.loc[row_label, 'cryptobert_overall_sentiment'] = max(counts, key=counts.get)

100%|██████████| 1886/1886 [00:21<00:00, 85.80it/s] 


In [10]:
daily_sentiment

Unnamed: 0,cryptobert_overall_sentiment
0,neutral
1,neutral
2,neutral
3,neutral
4,neutral
...,...
1881,neutral
1882,neutral
1883,neutral
1884,neutral


: 

In [None]:
# Majority vote per hour: count every sentiment label attached to that hour's
# news items, Reddit submissions and their comments, then keep the most
# frequent label.
# Iterate the HOURLY frame: iterating merged_daily here would write daily
# results into hourly_sentiment and leave most hourly rows empty.
for index, row in tqdm(merged_hourly.iterrows(), total=merged_hourly.shape[0]):
    # Key order matters: max() below returns the first key among equal counts.
    # NOTE(review): an hour with no news and no posts therefore resolves to
    # 'positive' — confirm that this default is intended.
    sentiment = {
        'positive': 0,
        'neutral': 0,
        'negative': 0
    }
    # Read the news items (the cell holds the textual form of a list of lists)
    news = ast.literal_eval(row['cointelegraph'])
    for item in news:
        # Title sentiment: (item[7], item[8]), leadtext sentiment: (item[9], item[10]), body sentiment: (item[11], item[12])
        title_sentiment = (item[7], item[8])
        leadtext_sentiment = (item[9], item[10])
        body_sentiment = (item[11], item[12])

        # Count the sentiment (only the label, i.e. the first element, is used)
        sentiment[title_sentiment[0]] += 1
        sentiment[leadtext_sentiment[0]] += 1
        sentiment[body_sentiment[0]] += 1

    # Read the submissions items
    submissions = ast.literal_eval(row['reddit'])
    for item in submissions:
        # Title sentiment: (item[8], item[9]), text sentiment: (item[10], item[11])
        title_sentiment = (item[8], item[9])
        text_sentiment = (item[10], item[11])

        # Count the sentiment
        sentiment[title_sentiment[0]] += 1
        sentiment[text_sentiment[0]] += 1

        # Read the comments items; the literal string '[]' means "no comments"
        comments = item[12] if item[12] != '[]' else []
        for comment in comments:
            # Body sentiment: (comment[6], comment[7])
            body_sentiment = (comment[6], comment[7])

            # Count the sentiment
            sentiment[body_sentiment[0]] += 1

    # Calculate the overall sentiment: the label with the highest count
    overall_sentiment = max(sentiment, key=sentiment.get)
    hourly_sentiment.loc[index, 'cryptobert_overall_sentiment'] = overall_sentiment

In [None]:
hourly_sentiment

In [None]:
# Copy the computed overall sentiment into each merged dataset; the result
# frames were built on the merged frames' indexes, so the rows align
for merged_frame, sentiment_frame in (
    (merged_daily, daily_sentiment),
    (merged_hourly, hourly_sentiment),
):
    merged_frame['cryptobert_overall_sentiment'] = sentiment_frame['cryptobert_overall_sentiment']

In [None]:
merged_daily

In [None]:
merged_hourly

In [None]:
# # Save the datasets
# merged_daily.to_csv(os.path.join(ROOT, ANNOTATED_DATASET_PATH, "merged_daily_cryptobert_opinion.csv"), index=False)
# merged_hourly.to_csv(os.path.join(ROOT, ANNOTATED_DATASET_PATH, "merged_hourly_cryptobert_opinion.csv"), index=False)