In [1]:
# Enable the ipywidgets notebook extension (presumably so widget-based progress bars render — confirm).
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: OK


In [2]:
# Mount Google Drive at /content/drive; the helper scripts and parquet data
# referenced by the following cells are read from paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the preprocessing helper scripts from Drive into the current working directory
# (presumably the source of the `utlities` and `techniques` modules imported later — confirm).
# NOTE(review): the relative `drive/...` path only resolves when the working directory
# is the parent of the Drive mount point (/content).
!cp drive/MyDrive/masterThesis/scripts/preprocessing/* . -r

In [None]:
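# One-time setup (left disabled): uncomment to extract the zipped Twitter parquet archive on Drive.
#!unzip drive/MyDrive/masterThesis/data/twitter/Twitter.parquet.zip -d drive/MyDrive/masterThesis/data/twitter/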

In [None]:
!pip install pandarallel
!pip install pyarrow
!pip install transformers
!pip install gensim==4.1.2


In [None]:
import pandas as pd

# tqdm.pandas() registers the `progress_apply` method used on Series in the processing loop.
from tqdm import tqdm
tqdm.pandas()

# pandarallel provides `parallel_apply`; here it is set up with 4 workers and a progress bar.
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=4, progress_bar= True)

import pyarrow as pa
import pyarrow.parquet as pq

# NOTE(review): the module name is spelled `utlities` — confirm it matches the script file name.
from utlities import strip_tweets

# NLTK resources; presumably required by the helpers in `techniques` — confirm.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# Star import: the processing loop relies on names that are not defined in this notebook
# (preprocessor, generateW2V, generateLMSentimentScore, generateVaderSentimentScore,
# BERTSentimentConversionDict), presumably provided here — confirm.
from techniques import *

from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from transformers import pipeline

In [None]:
# ---- Run configuration -------------------------------------------------------
# Source type of the data; the processing loop applies its tweet filters only for "Twitter".
datatype = "Twitter"

input_path = "drive/MyDrive/masterThesis/data/twitter/Twitter.parquet"
output_path = "drive/MyDrive/masterThesis/data/twitter/Twitter.proc.parquet"

# only look at top 10 CCs
ccs = ['btc', 'eth', 'xrp', 'xem', 'etc', 'ltc', 'dash', 'xmr', 'strat', 'xlm']

# Filled by the processing loop: cc -> [row count before filtering, row count after filtering].
counts = {}

# ---- FinBERT model, tokenizer and pipelines -----------------------------------
# One checkpoint backs both the embedding model and the 3-label sentiment classifier.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert = BertModel.from_pretrained(FINBERT_CHECKPOINT)
finbert_class = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT, num_labels=3)

pipe_bertEmbed = pipeline(
    "feature-extraction",
    model=finbert,
    tokenizer=tokenizer,
)
pipe_bertSent = pipeline(
    "sentiment-analysis",
    model=finbert_class,
    tokenizer=tokenizer,
)

In [None]:
# Development cap on the number of rows processed per coin ("for testing").
# Set to None to process every row that survives filtering.
MAX_ROWS_PER_CC = 300

# Substrings (matched case-insensitively) whose presence drops a tweet.
SPAM_PHRASES = ("give away", "giving away", "pump", "register", "join", "follow")

# A tweet is dropped when a symbol occurs at least this many times.
SYMBOL_LIMITS = {"#": 14, "$": 14, "@": 14, "|": 4}

# Minimum length (exclusive) of the tweet text after `strip_tweets`.
MIN_STRIPPED_LENGTH = 20


def _keep_tweet(text):
    """Return True when a raw tweet passes every Twitter filtering rule."""
    lowered = text.lower()
    if any(phrase in lowered for phrase in SPAM_PHRASES):
        return False
    if any(text.count(symbol) >= limit for symbol, limit in SYMBOL_LIMITS.items()):
        return False
    return len(strip_tweets(text)) > MIN_STRIPPED_LENGTH


# For each coin: load its rows, filter, derive text features and sentiment
# scores, and write the result to the partitioned parquet dataset at `output_path`.
for cc in tqdm(ccs):
    print(cc)
    dataset = pq.ParquetDataset(input_path, validate_schema=False, filters=[('cc', '=', cc.upper())])
    dataset = dataset.read().to_pandas()
    full_shape = dataset.shape[0]

    # apply filtering rules (single pass over the content column)
    if datatype == "Twitter":
        keep_mask = dataset["content"].apply(_keep_tweet).astype(bool)
        dataset = dataset.loc[keep_mask]

        # track how much data is lost due to filtering
        counts[cc] = [full_shape, dataset.shape[0]]
    else:
        counts[cc] = [full_shape, full_shape]

    # start preprocessing
    dataset = dataset.rename(columns={'content': 'content_raw'})

    # for testing
    if MAX_ROWS_PER_CC is not None:
        dataset = dataset.iloc[:MAX_ROWS_PER_CC]

    # Work on an independent copy so the column assignments below never
    # target a view of the filtered/sliced frame.
    dataset = dataset.copy()

    # Nothing survived filtering: there is nothing to embed or write for this coin.
    if dataset.empty:
        print("no rows left after filtering, skipping")
        continue

    # clear up text
    print("preprocessing")
    dataset["content_processed"] = dataset["content_raw"].parallel_apply(preprocessor)

    # create w2v embeds
    # (assumes generateW2V returns an array-like with .mean/.tolist per document — confirm)
    w2v = dataset["content_processed"].progress_apply(generateW2V)
    dataset["content_w2vSum"] = w2v.progress_apply(lambda x: x.mean(axis=0).tolist())
    dataset["content_w2v"] = w2v.progress_apply(lambda x: x.tolist())

    raw_texts = dataset["content_raw"].to_list()

    # create FinBERT embeds
    # The feature-extraction pipeline may already return plain nested lists, in which
    # case calling .tolist() would raise AttributeError; convert only when needed.
    bert_features = pipe_bertEmbed(raw_texts)
    dataset["content_bert"] = [
        features.tolist() if hasattr(features, "tolist") else features
        for features in bert_features
    ]

    # create Loughran McDonald sentiment scores
    dataset["sentiment_LM"] = dataset["content_processed"].progress_apply(generateLMSentimentScore)

    # create Vader sentiment scores
    dataset["sentiment_Vader"] = dataset["content_processed"].progress_apply(generateVaderSentimentScore)

    # create bert sentiment scores: signed label weight times classifier confidence
    bert_sentiment = pipe_bertSent(raw_texts)
    dataset["sentiment_bert"] = [
        BERTSentimentConversionDict[result["label"]] * result["score"]
        for result in bert_sentiment
    ]

    # Write direct to parquet file
    # NOTE(review): re-running this cell may add duplicate files under output_path — confirm.
    table = pa.Table.from_pandas(dataset)
    pq.write_to_dataset(table, root_path=output_path, partition_cols=['date', 'source', 'cc'])