In [80]:
import pandas as pd
import datetime
import numpy as np

#fields = ['timestamp', 'replies','likes', 'retweets' ,'text']
# clean_tweets_part_2.csv #Contains data from 2015-11-03 to 2017-02-17
filename = 'gs://bigdata-general/clean/clean_tweets_part_0.csv'

tweets = pd.read_csv(filename, delimiter=',', index_col=0)
# Convert timestamp to date object
tweets['timestamp'] = pd.to_datetime(tweets['timestamp']).dt.date
tweets

Unnamed: 0,timestamp,replies,likes,retweets,text
0,2019-05-27,0,0,0,È appena uscito un nuovo video! LES CRYPTOMONN...
1,2019-05-27,0,0,0,Cardano: Digitize Currencies; EOS https://t.co...
2,2019-05-27,0,2,1,Another Test tweet that wasn't caught in the s...
3,2019-05-27,0,0,0,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,2019-05-27,0,0,0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...
...,...,...,...,...,...
999995,2019-05-21,0,0,0,Dutch Man Arrested Over $2.2 Million Bitcoin M...
999996,2019-05-21,2,35,21,This is how crypto can do some of it's best wo...
999997,2019-05-21,0,0,0,クレイグ・ライト氏、\nBitcoinホワイトペーパー\n著作権を登録 ✍️\n💸BSV高騰...
999998,2019-05-21,0,0,0,Bitcoin doesn't have inherent value. \nAltcoin...


# Cleaning Text from tweets

In [81]:
import re
import emoji
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())

def cleaner(tweet):
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet) #Remove non-ASCII characters
    tweet = " ".join(tweet.split())
    #tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI) #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) \
         if w.lower() in words or not w.isalpha())
    return tweet


[nltk_data] Downloading package words to /home/amaru-
[nltk_data]     razerblade/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [82]:
tweets['text'] = tweets['text'].map(lambda x: cleaner(x))

In [83]:
# # Removing empty tweets
tweets = tweets[~tweets.text.eq('')]


In [79]:
tweets.reset_index(drop=True)

Unnamed: 0,replies,likes,retweets,text,Date,Category
0,790,14470,5542,Running,2009-01-11,
1,2,39,18,"1 = $ 0 . 32 , Size : 5 . 09M , Last 24h : 0 ....",2011-01-09,
2,0,3,4,I can ' t exactly explain what the hell it is ...,2011-04-14,
3,0,9,7,"The Dollar Vigilante , the anarchist financial...",2011-05-10,
4,0,0,5,""" I like how on the , people sign forum """,2011-05-14,
...,...,...,...,...,...,...
930012,0,0,0,: 8604 . 30 : 7664 . 56 Yen : 941110 . 19 Rubl...,2019-05-27,
930013,0,1,1,5 incoming,2019-05-27,
930014,0,0,0,Bull : EU Chaos is Price Rally trading investor,2019-05-27,
930015,0,0,0,WE ARE HAPPY TO ANNOUNCE THE LAUNCH OF NEW . V...,2019-05-27,


# Labeling Cleaned tweets

In [84]:
# Sort dataframe set by timestamp and reindex
#
tweets = tweets.sort_values(by=['timestamp'])
tweets = tweets.reset_index(drop=True)

In [85]:
# Load historical bitcoin data from the following file located in gs://bigdata-general/bitcoin_historical_price.csv
# Parse date column as a date

bitcoin = pd.read_csv("gs://bigdata-general/bitcoin_historical_price.csv", index_col=0, parse_dates=True)

In [86]:
# Remove time from Date
bitcoin['Date'] = pd.to_datetime(bitcoin['Date']).dt.date

In [87]:
# calculate daily percentage change from opening to close.
daily_change = (bitcoin['Close'] - bitcoin['Open'])/bitcoin['Open']*100
# Label daily_change depending on the following conditions:
#     if daily_change is less than -10, label it as 0
#     if daily_change is in range -10 to -3, label it as 1
#     if daily_change is in range -3 to 3, label it as 2
#     if daily_change is in range 3 to 10, label it as 3
#     if daily_change is greater than 10, label it as 4

# create a new column in the dataframe
bitcoin['%Daily Chg'] = daily_change
bitcoin['label'] = np.where(daily_change < -10, 0,
                   np.where(daily_change < -3, 1,
                   np.where(daily_change < 3, 2,
                   np.where(daily_change < 10, 3, 4))))

bitcoin

                    

Unnamed: 0_level_0,Name,Symbol,Date,High,Low,Open,Close,Volume,Marketcap,%Daily Chg,label
SNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Bitcoin,BTC,2013-04-29,147.488007,134.000000,134.444000,144.539993,0.000000e+00,1.603769e+09,7.509441,3
2,Bitcoin,BTC,2013-04-30,146.929993,134.050003,144.000000,139.000000,0.000000e+00,1.542813e+09,-3.472222,1
3,Bitcoin,BTC,2013-05-01,139.889999,107.720001,139.000000,116.989998,0.000000e+00,1.298955e+09,-15.834534,0
4,Bitcoin,BTC,2013-05-02,125.599998,92.281898,116.379997,105.209999,0.000000e+00,1.168517e+09,-9.597868,1
5,Bitcoin,BTC,2013-05-03,108.127998,79.099998,106.250000,97.750000,0.000000e+00,1.085995e+09,-8.000000,1
...,...,...,...,...,...,...,...,...,...,...,...
2987,Bitcoin,BTC,2021-07-02,33939.588699,32770.680780,33549.600177,33897.048590,3.872897e+10,6.354508e+11,1.035626,2
2988,Bitcoin,BTC,2021-07-03,34909.259899,33402.696536,33854.421362,34668.548402,2.438396e+10,6.499397e+11,2.404788,2
2989,Bitcoin,BTC,2021-07-04,35937.567147,34396.477458,34665.564866,35287.779766,2.492431e+10,6.615748e+11,1.794908,2
2990,Bitcoin,BTC,2021-07-05,35284.344430,33213.661034,35284.344430,33746.002456,2.672155e+10,6.326962e+11,-4.359843,1


In [88]:
labeled = []
labeled = pd.DataFrame(labeled)
labeled['Date'] = bitcoin['Date']
labeled['Daily %Chg'] = daily_change
labeled['Cat.'] = bitcoin['label']
labeled = labeled.reset_index(drop=True)
labeled

Unnamed: 0,Date,Daily %Chg,Cat.
0,2013-04-29,7.509441,3
1,2013-04-30,-3.472222,1
2,2013-05-01,-15.834534,0
3,2013-05-02,-9.597868,1
4,2013-05-03,-8.000000,1
...,...,...,...
2986,2021-07-02,1.035626,2
2987,2021-07-03,2.404788,2
2988,2021-07-04,1.794908,2
2989,2021-07-05,-4.359843,1


## Create a dictionary of historical bitcoin price changes by dat from lists

In [89]:
from dateutil import parser

dates = labeled.Date.values
categories = labeled['Cat.'].values

# Create a dictionary with dates as keys and categories as values
date_cat_dict = dict(zip(dates, categories))

# Create a list of dates
date = parser.parse('2021-06-28').date()
print(date_cat_dict[date])

2


## Add Category label to each Tweet

In [90]:
tweets['Date'] = tweets.timestamp.rename('Date')
# Drop column timestamp from tweets
tweets = tweets.drop(columns=['timestamp'])

In [None]:
# Set value of column 'Category' if 'Date' matches the given date
# #
# date = parser.parse('2011-01-09').date()
# # Change 'Category' column values to 3 if 'Date' matches the given date
# tweets[tweets['Date'] == date]['Category'].values[0] = 3

# # Replace values based on conditions 
# # https://kanoki.org/2019/07/17/pandas-how-to-replace-values-based-on-conditions/
# tweets.loc[(tweets.Date == date), 'Category'] = 2

# # Print the number of tweets labeled as '3'
# print(tweets['Category'].value_counts())


In [91]:
# Match the timestamp of the tweets with bitcoin price and add a category label to the tweets
for key, value in date_cat_dict.items():
    tweets.loc[(tweets['Date'] == key), 'Category'] = value

In [221]:
 # Drop rows with missing Category value
 tweets = tweets[tweets['Category'].notnull()]
 tweets.reset_index(drop=True)

Unnamed: 0,replies,likes,retweets,text,Date,Category
76,3,15,8,#Bitcoin is not a serious currency until mains...,2013-05-11,2.0
77,2,4,6,"#Bitcoin exchanges should add ""PANIC BUY"" and ...",2013-05-14,1.0
78,1,0,9,Rumor: Bitcoin Conference (this weekend San Jo...,2013-05-15,2.0
79,2,15,22,Peter Thiel Gets the Bitcoin Bug http://t.co/c...,2013-05-16,3.0
80,1,4,9,Standing room only at the #bitcoinconference @...,2013-05-18,2.0
...,...,...,...,...,...,...
999995,0,0,0,#BTC \nBitcoin USD: 8604.30 \nBitcoin EUR: 766...,2019-05-27,2.0
999996,0,1,1,5 digits incoming #BTC,2019-05-27,2.0
999997,0,0,0,Crypto Bull Max Keiser: EU Elections Chaos is ...,2019-05-27,2.0
999998,0,0,0,BTC馬鹿になって買えばいいだけ。そのうちアルトのターンも来るだろうし、ひたすらfiatの流...,2019-05-27,2.0


In [92]:
#clean_tweets_part_0.csv
tweets.to_csv('/home/amaru-razerblade/clean_text_tweets/clean_text_tweets_part_0.csv')

In [93]:
!cd '/home/amaru-razerblade/clean_text_tweets' && gsutil cp clean_text_tweets_part_0.csv gs://bigdata-general/clean_text/clean_text_tweets_part_0.csv

Copying file://clean_text_tweets_part_0.csv [Content-Type=text/csv]...

Operation completed over 1 objects/91.7 MiB.                                     


# Util functions

In [None]:
time = tweets['timestamp'][0]
# convert string to date object
time = datetime.datetime.strptime(time, '%Y-%m-%d').date()
# conver time to date
labeled['Date'].values[0]
time

In [102]:
d = '2013-04-29'
# convert d to a datetime object
d = datetime.datetime.strptime(d, '%Y-%m-%d').date()

val = labeled.loc[labeled.Date == r_date]['Cat.'].values[0]
val

2