## Download and import vader

In [1]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()
nltk.downloader.download('vader_lexicon')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yidanzhang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Import packages

In [2]:
import matplotlib.pyplot as plt
import os
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## Read all data from news.csv with head (已整理Title, Date, Time) 
## TODO: 重新整理content

In [3]:
sample_list = ["Title", "Date", "Time", "Content"]
sample_data = pd.read_csv('news_updated.csv', usecols=sample_list).astype(str) 
sample_data.head()

Unnamed: 0,Title,Date,Time,Content
0,Bitcoin price hits $34K as trader forecasts fr...,2021-07-24,15:46:15,"<p>Bitcoin (<a href=""https://cointelegraph.com..."
1,Bitcoin price hints at 'megaphone' bottom patt...,2021-07-24,13:42:02,"<p>Bitcoin's (<a href=""https://cointelegraph.c..."
2,Just HODL! Bitcoin and Ethereum outperform ‘lo...,2021-07-24,01:05:00,"<p>In the past two decades, index and exchange..."
3,"Price analysis 7/23: BTC, ETH, BNB, ADA, XRP, ...",2021-07-23,18:26:09,"<p>Bitcoin’s (<a href=""https://cointelegraph.c..."
4,Bitcoin payments for real estate gain traction...,2021-07-23,17:15:24,<p>Crypto investors are betting big on real es...


## Calculate Sentiment Scores

In [4]:
vader = SentimentIntensityAnalyzer()
content = sample_data['Content']
content = str(content).encode('utf-8')
scores = sample_data['Content'].apply(lambda content:vader.polarity_scores(content)).tolist()
scores_df = pd.DataFrame(scores)
scores_df.head()

Unnamed: 0,compound,neg,neu,pos
0,0.9811,0.011,0.903,0.085
1,0.8581,0.064,0.868,0.068
2,0.9961,0.031,0.841,0.128
3,0.9951,0.067,0.84,0.093
4,0.9966,0.012,0.905,0.083


## Add scores to original data frame

In [5]:
scored_news = sample_data.join(scores_df, rsuffix='_right')
scored_news.head()

Unnamed: 0,Title,Date,Time,Content,compound,neg,neu,pos
0,Bitcoin price hits $34K as trader forecasts fr...,2021-07-24,15:46:15,"<p>Bitcoin (<a href=""https://cointelegraph.com...",0.9811,0.011,0.903,0.085
1,Bitcoin price hints at 'megaphone' bottom patt...,2021-07-24,13:42:02,"<p>Bitcoin's (<a href=""https://cointelegraph.c...",0.8581,0.064,0.868,0.068
2,Just HODL! Bitcoin and Ethereum outperform ‘lo...,2021-07-24,01:05:00,"<p>In the past two decades, index and exchange...",0.9961,0.031,0.841,0.128
3,"Price analysis 7/23: BTC, ETH, BNB, ADA, XRP, ...",2021-07-23,18:26:09,"<p>Bitcoin’s (<a href=""https://cointelegraph.c...",0.9951,0.067,0.84,0.093
4,Bitcoin payments for real estate gain traction...,2021-07-23,17:15:24,<p>Crypto investors are betting big on real es...,0.9966,0.012,0.905,0.083


In [6]:
group_scored_news = scored_news.groupby("Date")
mean_scored_news = group_scored_news.mean()
mean_scored_news = mean_scored_news.reset_index()

In [7]:
mean_scored_news.head()

Unnamed: 0,Date,compound,neg,neu,pos
0,2018-12-25,-0.04795,0.0435,0.918,0.0385
1,2018-12-26,0.687233,0.026833,0.912333,0.061
2,2018-12-27,-0.20394,0.0698,0.8814,0.0488
3,2018-12-28,0.38565,0.048333,0.889167,0.0625
4,2018-12-29,-0.089475,0.06525,0.887,0.04825


## Read k_line.csv (price)

In [10]:
col_list_price = ["Time", "Price", "Volume", "Change"]
parsed_price_data = pd.read_csv('price_volume.csv')
parsed_price_data.head()

Unnamed: 0,Time,Price,Volume,Change
0,1514736000,13371.5,293805,-0.32
1,1514736060,13405.5,219089,-0.25
2,1514736120,13450.0,651858,-0.33
3,1514736180,13528.0,1433129,-0.58
4,1514736240,13616.0,4301649,-0.65


In [11]:
from datetime import datetime
parsed_price_data['Time'] = [datetime.fromtimestamp(x) for x in parsed_price_data['Time']]
parsed_price_data[['Date','Time']] = parsed_price_data['Time'].astype(str).str.split(' ',expand=True)
parsed_price_data

Unnamed: 0,Time,Price,Volume,Change,Date
0,08:00:00,13371.5,293805,-0.32,2017-12-31
1,08:01:00,13405.5,219089,-0.25,2017-12-31
2,08:02:00,13450.0,651858,-0.33,2017-12-31
3,08:03:00,13528.0,1433129,-0.58,2017-12-31
4,08:04:00,13616.0,4301649,-0.65,2017-12-31
5,08:05:00,13683.0,2993575,-0.49,2017-12-31
6,08:06:00,13705.0,3148908,-0.16,2017-12-31
7,08:07:00,13687.0,4065627,0.13,2017-12-31
8,08:08:00,13690.0,2341692,-0.02,2017-12-31
9,08:09:00,13705.5,884267,-0.11,2017-12-31


In [17]:
volume_data = parsed_price_data[['Date', ' Volume']]
group_volume_data = volume_data.groupby("Date")
sum_volume = group_volume_data.sum()
sum_volume = sum_volume.reset_index()

In [18]:
sum_volume.head()

Unnamed: 0,Date,Volume
0,2017-12-31,454478678
1,2018-01-01,680516250
2,2018-01-02,944152848
3,2018-01-03,817351695
4,2018-01-04,698479219


In [54]:
parsed_circulating_data = pd.read_csv('total-bitcoins.csv')
parsed_circulating_data.head()

Unnamed: 0,Timestamp,total-bitcoins
0,2009-01-03 17:15:05,50.0
1,2009-01-14 12:25:25,22750.0
2,2009-01-18 11:33:45,45450.0
3,2009-01-22 18:27:35,68150.0
4,2009-01-25 17:15:02,90850.0


In [62]:
from datetime import datetime
parsed_circulating_data['Timestamp'] = [datetime.fromtimestamp(x) for x in parsed_price_data['Timestamp']]

KeyError: 'Timestamp'

## 截取疫情前后两个时间段的数据 (news, volume, circulating)
* 2019/01/01 - 2019/06/30 --> mask1
* 2020/01/01 - 2020/06/30 --> mask2

### 2019/01/01 - 2019/06/30

In [21]:
mean_scored_news['Date'] = pd.to_datetime(mean_scored_news['Date'])
mask1 = (mean_scored_news['Date'] >= '2019-1-1') & (mean_scored_news['Date'] <= '2019-6-30')
train_data1 = mean_scored_news.loc[mask1] #Index(['Date', 'compound', 'neg', 'neu', 'pos'], dtype='object')
train_data1.head()

Unnamed: 0,Date,compound,neg,neu,pos
7,2019-01-01,-0.284267,0.0435,0.924667,0.031667
8,2019-01-02,0.041388,0.043125,0.90975,0.047375
9,2019-01-03,0.375067,0.040667,0.888333,0.0715
10,2019-01-04,0.32885,0.039,0.9275,0.0335
11,2019-01-05,0.331,0.0505,0.8935,0.056


In [23]:
sum_volume['Date'] = pd.to_datetime(sum_volume['Date'])
price_mask1 = (sum_volume['Date'] >= '2019-1-1') & (sum_volume['Date'] <= '2019-6-30')
train_volume_data1 = sum_volume.loc[price_mask1]
train_volume_data1

Unnamed: 0,Date,Volume
366,2019-01-01,979039368
367,2019-01-02,1097663564
368,2019-01-03,889059787
369,2019-01-04,923185106
370,2019-01-05,640874439
371,2019-01-06,1135382157
372,2019-01-07,983979089
373,2019-01-08,1299810701
374,2019-01-09,1300895254
375,2019-01-10,1288240535


In [59]:
parsed_circulating_data['Date'] = pd.to_datetime(parsed_circulating_data['Timestamp'])
circulating_mask1 = (parsed_circulating_data['Date'] >= '2019-1-1') & (parsed_circulating_data['Date'] <= '2019-6-30')
train_circulating_data1 = parsed_circulating_data.loc[price_mask1]


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match

### 2020/01/01 - 2020/06/30

In [118]:
scored_news['Date'] = pd.to_datetime(scored_news['Date'])
mask2 = (scored_news['Date'] >= '2020-1-1') & (scored_news['Date'] <= '2020-6-30')
train_data2 = scored_news.loc[mask2]
train_data2

Unnamed: 0,Title,Date,Time,compound,neg,neu,pos
3915,Survey Shows 42% of Investors Expect a $15K Bi...,2020-06-30,22:03:00,0.0000,0.000,1.000,0.000
3916,3 Reasons Why Bitcoin Price Recorded Its Third...,2020-06-30,20:17:00,0.6369,0.000,0.704,0.296
3917,Bitcoin Scam Exposes Thousands to Data Breach,2020-06-30,16:55:00,-0.6369,0.510,0.490,0.000
3918,An Israeli Blockchain Startup Claims It’s Inve...,2020-06-30,15:00:00,0.0000,0.000,1.000,0.000
3919,Max Keiser: US Hash Rate War With Iran Can Sen...,2020-06-30,14:20:00,-0.5994,0.231,0.769,0.000
3920,Gold vs. Bitcoin: $2.8B Fake Bullion Scam High...,2020-06-30,10:13:00,-0.6369,0.415,0.427,0.159
3921,Bitcoin Price Bulls Pin Hopes on Descending Ch...,2020-06-30,07:43:00,0.4215,0.000,0.781,0.219
3922,Bitcoin’s ROI Since 2015 Outperforms Five Majo...,2020-06-30,05:44:00,0.0000,0.000,1.000,0.000
3923,Researchers Say New Lightning Network Attack C...,2020-06-29,22:38:00,-0.2500,0.254,0.574,0.172
3924,Bitcoin as a Tool to Fight Authoritarian Regim...,2020-06-29,22:18:00,-0.5719,0.343,0.657,0.000


In [120]:
parsed_price_data['Date'] = pd.to_datetime(parsed_price_data['Date'])
price_mask2 = (parsed_price_data['Date'] >= '2020-1-1') & (parsed_price_data['Date'] <= '2020-6-30')
train_price_data2 = parsed_price_data.loc[price_mask2]
train_price_data2

Unnamed: 0,Time,Price,Date
1056201,00:00:00,7183.0,2020-01-01
1056202,00:01:00,7180.5,2020-01-01
1056203,00:02:00,7180.5,2020-01-01
1056204,00:03:00,7180.5,2020-01-01
1056205,00:04:00,7180.5,2020-01-01
1056206,00:05:00,7180.5,2020-01-01
1056207,00:06:00,7180.5,2020-01-01
1056208,00:07:00,7181.0,2020-01-01
1056209,00:08:00,7180.5,2020-01-01
1056210,00:09:00,7180.5,2020-01-01
