In [31]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# import data handling tools
import pandas as pd
import numpy as np
# import data visualization tools
import matplotlib.pyplot as plt
import seaborn as sns


## Load Data

#### News

In [32]:
# Reddit Data
df_Crypto_Currency_News = pd.read_csv('../Load/Reddit/comments/Crypto_Currency_News_comments.csv')
df_CryptoCurrencies = pd.read_csv('../Load/Reddit/comments/CryptoCurrencies_comments.csv')
df_CryptoCurrency = pd.read_csv('../Load/Reddit/comments/CryptoCurrency_comments.csv')
df_Cryptomarkets = pd.read_csv('../Load/Reddit/comments/Cryptomarkets_comments.csv')
df_eth = pd.read_csv('../Load/Reddit/comments/eth_comments.csv')
df_ethfinance = pd.read_csv('../Load/Reddit/comments/ethfinance_comments.csv')
df_ethtrader = pd.read_csv('../Load/Reddit/comments/ethtrader_comments.csv')

df_reddit = [df_Crypto_Currency_News, df_CryptoCurrencies, df_CryptoCurrency, df_Cryptomarkets, df_eth, df_ethfinance, df_ethtrader]

#### Financial Data

In [33]:
# Yahoo Data
df_yahoo = pd.read_csv('../Load/datasets/Yahoo/ETH-USD.csv', delimiter=';')
# Binanace Data
df_BNB_ETH = pd.read_csv('../Load/datasets/Binance/BNB-ETH-klines.csv', delimiter=';')
df_ETH_BTC = pd.read_csv('../Load/datasets/Binance/ETH-BTC-klines.csv', delimiter=';')
df_ETH_USDT = pd.read_csv('../Load/datasets/Binance/ETH-USDT-klines.csv', delimiter=';')
df_XRP_ETH = pd.read_csv('../Load/datasets/Binance/XRP-ETH-klines.csv', delimiter=';')

df_binance = [df_ETH_USDT, df_ETH_BTC, df_BNB_ETH, df_XRP_ETH]

## Transform Dataframes

#### Reddit

In [34]:
for df in df_reddit:
    # Drop columns we don't need
    df.drop(['id', 'num_comments', 'permalink', 'score', 'subreddit'], axis=1, inplace=True)
    # Change created_utc format to day/month/year
    df['date'] = pd.to_datetime(df['created_utc'])
    df['date'] = df['date'].dt.strftime('%#d/%#m/%Y')
    # Join title and comments into one column
    df['title'] = df['title'].astype(str)
    df['comment_1'] = df['comment_1'].astype(str)
    df['comment_2'] = df['comment_2'].astype(str)
    df['comment_3'] = df['comment_3'].astype(str)
    df['text'] = df['title'] + '. ' + df['comment_1'] + '. ' + df['comment_2'] + '. ' + df['comment_3']
    # Drop columns
    df.drop(['created_utc','title', 'comment_1', 'comment_2', 'comment_3'], axis=1, inplace=True)
    
print(df_Crypto_Currency_News.head())

        date                                               text
0  1/10/2019  Divi Project Update September 2019 Month in Re...
1  1/10/2019  Interview with the CEO of DIVI Geoff McCabe by...
2  1/10/2019  Is Wall Street Finally Learning Not To Panic O...
3  1/10/2019  In China, a large mining farm burned down. Dam...
4  1/10/2019  Bitcoin Bull Run's 'Likely Target' is $200-300...


Make a list of all the Dates in our Range

In [35]:
from datetime import datetime,timedelta

start_date = datetime.strptime("1/10/2019","%d/%m/%Y")
end_date = datetime.strptime("30/9/2022","%d/%m/%Y")

# This will create a list with complete dates
completeDates = [start_date + timedelta(days=x) for x in range(0,(end_date-start_date ).days + 1)]
# reformat completeDates from 2019-10-01 00:00:00 to 1/10/2019
completeDates = [date.strftime('%#d/%#m/%Y') for date in completeDates]

Reshape Reddit Dataframes to have one row per Date, with a column for each text

In [36]:
# create dummy dataframes for the 7 reddit dataframes
columns = ['date', 'text_1', 'text_2', 'text_3', 'text_4', 'text_5', 'text_6', 'text_7', 'text_8', 'text_9', 'text_10']
df_Crypto_Currency_News_f = pd.DataFrame(columns=columns)
df_CryptoCurrencies_f = pd.DataFrame(columns=columns)
df_CryptoCurrency_f = pd.DataFrame(columns=columns)
df_Cryptomarkets_f = pd.DataFrame(columns=columns)
df_eth_f = pd.DataFrame(columns=columns)
df_ethfinance_f = pd.DataFrame(columns=columns)
df_ethtrader_f = pd.DataFrame(columns=columns)

# create list of dataframes
df_reddit_f = [df_Crypto_Currency_News_f, df_CryptoCurrencies_f, df_CryptoCurrency_f, df_Cryptomarkets_f, df_eth_f, df_ethfinance_f, df_ethtrader_f]

# get dates from completeDates list
for df in df_reddit_f:
    df['date'] = completeDates

print(df_Crypto_Currency_News_f.head())


        date text_1 text_2 text_3 text_4 text_5 text_6 text_7 text_8 text_9  \
0  1/10/2019    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
1  2/10/2019    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
2  3/10/2019    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
3  4/10/2019    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
4  5/10/2019    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   

  text_10  
0     NaN  
1     NaN  
2     NaN  
3     NaN  
4     NaN  


In [37]:
# lets try to fill Crypto_Currency_News_f with data from Crypto_Currency_News
# run trough all dates in df_Crypto_Currency_News_f

# run for loop for the first 10 dates
for i in range(0,10):
    # get the date from df_Crypto_Currency_News_f
    date = df_Crypto_Currency_News_f['date'][i]
    # run trough all dates in df_Crypto_Currency_News
    for j in range(0,len(df_Crypto_Currency_News)):
        # get the date from df_Crypto_Currency_News
        date2 = df_Crypto_Currency_News['date'][j]
        # if the dates are the same
        if date == date2:
            # get the text from df_Crypto_Currency_News
            text = df_Crypto_Currency_News['text'][j]
            # get the column name
            column = 'text_' + str(i+1)
            # fill the column with the text
            df_Crypto_Currency_News_f[column][i] = text

for index, row in df_Crypto_Currency_News_f.iterrows():
    # run trough all dates in df_Crypto_Currency_News
    for index2, row2 in df_Crypto_Currency_News.iterrows():
        # if date in df_Crypto_Currency_News_f is equal to date in df_Crypto_Currency_News
        if row['date'] == row2['date']:
            # add text from df_Crypto_Currency_News to df_Crypto_Currency_News_f
            # add text to first empty column
            if pd.isnull(row['text_1']):
                df_Crypto_Currency_News_f.at[index, 'text_1'] = row2['text']
            elif pd.isnull(row['text_2']):
                df_Crypto_Currency_News_f.at[index, 'text_2'] = row2['text']
            elif pd.isnull(row['text_3']):
                df_Crypto_Currency_News_f.at[index, 'text_3'] = row2['text']
            elif pd.isnull(row['text_4']):
                df_Crypto_Currency_News_f.at[index, 'text_4'] = row2['text']
            elif pd.isnull(row['text_5']):
                df_Crypto_Currency_News_f.at[index, 'text_5'] = row2['text']
            elif pd.isnull(row['text_6']):
                df_Crypto_Currency_News_f.at[index, 'text_6'] = row2['text']
            elif pd.isnull(row['text_7']):
                df_Crypto_Currency_News_f.at[index, 'text_7'] = row2['text']
            elif pd.isnull(row['text_8']):
                df_Crypto_Currency_News_f.at[index, 'text_8'] = row2['text']
            elif pd.isnull(row['text_9']):
                df_Crypto_Currency_News_f.at[index, 'text_9'] = row2['text']
            elif pd.isnull(row['text_10']):
                df_Crypto_Currency_News_f.at[index, 'text_10'] = row2['text']
            else:
                # if all columns are filled or there is no text for that date, do nothing
                pass
            # break loop
            break

In [41]:
eth_test = pd.DataFrame(columns=['date', 'text_1', 'text_2', 'text_3', 'text_4', 'text_5', 'text_6', 'text_7', 'text_8', 'text_9', 'text_10'])
set_date_eth = completeDates
for df in df_reddit_f:
    cc = 0
    for day in df['date']:
        text_list = []
        cc+=1

        c = 0 
        for i, day_df in enumerate(df_eth['date'].tolist()):
            if day == day_df:
                c+=1 
                if df_eth['text'][i]:
                    text_list.append(df_eth['text'][i])
                else:
                    text_list.append('no_text')
            if c < 10:
                for i in range(c, 10):
                    text_list.append('no_text')

        eth_test = eth_test.append({'date': day, 
                'text_1': text_list[0], 'text_2': text_list[1], 
                'text_3': text_list[2], 'text_4': text_list[3], 
                'text_5': text_list[4], 'text_6': text_list[5], 
                'text_7': text_list[6], 'text_8': text_list[7], 
                'text_9': text_list[8], 'text_10': text_list[9]}, ignore_index=True)
        print(eth_test)
    break

eth_test.head()

        date                                             text_1   text_2  \
0  1/10/2019  Gradbase putting qualifications on the Ethereu...  no_text   

    text_3   text_4   text_5   text_6   text_7   text_8   text_9  text_10  
0  no_text  no_text  no_text  no_text  no_text  no_text  no_text  no_text  
        date                                             text_1   text_2  \
0  1/10/2019  Gradbase putting qualifications on the Ethereu...  no_text   
1  2/10/2019                                            no_text  no_text   

    text_3   text_4   text_5   text_6   text_7   text_8   text_9  text_10  
0  no_text  no_text  no_text  no_text  no_text  no_text  no_text  no_text  
1  no_text  no_text  no_text  no_text  no_text  no_text  no_text  no_text  
        date                                             text_1   text_2  \
0  1/10/2019  Gradbase putting qualifications on the Ethereu...  no_text   
1  2/10/2019                                            no_text  no_text   
2  3/10/20

KeyboardInterrupt: 

In [None]:
# Get your list from data frame index, and remove hours
myDates = df_yahoo['date'].tolist()

# Is possible that your dates are in datetime obj or in string
# If string
myDates = [d.split()[0] for d in myDates]

# Creates a list with missing data
missingDates = [d for d in completeDates if d not in myDates]

In [None]:
print(len(missingDates))
print(missingDates)

0
[]


#### Yahoo & Binance

In [None]:
# No changes needded for Yahoo data
print(df_yahoo.head())

        date   adj_close      volume
0  1/10/2019  177.340424  7676276225
1  2/10/2019  180.710510  6335595250
2  3/10/2019  175.199341  6381403725
3  4/10/2019  176.985001  6248928449
4  5/10/2019  176.351517  5837211771


In [None]:
for df in df_binance:
    # Drop columns we don't need
    df.drop(['quote_asset_volume', 'taker_buy_quote_asset_volume'], axis=1, inplace=True)
    # For df_ETH_USDT we also drop close, as we have the equivalent in df_yahoo
    if df is df_ETH_USDT:
        df.drop(['close'], axis=1, inplace=True)

# Rename columns
df_ETH_USDT.rename(columns={'num_trades':'num_trades_USDT'}, inplace=True)

df_ETH_BTC.rename(columns={'close':'close_BTC'}, inplace=True)
df_ETH_BTC.rename(columns={'num_trades':'num_trades_BTC'}, inplace=True)

df_BNB_ETH.rename(columns={'close':'close_BNB'}, inplace=True)
df_BNB_ETH.rename(columns={'num_trades':'num_trades_BNB'}, inplace=True)

df_XRP_ETH.rename(columns={'close':'close_XRP'}, inplace=True)
df_XRP_ETH.rename(columns={'num_trades':'num_trades_XRP'}, inplace=True)


print(df_ETH_BTC.head())

        date  close_BTC  num_trades_BTC
0  1/10/2019   0.021182          133240
1  2/10/2019   0.021566           92471
2  3/10/2019   0.021241           87748
3  4/10/2019   0.021580           87725
4  5/10/2019   0.021682           93049


In [None]:
# Join yahoo and binance dataframes on equal dates
df_finance = pd.merge(df_yahoo, df_ETH_USDT, on='date', how='left')
df_finance = pd.merge(df_finance, df_ETH_BTC, on='date', how='left')
df_finance = pd.merge(df_finance, df_BNB_ETH, on='date', how='left')
df_finance = pd.merge(df_finance, df_XRP_ETH, on='date', how='left')

print(df_finance.head())

        date   adj_close      volume  num_trades_USDT  close_BTC  \
0  1/10/2019  177.340424  7676276225           168492   0.021182   
1  2/10/2019  180.710510  6335595250            96660   0.021566   
2  3/10/2019  175.199341  6381403725           101114   0.021241   
3  4/10/2019  176.985001  6248928449            89655   0.021580   
4  5/10/2019  176.351517  5837211771            90341   0.021682   

   num_trades_BTC  close_BNB  num_trades_BNB  close_XRP  num_trades_XRP  
0          133240   0.090262           11802        NaN             NaN  
1           92471   0.088028            7465        NaN             NaN  
2           87748   0.089504            7160        NaN             NaN  
3           87725   0.089029            6246        NaN             NaN  
4           93049   0.089143            5373        NaN             NaN  
