In [36]:
import pandas as pd
import json
import requests
import glob
import os
from sqlalchemy import create_engine
from bs4 import BeautifulSoup


## Merge the Data

We'll be concatenating all the dataframes, droping any duplicated entries and observations with missing data

In [37]:
pd.read_csv('file_data_df_final.csv', index_col='timestamp').sort_index().iloc[[-1]].index

Int64Index([1355317620], dtype='int64', name='timestamp')

In [38]:
pd.read_csv('database_data.csv', index_col='timestamp').sort_index().iloc[[-1]].index

Int64Index([1351995420], dtype='int64', name='timestamp')

In [39]:
#html= pd.read_csv('data_from_html.csv', index_col='Timestamp').sort_index().iloc[[-1]].index
html= pd.read_csv('data_from_html.csv', index_col='Timestamp')
html = html.drop(columns =["Unnamed: 0.1",'Unnamed: 0'], axis =1)
html.columns = html.columns.str.lower()

html.to_csv('website_data.csv')

In [46]:
pd.read_csv('api_data.csv', index_col='Timestamp')

Unnamed: 0_level_0,rsi10,%k200,%k30,ema10
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1325346600,,,,
1325350740,,,,
1325350800,,,,
1325391360,,,,
1325431680,,,,
...,...,...,...,...
1351989660,,,,10.655024
1351991700,,,,10.617747
1351994460,,,,10.621793
1351994760,,,,10.623285


In [40]:
df = pd.concat([
    pd.read_csv('api_data.csv', index_col='Timestamp'),
    pd.read_csv('file_data_df_final.csv', index_col='timestamp'),
    pd.read_csv('database_data.csv', index_col='timestamp'),
    pd.read_csv('website_data.csv', index_col='Timestamp')
]).sort_index()
df

Unnamed: 0,rsi10,%k200,%k30,ema10,open,high,low,close,volume_(btc),volume_(currency),weighted_price,volumn_square,ma10,rsi30,%k10,ema200,ma200,rsi200
1325317920,,,,,,,,,,,,,,,,,,
1325317920,,,,,,,,,,,,,,,,,,
1325317920,,,,,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,,,,,,
1325317980,,,,,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,,,,,,
1325318040,,,,,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1355317380,,,,,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,,,,,,
1355317440,,,,,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,,,,,,
1355317500,,,,,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,,,,,,
1355317560,,,,,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,,,,,,


In [41]:
# so lets fill the null values (within the same column and the same id) 
# with any non null value for the same id
df = df.fillna(method='ffill').fillna(method='bfill').sort_index()
df

Unnamed: 0,rsi10,%k200,%k30,ema10,open,high,low,close,volume_(btc),volume_(currency),weighted_price,volumn_square,ma10,rsi30,%k10,ema200,ma200,rsi200
1325317920,85.365854,,100.000000,4.941577,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,4.781,78.025478,80.645161,6.787881,6.5341,59.016393
1325317920,85.365854,,100.000000,4.941577,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,4.781,78.025478,80.645161,6.787881,6.5341,59.016393
1325317920,85.365854,,100.000000,4.941577,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,4.781,78.025478,80.645161,6.787881,6.5341,59.016393
1325317980,85.365854,,100.000000,4.941577,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,4.781,78.025478,80.645161,6.787881,6.5341,59.016393
1325318040,85.365854,,100.000000,4.941577,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,4.781,78.025478,80.645161,6.787881,6.5341,59.016393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1355317380,52.948358,,91.304348,10.629961,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,10.639,51.737293,91.304348,10.624793,10.5949,49.925356
1355317440,52.948358,,91.304348,10.629961,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,10.639,51.737293,91.304348,10.624793,10.5949,49.925356
1355317500,52.948358,,91.304348,10.629961,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,10.639,51.737293,91.304348,10.624793,10.5949,49.925356
1355317560,52.948358,,91.304348,10.629961,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,10.639,51.737293,91.304348,10.624793,10.5949,49.925356


In [42]:
# then keep only 1 row per id
df = df.groupby(df.index).first()
df

Unnamed: 0,rsi10,%k200,%k30,ema10,open,high,low,close,volume_(btc),volume_(currency),weighted_price,volumn_square,ma10,rsi30,%k10,ema200,ma200,rsi200
1325317920,85.365854,,100.000000,4.941577,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,4.781,78.025478,80.645161,6.787881,6.5341,59.016393
1325317980,85.365854,,100.000000,4.941577,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,4.781,78.025478,80.645161,6.787881,6.5341,59.016393
1325318040,85.365854,,100.000000,4.941577,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,4.781,78.025478,80.645161,6.787881,6.5341,59.016393
1325318100,85.365854,,100.000000,4.941577,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,4.781,78.025478,80.645161,6.787881,6.5341,59.016393
1325318160,85.365854,,100.000000,4.941577,4.39,4.39,4.39,4.39,0.455581,2.000,4.39,4.079757,4.781,78.025478,80.645161,6.787881,6.5341,59.016393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1355317380,52.948358,,91.304348,10.629961,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,10.639,51.737293,91.304348,10.624793,10.5949,49.925356
1355317440,52.948358,,91.304348,10.629961,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,10.639,51.737293,91.304348,10.624793,10.5949,49.925356
1355317500,52.948358,,91.304348,10.629961,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,10.639,51.737293,91.304348,10.624793,10.5949,49.925356
1355317560,52.948358,,91.304348,10.629961,13.34,13.34,13.34,13.34,3.900000,52.026,13.34,0.490000,10.639,51.737293,91.304348,10.624793,10.5949,49.925356


## Create Train Dataset

In [43]:
df.to_csv("merge_data.csv")