In [1]:
import pandas as pd
import numpy as np
import pywt
pd.options.display.float_format = '{:.2f}'.format

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import missingno as msno
plt.rc("figure", figsize=(18,5))

import warnings
warnings.filterwarnings('ignore')

In [2]:
def load_df(name, data_dir='/home/filip/Desktop/Ruđer/MinuteData/',
            start="2014-01-01 01:00:00", end="2019-08-12 12:47:00"):
    """Load one exchange's 1-minute CSV and left-join it onto a gap-free minute grid.

    Parameters
    ----------
    name : str
        File-name prefix; the file read is ``<data_dir>/<name>_1-min_data.csv``.
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to the original
        hard-coded location.
    start, end : str, optional
        Bounds of the minute grid (both inclusive). Rows of the CSV whose
        timestamp is not strictly later than ``start`` are discarded.

    Returns
    -------
    pandas.DataFrame
        One row per minute of the grid; the CSV columns are NaN for minutes
        that have no matching row in the file.
    """
    import os  # local import so this cell does not depend on the import cell

    # 'min' is the long-standing minute alias; 'T' is deprecated in recent pandas.
    time = pd.DataFrame({"Timestamp": pd.date_range(start=start, end=end, freq='min')})

    df = pd.read_csv(os.path.join(data_dir, name + '_1-min_data.csv'))
    # Cast to the grid's dtype as well, so both merge keys share one resolution.
    df["Timestamp"] = df["Timestamp"].astype('M8[s]').astype(time["Timestamp"].dtype)
    df = df.loc[df["Timestamp"] > start]

    return pd.merge(time, df, on='Timestamp', how='left')

def append_suffix(df, name=None):
    """Rename every column except ``Timestamp`` to ``<column>_<name>`` (in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are rewritten; it is modified in place.
    name : str, optional
        Suffix to append. When omitted, a module-level variable called
        ``name`` is used, which is what the function silently relied on
        before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with renamed columns.

    Raises
    ------
    NameError
        If ``name`` is not passed and no global ``name`` is defined.
    """
    if name is None:
        # Legacy call style: append_suffix(df) with a global `name` in the kernel.
        if "name" not in globals():
            raise NameError("append_suffix() needs a `name` argument (no global `name` is defined)")
        name = globals()["name"]

    df.columns = [
        "Timestamp" if str(col) == "Timestamp" else str(col) + "_" + name
        for col in df.columns
    ]
    return df

# Plots the monthly number of missing and existing data points
def plot_missing(df, column):
    """Draw, per calendar month, how many rows of ``column`` are null and how many are not.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Timestamp`` column (used as the grouping index) and
        ``column``. It is not modified.
    column : str
        Column whose null / non-null rows are counted.

    The two monthly count series (missing first, then present) are drawn with
    ``Series.plot`` and shown with a single ``plt.show()``. A series is
    skipped when it is empty, i.e. when no row is missing or no row is
    present.
    """
    is_missing = df[column].isnull()
    for mask in (is_missing, ~is_missing):
        # .assign builds a new frame, so no column is ever written onto a slice of `df`.
        subset = df.loc[mask].assign(Counter=1)
        vals = subset.set_index("Timestamp").groupby(pd.Grouper(freq='M')).count()["Counter"]
        if vals.empty:
            # Plotting an empty Series raises in pandas; this replaces the old bare `except: pass`.
            continue
        vals.plot(x="Timestamp",y="Counter")
    plt.show()
    
# Wavelet-transform based low-pass filter
def lowpassfilter(signal, thresh = 0.01, wavelet='haar', level = 1):
    """Denoise ``signal`` by soft-thresholding its wavelet detail coefficients.

    Parameters
    ----------
    signal : array-like
        One-dimensional input passed to ``pywt.wavedec``.
    thresh : float, optional
        Fraction of ``np.nanmax(coeff[1])`` used as the threshold value.
    wavelet : str, optional
        Wavelet name handed to PyWavelets.
    level : int, optional
        Decomposition level.

    Returns
    -------
    pandas.DataFrame
        Single-column frame with the reconstructed signal, holding at most
        ``len(signal)`` rows.
    """
    coeff = pywt.wavedec(signal, wavelet, mode="per",level=level)
    # NOTE(review): the threshold scales the signed maximum of coeff[1], not the
    # maximum absolute value - confirm this is intended.
    cutoff = thresh*np.nanmax(coeff[1])
    coeff[1:] = [pywt.threshold(band, value=cutoff, mode="soft") for band in coeff[1:]]
    reconstructed_signal = pywt.waverec(coeff, wavelet, mode="per")
    # Periodization can return more samples than were passed in (odd-length
    # input is padded); drop the surplus so the output lines up with `signal`.
    return pd.DataFrame(reconstructed_signal[:len(signal)])
    
# Create additional features
def create_column_features(df, column, thresh):
    """Add denoised and percentage-change features for ``column`` and plot them.

    Four columns are written onto ``df`` (which is modified in place and
    also returned):

    * ``Denoised_Price_1_<column>`` - ``lowpassfilter`` output at its default level
    * ``Denoised_Price_2_<column>`` - ``lowpassfilter`` output at level 2
    * ``Price_Change_<column>`` - ``pct_change`` of the raw column
    * ``Denoised_Price_Change_<column>`` - ``pct_change`` of the level-1 denoised column

    In both change columns, values greater than 10 are replaced by 0.
    Two figures are shown: raw vs. denoised values, and the two change series.
    """
    denoised_l1 = "Denoised_Price_1_"+column
    denoised_l2 = "Denoised_Price_2_"+column
    raw_change = "Price_Change_"+column
    denoised_change = "Denoised_Price_Change_"+column

    df[denoised_l1] = lowpassfilter(df[column], thresh)
    df[denoised_l2] = lowpassfilter(df[column], level=2, thresh=thresh)

    # (target column, source column) pairs, processed in the original order.
    for target, source in ((raw_change, column), (denoised_change, denoised_l1)):
        df[target] = df[source].pct_change()
        df.loc[df[target] > 10, target] = 0

    for plotted in ([column, denoised_l1, denoised_l2], [raw_change, denoised_change]):
        df.plot(x="Timestamp", y=plotted, figsize=(20,5))
        plt.show()
    return df

# Fills in missing values
def fill_missing(df,ls):
    """Fill NaNs column by column, report the null counts, and drop what is left.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Its columns are overwritten in place with the filled
        versions. It must contain ``Timestamp`` and ``Weighted_Price``, which
        the coverage plot uses.
    ls : collection of int
        Positional indices of the columns whose NaNs are replaced by 0.
        Every other column is filled with ``Series.interpolate()``.

    Returns
    -------
    pandas.DataFrame
        ``df.dropna()`` after filling (the original index labels are kept).

    Null counts are printed before and after filling, and the coverage of
    ``Weighted_Price`` is plotted with ``plot_missing``.
    """
    zero_fill = set(ls)
    print("Before:")
    print(df.isnull().sum())
    for position, col in enumerate(df.columns):
        if position in zero_fill:
            df[col] = df[col].fillna(0)
        else:
            df[col] = df[col].interpolate()
    print("\nAfter:")
    print(df.isnull().sum())
    # Plot the frame being processed; this used to plot the global `coinbase`
    # regardless of which frame was passed in.
    plot_missing(df,"Weighted_Price")
    return df.dropna()

In [None]:
# Load the Coinbase minute data and coerce Timestamp to datetime
# (values that cannot be parsed become NaT).
coinbase = load_df("coinbaseUSD.csv").assign(
    Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"], errors='coerce')
)

# Monthly counts of missing vs. present Weighted_Price rows.
plot_missing(coinbase, "Weighted_Price")
plt.show()

# Weighted_Price over the whole period.
coinbase.plot(x="Timestamp", y="Weighted_Price")
plt.show()

print(coinbase.columns)

In [None]:
# Zero-fill the columns at positions 3, 4 and 8, interpolate every other column,
# drop the rows that still contain NaN, then renumber the index from 0.
# NOTE(review): the positions are hard-coded; which columns they select depends
# on the CSV layout - confirm against the printed column list.
coinbase = fill_missing(coinbase,[3,4,8]).reset_index(drop=True)

In [None]:
# "Dolars" is the element-wise product of Volume and Close.
coinbase["Dolars"] = coinbase["Volume"].mul(coinbase["Close"])

coinbase.plot(x="Timestamp", y=["Dolars"])
plt.show()

coinbase.plot(x="Timestamp", y=["Volume"])

In [None]:
# Build the denoised / percentage-change features for each price-like column,
# using the same 0.05 threshold factor every time.
for feature_source in ("Weighted_Price", "Open", "Close", "Dolars"):
    coinbase = create_column_features(coinbase, feature_source, 0.05)
coinbase.head()

In [None]:
# Interactive view of the level-1 denoised Weighted_Price inside a fixed window
# (both bounds exclusive).
fig = px.line(
    coinbase.loc[
        (coinbase["Timestamp"] > "2018-08-08 02:00:00")
        & (coinbase["Timestamp"] < "2019-01-01 02:00:00")
    ],
    x='Timestamp',
    y='Denoised_Price_1_Weighted_Price',
)
fig.show()

In [None]:
# Interactive view of the denoised Weighted_Price percentage change inside the
# same fixed window (both bounds exclusive).
fig = px.line(
    coinbase.loc[
        (coinbase["Timestamp"] > "2018-08-08 02:00:00")
        & (coinbase["Timestamp"] < "2019-01-01 02:00:00")
    ],
    x='Timestamp',
    y='Denoised_Price_Change_Weighted_Price',
)
fig.show()

In [None]:
# Distribution of Volume restricted to values strictly between 0 and 100.
sns.distplot(
    coinbase.loc[(coinbase["Volume"] < 100) & (coinbase["Volume"] > 0), "Volume"]
)

In [None]:
# Load the Bitstamp minute data and coerce Timestamp to datetime
# (values that cannot be parsed become NaT).
bitstamp = load_df("bitstampUSD.csv").assign(
    Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"], errors='coerce')
)

# Monthly counts of missing vs. present Weighted_Price rows.
plot_missing(bitstamp, "Weighted_Price")
plt.show()

# Weighted_Price over the whole period.
bitstamp.plot(x="Timestamp", y="Weighted_Price")
plt.show()

print(bitstamp.columns)

In [None]:
# Zero-fill the columns at positions 3, 4 and 8, interpolate every other column,
# drop the rows that still contain NaN, then renumber the index from 0.
# NOTE(review): the positions are hard-coded; which columns they select depends
# on the CSV layout - confirm against the printed column list.
bitstamp = fill_missing(bitstamp,[3,4,8]).reset_index(drop=True)

In [None]:
# "Dolars" is the element-wise product of Volume and Close.
bitstamp["Dolars"] = bitstamp["Volume"].mul(bitstamp["Close"])

bitstamp.plot(x="Timestamp", y=["Dolars"])
plt.show()

bitstamp.plot(x="Timestamp", y=["Volume"])

In [None]:
# Build the denoised / percentage-change features for each price-like column,
# using the same 0.05 threshold factor every time.
for feature_source in ("Weighted_Price", "Close", "Open", "Dolars"):
    bitstamp = create_column_features(bitstamp, feature_source, 0.05)
bitstamp.head()

In [None]:
# Interactive view of the level-1 denoised Weighted_Price inside a fixed window
# (both bounds exclusive).
fig = px.line(
    bitstamp.loc[
        (bitstamp["Timestamp"] > "2018-08-08 02:00:00")
        & (bitstamp["Timestamp"] < "2019-01-01 02:00:00")
    ],
    x='Timestamp',
    y='Denoised_Price_1_Weighted_Price',
)
fig.show()

In [None]:
# Interactive view of the denoised Weighted_Price percentage change inside the
# same fixed window (both bounds exclusive).
fig = px.line(
    bitstamp.loc[
        (bitstamp["Timestamp"] > "2018-08-08 02:00:00")
        & (bitstamp["Timestamp"] < "2019-01-01 02:00:00")
    ],
    x='Timestamp',
    y='Denoised_Price_Change_Weighted_Price',
)
fig.show()

In [None]:
# Distribution of Volume restricted to values strictly between 0 and 100.
sns.distplot(
    bitstamp.loc[(bitstamp["Volume"] < 100) & (bitstamp["Volume"] > 0), "Volume"]
)

In [None]:
# Load the Kraken minute data and coerce Timestamp to datetime
# (values that cannot be parsed become NaT).
kraken = load_df("krakenUSD.csv").assign(
    Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"], errors='coerce')
)

# Monthly counts of missing vs. present Weighted_Price rows.
plot_missing(kraken, "Weighted_Price")
plt.show()

# Weighted_Price over the whole period.
kraken.plot(x="Timestamp", y="Weighted_Price")
plt.show()

print(kraken.columns)

In [None]:
# Left-join the Coinbase and Bitstamp frames onto one shared minute grid.
# Both frames carry the same column names, so pandas tells them apart with its
# default "_x" (coinbase) and "_y" (bitstamp) suffixes.
combined = (
    pd.DataFrame({
        "Timestamp": pd.date_range(start="2014-01-01 02:00:00", end="2019-01-01 12:47:00", freq='T')
    })
    .merge(coinbase, left_on='Timestamp', right_on='Timestamp', how='left')
    .merge(bitstamp, left_on='Timestamp', right_on='Timestamp', how='left')
)
print(combined.columns)
combined.tail()

In [None]:
# Write the merged frame to disk as a pickle.
# NOTE(review): absolute, machine-specific path - the notebook will fail on any
# machine where this directory does not exist.
combined.to_pickle("/home/filip/Desktop/Ruđer/FinalData/general.pkl")