In [225]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import datetime

from datetime import timedelta

Please do the following in a jupyter notebook

- There is a csv under the data folder called hw3.csv
- the goal of this hw is to calculate volumeUSD by chain

Some hints for this hw:

- volumeUSD = volume * close
- you will need to clean data
- visualize your data to better understand what is going on

In [226]:
# Load the raw homework data; the path is relative to this notebook's folder
df = pd.read_csv(filepath_or_buffer="../../../Lectures/Lecture 3/data/hw3.csv")

In [227]:
# Preview the first five rows of the raw data as loaded from the CSV
df.head()

Unnamed: 0,ts,open,high,low,close,volume,volumeUSD,token,chain
0,2021-12-02 14:00:00,22.415,22.4913,22.0816,22.3516,31502.151631,,UNI,ETH
1,2021-12-02 23:00:00,4.8043,4.8043,4.7426,4.7806,73686.23,,CRV,ETH
2,2021-12-10 15:00:00,182.49,,175.21,175.86,73736.747,,SOL,SOL
3,2021-12-10 18:00:00,3978.43,3989.74,3932.0,3972.34,18508.040102,,ETH,ETH
4,2021-12-08 21:00:00,193.324,194.242,192.564,193.154,69426.909,,SOL,SOL


In [228]:
# Time-series data: parse `ts` into real datetimes, order the rows
# chronologically and renumber the index from 0
df = (
    df.assign(ts=pd.to_datetime(df['ts']))
      .sort_values('ts', ignore_index=True)
)

In [229]:
# Preview the first five rows again, now ordered by timestamp
df.head()

Unnamed: 0,ts,open,high,low,close,volume,volumeUSD,token,chain
0,2021-12-01,210.312,,208.432,208.676,70031.618,,SOL,SOL
1,2021-12-01,280.59,281.4,278.3,278.7,207.849,,COMP,ETH
2,2021-12-01,257.102,260.775,255.345,257.078,2730.299,,AAVE,ETH
3,2021-12-01,21.2004,21.3115,21.0337,21.2659,12406.133674,,UNI,ETH
4,2021-12-01,57321.41,57451.05,56814.34,56987.97,388.482022,,BTC,BTC


In [230]:
# Let's check the size of the dataframe as (number of rows, number of columns)
df.shape

(2667, 9)

In [231]:
# And the distinct values present in the `chain` column
set(df['chain'])

{'BTC', 'ETH', 'SOL', 'USDT'}

In [232]:
# USD volume of every row: close price multiplied by traded volume
df['volumeUSD'] = df['close'].mul(df['volume'])

In [233]:
# Total USD volume per chain before any cleaning, largest first.
# The column is selected *before* summing: a frame-wide `.sum()` would also try
# to aggregate the datetime `ts` and string `token` columns, which pandas >= 2.0
# no longer drops silently.
volume_initial = (
    df.groupby('chain')['volumeUSD']
      .sum()
      .to_frame()
      .sort_values('volumeUSD', ascending=False)
)
volume_initial.style.format("{:,.0f}")

Unnamed: 0_level_0,volumeUSD
chain,Unnamed: 1_level_1
BTC,46684524066
ETH,20469314784
SOL,7172882528
USDT,1557237812


Let's check whether there are some NaNs for volumeUSD

In [234]:
# Rows for which no USD volume could be computed
df.loc[df['volumeUSD'].isna()]

Unnamed: 0,ts,open,high,low,close,volume,volumeUSD,token,chain
54,2021-12-01 06:00:00,5.2505,5.3002,5.1566,,389798.700000,,CRV,ETH
64,2021-12-01 07:00:00,4707.7400,4750.0000,4691.6200,,8911.031559,,"<span name=""tokenName"">ETH</span>",ETH
115,2021-12-01 12:00:00,264.9130,264.9260,261.8000,,1019.545000,,AAVE,ETH
149,2021-12-01 16:00:00,268.7450,269.1060,264.8280,,2063.599000,,AAVE,ETH
165,2021-12-01 18:00:00,58485.8800,58631.4000,58007.2400,,633.239868,,BTC,BTC
...,...,...,...,...,...,...,...,...,...
2598,2021-12-12 23:00:00,1.0006,,1.0005,,861409.230000,,USDT,USDT
2611,2021-12-13 00:00:00,180.5100,181.1900,,,1016.050000,,AAVE,ETH
2639,2021-12-13 04:00:00,191.8500,192.8700,191.1900,,1046.025000,,COMP,ETH
2662,2021-12-13 06:00:00,3.8700,3.8700,3.8200,,45270.080000,,CRV,ETH


`volumeUSD` is the product of two inputs, the price (`close`) and the `volume`, so a NaN can come from either of them. Let's check both.

In [235]:
# Rows whose USD volume is missing and whose close price is missing too
df.loc[
    df['volumeUSD'].isna()
    & df['close'].isna()
]

Unnamed: 0,ts,open,high,low,close,volume,volumeUSD,token,chain
54,2021-12-01 06:00:00,5.2505,5.3002,5.1566,,389798.700000,,CRV,ETH
64,2021-12-01 07:00:00,4707.7400,4750.0000,4691.6200,,8911.031559,,"<span name=""tokenName"">ETH</span>",ETH
115,2021-12-01 12:00:00,264.9130,264.9260,261.8000,,1019.545000,,AAVE,ETH
149,2021-12-01 16:00:00,268.7450,269.1060,264.8280,,2063.599000,,AAVE,ETH
165,2021-12-01 18:00:00,58485.8800,58631.4000,58007.2400,,633.239868,,BTC,BTC
...,...,...,...,...,...,...,...,...,...
2598,2021-12-12 23:00:00,1.0006,,1.0005,,861409.230000,,USDT,USDT
2611,2021-12-13 00:00:00,180.5100,181.1900,,,1016.050000,,AAVE,ETH
2639,2021-12-13 04:00:00,191.8500,192.8700,191.1900,,1046.025000,,COMP,ETH
2662,2021-12-13 06:00:00,3.8700,3.8700,3.8200,,45270.080000,,CRV,ETH


In [236]:
# Rows whose USD volume is missing and whose traded volume is missing too
df.loc[
    df['volumeUSD'].isna()
    & df['volume'].isna()
]

Unnamed: 0,ts,open,high,low,close,volume,volumeUSD,token,chain


So the problem is only in the close prices. Let's zoom in on a few cases.

In [237]:
# Index labels of the rows whose USD volume is missing because `close` is missing
problem_index = df.loc[df['volumeUSD'].isna() & df['close'].isna()].index

We see that there are 2 types of problems:
- Missing price
- Incorrectly parsed token name

In [245]:
# Show the first five problematic rows.
# `problem_index` holds index *labels*, so select with the label-based `.loc`;
# the positional `.iloc` only returned the right rows because the index happens
# to be a fresh 0..n-1 range after `reset_index`.
df.loc[problem_index[:5]]

Unnamed: 0,ts,open,high,low,close,volume,volumeUSD,token,chain
54,2021-12-01 06:00:00,5.2505,5.3002,5.1566,,389798.7,,CRV,ETH
64,2021-12-01 07:00:00,4707.74,4750.0,4691.62,,8911.031559,,"<span name=""tokenName"">ETH</span>",ETH
115,2021-12-01 12:00:00,264.913,264.926,261.8,,1019.545,,AAVE,ETH
149,2021-12-01 16:00:00,268.745,269.106,264.828,,2063.599,,AAVE,ETH
165,2021-12-01 18:00:00,58485.88,58631.4,58007.24,,633.239868,,BTC,BTC


In [246]:
# Let's fix it

def _strip_token_markup(token):
    '''Return the bare symbol of an HTML-wrapped token name; pass any other value through.'''
    # Non-string values (e.g. NaN for a missing token) cannot carry markup
    if isinstance(token, str) and "span name" in token:
        # '<span name="tokenName">ETH</span>' -> 'ETH'
        inner = token.partition('>')[2].partition('<')[0]
        # Keep the original text if nothing could be extracted
        return inner if inner else token
    return token


def fix_prices(problem_index, df):
    '''
    Replace NaN close prices with the preceding known close of the same
    chain/token pair and recompute volumeUSD.

    HTML-wrapped token names (e.g. '<span name="tokenName">ETH</span>') are
    reduced to the bare symbol on every row first, so that rows of the same
    token can be matched with each other regardless of how the name was parsed.

    "Preceding" means the non-NaN close with the latest timestamp strictly
    earlier than the problem row; among rows sharing that timestamp the last
    one in frame order is used. Rows in problem_index are processed in the
    given order, so a price filled for one row can serve a later row.

    Args:
    problem_index (list) - list of index labels with NaN prices
                           (labels are assumed to be unique in df)
    df (DataFrame) - initial dataframe with columns ts, chain, token, close
                     and volume; it is left unmodified

    Outputs:
    df_adj (DataFrame) - copy of df with cleaned token names, fixed prices and
                         recalculated volumeUSD. Rows without any earlier known
                         price keep their NaN close; a message is printed for
                         each of them.
    '''

    # Work on a copy so the caller's frame is never changed behind its back
    df_adj = df.copy()

    # Remove broken token names everywhere, not only on the problem rows
    df_adj['token'] = df_adj['token'].map(_strip_token_markup)

    for i in problem_index:
        ts_tmp = df_adj.loc[i, 'ts']
        chain_tmp = df_adj.loc[i, 'chain']
        token_tmp = df_adj.loc[i, 'token']

        # Earlier rows of the same chain/token that actually carry a price
        candidates = df_adj.loc[(df_adj['ts'] < ts_tmp) &
                                (df_adj['chain'] == chain_tmp) &
                                (df_adj['token'] == token_tmp) &
                                df_adj['close'].notna(), ['ts', 'close']]

        if candidates.empty:
            # Best effort: report the row (with its token name as given) and leave it NaN
            print('Does not work for case {}, ts {}, chain {}, token {}'.format(i, ts_tmp,
                                                                                chain_tmp,
                                                                                df.loc[i, 'token']))
            continue

        # Pick by timestamp rather than by row position, so the result does
        # not depend on the frame being sorted by ts
        latest_ts = candidates['ts'].max()
        previous_price = candidates.loc[candidates['ts'] == latest_ts, 'close'].iloc[-1]

        df_adj.loc[i, 'close'] = previous_price

    # Recalculate the volume
    df_adj['volumeUSD'] = df_adj['close'] * df_adj['volume']

    return df_adj

In [247]:
# Fill the missing close prices and recompute volumeUSD
df = fix_prices(problem_index=problem_index, df=df)

In [248]:
# An empty result here means every row now has a USD volume
df.loc[df['volumeUSD'].isna()]

Unnamed: 0,ts,open,high,low,close,volume,volumeUSD,token,chain


In [249]:
# Total USD volume per chain after the price fix, largest first.
# The column is selected *before* summing: a frame-wide `.sum()` would also try
# to aggregate the datetime `ts` and string `token` columns, which pandas >= 2.0
# no longer drops silently.
volume_final = (
    df.groupby('chain')['volumeUSD']
      .sum()
      .to_frame()
      .sort_values('volumeUSD', ascending=False)
)
volume_final.style.format("{:,.0f}")

Unnamed: 0_level_0,volumeUSD
chain,Unnamed: 1_level_1
BTC,49454021505
ETH,21525201689
SOL,7475235361
USDT,1656244971


In [251]:
# Relative increase (in %) of each chain's USD volume after the price fix
volume_final.div(volume_initial).sub(1).mul(100).round(2)

Unnamed: 0_level_0,volumeUSD
chain,Unnamed: 1_level_1
BTC,5.93
ETH,5.16
SOL,4.22
USDT,6.36
