In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the homework dataset, print its schema, and show how many rows each token has.
raw_data = pd.read_csv(filepath_or_buffer='hw3.csv')
raw_data.info()
raw_data.token.value_counts()
#raw_data

In [None]:
# Count the rows that are exact repeats of an earlier row.
duplicate_mask = raw_data.duplicated()
dups = duplicate_mask.sum()
dups

In [None]:
# Drop exact duplicate rows; take an explicit copy so the assignments below
# operate on an independent frame.
raw_data = raw_data.drop_duplicates().copy()

# Some token values are wrapped in HTML markup (they contain '<span ').
# na=False keeps the mask strictly boolean even if a token is missing.
has_markup = raw_data['token'].str.contains('<span ', na=False)

# Pull out the text between the opening <span ...> tag and </span>.
#   * '[^>]*' skips the tag's attributes (a bare '<span>' pattern cannot match
#     the tags selected by the '<span ' filter above).
#   * '(?s)' lets '.' span newlines; the single lazy group captures the whole
#     inner text rather than one character.
#   * expand=False returns a Series so it aligns row-by-row with the target.
inner_text = (
    raw_data.loc[has_markup, 'token']
    .str.extract(r'(?s)<span\b[^>]*>(.*?)</span>', expand=False)
    .str.strip()
)
raw_data.loc[has_markup, 'token'] = inner_text

# Parse timestamps into datetime64 values.
raw_data['ts'] = pd.to_datetime(raw_data['ts'])

raw_data.info()
raw_data['token'].value_counts()

In [None]:
# Number of rows recorded for each chain.
raw_data.chain.value_counts()

In [None]:
# Preview rows whose close looks wrong: close is missing, or close is at least
# double / at most half of the open price.
close_to_open = raw_data.close / raw_data.open
suspect_close = (close_to_open >= 2.0) | (close_to_open <= 0.5) | raw_data['close'].isnull()
x = raw_data.loc[suspect_close]
x
# y = raw_data.loc[(raw_data.close.isnull() & (raw_data.high.isnull() | raw_data.low.isnull()))] 
# There are a total of 33 rows where the close value is null and the high or low value is null.
# One of these rows has null values for all three columns. We could sort by token and ts and use ffill()
# to obtain an estimated close price. However, as only 33/2360 (~1.4%) rows have this issue, I instead chose to drop these rows.
# y
# len(y)
# 100 * (len(y)/2360)

In [None]:
# we know that we have 2360 non-null/duplicate rows in this data set
# some rows must have nulls in the following columns: token, high, low, and close
# close has some null values and some bad values (total 169 rows), 
# what complicates our cleaning efforts is that some null close rows have missing high or low values
# I have elected to drop these null rows as I couldn't think of a reliable heuristic to generate close values for these rows

# A close is "bad" when it is missing, or when it is at least double / at most
# half of the open price.
close_to_open = raw_data['close'] / raw_data['open']
bad_close = raw_data['close'].isnull() | (close_to_open >= 2.0) | (close_to_open <= 0.5)

# Replace bad closes with the midpoint of high and low. The midpoint is NaN
# whenever high or low is missing, so those rows remain null here ...
high_low_midpoint = 0.5 * (raw_data['high'] + raw_data['low'])
raw_data['close'] = raw_data['close'].mask(bad_close, high_low_midpoint)

# ... and are dropped here. drop=True discards the old index instead of
# inserting it as an extra 'index' column, which also keeps this cell re-runnable.
raw_data = raw_data.dropna(subset=['close']).reset_index(drop=True)
raw_data.info()
# If I run the previous cell again after running this one, x will be empty.

In [None]:
# Print the schema, then display the first rows. Only the last expression of a
# cell is rendered, so the frame preview has to come after info().
raw_data.info()
raw_data.head()

In [None]:
# I have refactored the code in the following cell from the data cleaning example notebook in lecture 3
tokens = raw_data.token.unique()

# Two panels per row; always at least one row so plt.subplots gets a valid grid.
n_rows = max(1, math.ceil(tokens.size / 2))

# squeeze=False keeps `axes` two-dimensional even when there is a single row,
# so the panels can be walked uniformly regardless of the token count.
fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(15, 5 * n_rows), squeeze=False)

# One panel per token, filled left-to-right, top-to-bottom (axes.flat is row-major).
close_by_token = raw_data[['token', 'close']].groupby('token')
for ax, (label, t) in zip(axes.flat, close_by_token):
    t['close'].plot(ax=ax, label=label)
    ax.legend()

# Hide any panels left over (odd token count, or tokens with no group).
for unused_ax in axes.flat[close_by_token.ngroups:]:
    unused_ax.set_visible(False)

In [None]:
# Disabled alternative: the same grid of close-price plots, grouped by chain
# instead of by token. Kept as comments rather than a bare string literal so the
# cell does not echo the text as its output.
#
# chains = raw_data.chain.unique()
# fig, axes = plt.subplots(nrows=math.ceil(chains.size / 2), ncols=2, figsize=(15, 5 * math.ceil(chains.size / 2)))
# idx = 0
# for label, ch in raw_data[['chain', 'close']].groupby('chain'):
#     ch['close'].plot(ax=axes[idx // 2, idx % 2], label=label)
#     axes[idx // 2, idx % 2].legend()
#
#     idx += 1
# Plotting close prices by token is eminently more relevant, since in the chain case
#   the 'ETH plot' would contain close prices for multiple tokens and is not as useful.
#   On the other hand, the chain column contains no null values.

In [None]:
# I noticed a spike in close prices for USDT. I wanted to ensure
#    that this was just a spike that naturally occurred in the market.
# NOTE(review): the filter below matches on the 'chain' column, not 'token' -- confirm this is the intended column.
is_usdt_chain = raw_data['chain'] == 'USDT'
usdt = raw_data[is_usdt_chain].sort_values('close', ascending=False)
#usdt.head(25)
usdt
#Indeed this spike seems to have naturally occurred in the market between 12/4-12/5

In [None]:
# The remaining step before calculating the volumeUSD by chain is
#    to examine the volume column and look for/cleanup any irregularities.

# Derive the token list from the data, excluding missing tokens explicitly.
# This does not depend on where NaN happens to sit in the array and gives the
# same list every time the cell is re-run.
tokens = raw_data['token'].dropna().unique()
for t in tokens:
    current_token = raw_data.loc[raw_data['token'] == t]
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    fig.suptitle(str(t)+' Volume')
    # Left panel: distribution of volume values.
    axes[0].hist(current_token['volume'])
    axes[0].set_xlabel('Volume')
    axes[0].set_ylabel('Count')
    # Right panel: volume plotted against the row index.
    axes[1].plot(current_token['volume'])
    axes[1].set_ylabel('Volume')
    # Render this figure now so it appears next to its summary statistics
    # (a bare `fig` expression inside a loop displays nothing).
    plt.show()
    print(current_token['volume'].describe())
# For each token, min and max are not too far from 25th and 75th percentiles

In [None]:
# There are a handful of volume outliers for each token; given the volatility of cryptocurrencies,
#   none of the spikes are particularly abnormal. Even the most extreme outliers are pretty much within 5*mean.
# From these three approaches, there are likely no anomalies within the volume data.
# We are ready to proceed with calculating the volumeUSD by chain.

# Per-row traded value: close price times volume.
raw_data['volumeUSD'] = raw_data['close'].mul(raw_data['volume'])
# Total traded value per chain, as a one-column frame.
volume_USD = raw_data.groupby('chain')[['volumeUSD']].sum()
volume_USD