# Preprocessing exchange information

## Libraries

In [24]:
import pandas as pd
import json
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.metrics import jaccard_score
import numpy as np

## Load exchange data from JSON and save it in a CSV

In [3]:
# Read the raw exchange dump. JSON text is UTF-8, so the encoding is stated
# explicitly instead of relying on the platform default.
with open('../../data/json/coin_exchanges_new.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Fields every exchange entry must carry to be kept
required_fields = ("id", "name", "slug")

# One record per (crypto, exchange) pair; becomes one row of the csv
records = []
# Ids of cryptos whose entry has no "data" key
cryptos_without_data = []

# Process every crypto based on its id from the JSON.
# The ids are JSON object keys, so they are strings at this point.
for crypto_id, crypto_info in json_data.items():
    # Verify if the data is present (in case of errors like request limit or something similar)
    if "data" not in crypto_info:
        cryptos_without_data.append(crypto_id)
        continue
    for exchange in crypto_info["data"]:
        # Verify that all the fields are present
        if all(field in exchange for field in required_fields):
            records.append({
                "crypto_id": crypto_id,
                "exchange_id": exchange["id"],
                "exchange_name": exchange["name"],
                "exchange_slug": exchange["slug"],
            })
        else:
            print(f"Missing fields in the exchange: {exchange}")

# Report skipped cryptos instead of dropping them silently
if cryptos_without_data:
    print(f"Cryptos without a 'data' entry: {len(cryptos_without_data)}")

# Creating the dataframe; the explicit column list keeps the schema
# even if no record was collected
df = pd.DataFrame(
    records,
    columns=["crypto_id", "exchange_id", "exchange_name", "exchange_slug"],
)

# Show the first rows to verify
print(df.head())

# Save the dataframe in a csv file
df.to_csv("../../data/csv/cryptocurrencies_exchanges.csv", index=False)

  crypto_id  exchange_id exchange_name exchange_slug
0      6535          270       Binance       binance
1      6535         7680          DOEX          doex
2      6535           21          BTCC          btcc
3      6535          521         Bybit         bybit
4      6535          407     DigiFinex     digifinex


## Convert the redundant data into a list of exchanges

In [6]:
# Collapse the one-row-per-(crypto, exchange) table into one row per crypto,
# gathering each exchange column into a Python list.
exchange_columns = ['exchange_id', 'exchange_name', 'exchange_slug']
per_crypto = df.groupby('crypto_id')
list_df = per_crypto.agg(
    {column: (lambda values: values.tolist()) for column in exchange_columns}
)
# Turn the group key back into a regular column
list_df = list_df.reset_index()

print(list_df.shape)
list_df.head()

(951, 4)


Unnamed: 0,crypto_id,exchange_id,exchange_name,exchange_slug
0,1,"[270, 7680, 21, 9200, 521, 10361, 407, 9867, 1...","[Binance, DOEX, BTCC, Zedcex Exchange, Bybit, ...","[binance, doex, btcc, zedcex-exchange, bybit, ..."
1,10040,"[302, 380, 6706, 1344, 1281, 1302]","[Gate.io, LATOKEN, PancakeSwap v3 (BSC), Panca...","[gate-io, latoken, pancakeswap-v3, pancakeswap..."
2,10046,"[102, 302, 544, 350, 1069]","[HTX, Gate.io, MEXC, CoinEx, Uniswap v2]","[htx, gate-io, mexc, coinex, uniswap-v2]"
3,10047,"[433, 102, 302, 311, 350, 6706, 1069, 1344, 6444]","[Bitrue, HTX, Gate.io, KuCoin, CoinEx, Pancake...","[bitrue, htx, gate-io, kucoin, coinex, pancake..."
4,10180,"[333, 513, 406, 1064, 102, 525, 302, 544, 955,...","[LBank, Bitget, BitMart, BingX, HTX, XT.COM, G...","[lbank, bitget, bitmart, bingx, htx, xt, gate-..."


The 49 coins missing from this table have no exchange information; they will be handled in **'preprocessing-coin.ipynb'**.

## Saving the exchanges list dataframe in a CSV

In [7]:
# Write the per-crypto exchange lists to CSV, leaving out the dataframe index
list_df.to_csv('../../data/csv/exchanges_list.csv', index=False)

## Load exchanges data

In [25]:
# Read the exchange data CSV and preview its first rows
exchange_data = pd.read_csv('../../data/csv/exchanges_data.csv')
exchange_data.head()

Unnamed: 0,id,name,slug,logo,date_launched,notice,weekly_visits,spot_volume_usd
0,4098,QuickSwap v3 (Polygon),quickswap-v3,https://s2.coinmarketcap.com/static/img/exchan...,2020-10-07T00:00:00.000Z,,35390.0,27210250.0
1,16,Poloniex,poloniex,https://s2.coinmarketcap.com/static/img/exchan...,2014-01-10T00:00:00.000Z,,73390.0,460341400.0
2,21,BTCC,btcc,https://s2.coinmarketcap.com/static/img/exchan...,2011-06-01T00:00:00.000Z,,155100.0,
3,24,Kraken,kraken,https://s2.coinmarketcap.com/static/img/exchan...,2011-07-28T00:00:00.000Z,,871315.0,406016500.0
4,34,Bittylicious,bittylicious,https://s2.coinmarketcap.com/static/img/exchan...,2013-05-22T00:00:00.000Z,,2723.0,20688.03


In [26]:
# Check column dtypes and non-null counts to spot columns with missing values
exchange_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               523 non-null    int64  
 1   name             523 non-null    object 
 2   slug             523 non-null    object 
 3   logo             523 non-null    object 
 4   date_launched    368 non-null    object 
 5   notice           19 non-null     object 
 6   weekly_visits    518 non-null    float64
 7   spot_volume_usd  492 non-null    float64
dtypes: float64(2), int64(1), object(5)
memory usage: 32.8+ KB


In [27]:
# Remove the 'notice' and 'date_launched' columns (presumably because they are
# sparsely populated; see the info() summary). Rebinding instead of
# inplace=True, together with errors='ignore', keeps this cell safe to re-run
# once the columns are already gone.
exchange_data = exchange_data.drop(columns=['notice', 'date_launched'], errors='ignore')

In [28]:
# Select every row of exchange_data that has at least one missing value
has_missing_value = exchange_data.isna().any(axis='columns')
null_rows = exchange_data.loc[has_missing_value]
null_rows

Unnamed: 0,id,name,slug,logo,weekly_visits,spot_volume_usd
2,21,BTCC,btcc,https://s2.coinmarketcap.com/static/img/exchan...,155100.0,
48,10445,Ebi.xyz,ebi-xyz,https://s2.coinmarketcap.com/static/img/exchan...,30398.0,
56,8418,APX v2 (BSC),apollox-v2-bsc,https://s2.coinmarketcap.com/static/img/exchan...,625.0,
96,363,Fatbtc,fatbtc,https://s2.coinmarketcap.com/static/img/exchan...,736.0,
107,391,CoinTiger,cointiger,https://s2.coinmarketcap.com/static/img/exchan...,272.0,
123,460,Coinsbit,coinsbit,https://s2.coinmarketcap.com/static/img/exchan...,13347.0,
140,516,dYdX,dydx,https://s2.coinmarketcap.com/static/img/exchan...,5304.0,
146,522,Deribit,deribit,https://s2.coinmarketcap.com/static/img/exchan...,104518.0,
151,10783,HyperPay Futures,hyperpay-futures,https://s2.coinmarketcap.com/static/img/exchan...,95.0,
165,8790,dYdX v4,dydx-v4,https://s2.coinmarketcap.com/static/img/exchan...,22998.0,


In [29]:
# Fill NaN values with the average of their respective columns.
# Series.mean() skips NaN by default, so each average is taken over the known values only.
mean_weekly = exchange_data['weekly_visits'].mean()
mean_vol = exchange_data['spot_volume_usd'].mean()
# Vectorized fillna replaces the per-element apply/np.isnan check
exchange_data['weekly_visits'] = exchange_data['weekly_visits'].fillna(mean_weekly)
exchange_data['spot_volume_usd'] = exchange_data['spot_volume_usd'].fillna(mean_vol)

# Verify the changes
exchange_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               523 non-null    int64  
 1   name             523 non-null    object 
 2   slug             523 non-null    object 
 3   logo             523 non-null    object 
 4   weekly_visits    523 non-null    float64
 5   spot_volume_usd  523 non-null    float64
dtypes: float64(2), int64(1), object(3)
memory usage: 24.6+ KB


In [30]:
# Preview the first rows of exchange_data after the NaN fill
exchange_data.head()

Unnamed: 0,id,name,slug,logo,weekly_visits,spot_volume_usd
0,4098,QuickSwap v3 (Polygon),quickswap-v3,https://s2.coinmarketcap.com/static/img/exchan...,35390.0,27210250.0
1,16,Poloniex,poloniex,https://s2.coinmarketcap.com/static/img/exchan...,73390.0,460341400.0
2,21,BTCC,btcc,https://s2.coinmarketcap.com/static/img/exchan...,155100.0,158017900.0
3,24,Kraken,kraken,https://s2.coinmarketcap.com/static/img/exchan...,871315.0,406016500.0
4,34,Bittylicious,bittylicious,https://s2.coinmarketcap.com/static/img/exchan...,2723.0,20688.03


In [31]:
# Save the cleaned exchange data to CSV, leaving out the dataframe index
exchange_data.to_csv('../../data/csv/exchanges_data_clean.csv', index=False)