## To-do list
- metrics functions
- check for directories structure (model for prediction, databases, saving directory for the drawdown/VaR/etc plots, backup directory for model, backup directory for plots)
- import metrics (VaR, cVaR, drawdown, etc.) functions from metrics module
- functions module maybe?
- Binance access and etc (copy from Touring)
- Copy keys.py explanation from Touring
- Two main fund strategies: (1) passive - trade on 1st of each month, keep in basked the 10 crytpos with the most market cap; (2) active - trade every week based on predicted returns for the next week
- Let the user define how many assets to invest into


In [1]:
#------ Import basic packages
#import matplotlib.pyplot as plt
#import seaborn as sns
#import smtplib  # Needed for the e-mail reports
#import binance.enums  # Responsible for trading

import numpy as np
import pandas as pd
import pandas_ta
from functions import *
pd.set_option('display.float_format', lambda x: '%.8f' % x)

In [218]:
# FUNCTIONS

#------ Binance access
def binance_wallet(live_trade=False):
    """
    Description: function that fetches Binance balance for the user

    Inputs: live_trade - bool, default False. If True checks for live trade ability (not needed for backtesting)
    
    Outputs objects: wallet - pd.DataFrame, existing assets and balances
                    cliente - object, binance.client.Client (EXCLUDED FROM return)
                    infos - dict, overall informations about the client (EXCLUDED FROM return)
    """
    # Necessary packages
    import pandas as pd
    from binance.client import Client  # Binance

    def request_wallet():
            # Requests user wallet infos
            print('Fetching wallet balance...')
            wallet = pd.DataFrame(infos['balances'])

            # Gets the 'numerical' informations about the balances
            nums = ['free', 'locked']

            # Transform objs in float
            wallet[nums] = wallet[nums].astype(float)

            # Filter the assets with balance
            mask = wallet[nums][wallet[nums] > 0].dropna(how='all').index
            print('Cleaning wallet from non-positive cryptos...')
            wallet = wallet.iloc[mask]  # keep only assets with positive balance

            # If needed, excludes some cryptos (asset blacklisting)
            black_list = ['NFT', 'SHIB', 'BTTC']
            mask = wallet[wallet['asset'].isin(black_list)].index  # blacklist index
            wallet.drop(mask, axis=0, inplace=True)  # dropping blacklist
            print('Done.')

            print(f'\n--> Please note this account type: {infos['accountType']} <--')

            wallet.reset_index(drop=True, inplace=True)

            return wallet
    
    # Get overall info from Binance, using offline (not live) keys
    cliente = Client(api_key_offline, api_secret_offline)

    # Checks for systems online
    if cliente.get_system_status()['msg'] != 'normal':
        print('\n\n!!!! **** WARNING **** !!!!\n')
        print('!!!! BINANCE OFFLINE !!!!\n')
        print('Unable to fetch data\n\n')

    else:
        print('\nBinance on-line. Requesting data.')
        # Fetch user data
        infos = cliente.get_account()

        if live_trade == True:
            # Check if the user is able to live trade (not mandatory for offline)
            if infos['canTrade'] == False:
                print('\nWARNING! User unable to trade, please check status with Binance!')
                print('Aborting.')
            else:
                wallet = request_wallet()

        else:
            wallet = request_wallet()
    
    return wallet


#------ Historical data
def historical_data(ticker='BTCUSDT', days=30, interval='15m'):  # the ticker in Binance works in pairs - here you'll want to know how much is BTC worth in USDT, for example
    """
    Description: gets the trading pair historical data from Binance.

    Inputs: ticker - str, default 'BTCUSDT', the pair you want to trade;
            days - int, default 30, gets historical data from this many days ago;
            interval - str, default '15m', the 'slicing' of the timeframe, more info at https://developers.binance.com/docs/binance-spot-api-docs/web-socket-streams#klinecandlestick-streams-for-utc
    
    Outputs objects: hist - pd.DataFrame, OHLC historical data
    """
    import datetime
    import requests
    import json
    import time
    import pandas as pd

    # Defines the data timespan
    end_time = datetime.datetime.now()
    start_time = end_time - datetime.timedelta(days=days)
    
    # Converts time to Unix (because Binance) in miliseconds
    end_timestamp = int(end_time.timestamp()*1000)
    start_timestamp = int(start_time.timestamp()*1000)

    # Binance endpoint
    endpoint = 'https://api.binance.com/api/v3/klines'

    # Timewindow estabilished, requests historical data.
    # Request parameters.
    limit = 1000
    params = {'symbol': ticker, 'interval': interval,
          'endTime': end_timestamp, 'limit': limit,
          'startTime': start_timestamp}
    print('Requesting informations from Binance.')

    # Make the request and saves it in a list. 'Dados' means 'data' in portuguese.
    dados = []
    while True:
        response = requests.get(endpoint, params=params)
        klines = json.loads(response.text)
        dados += klines
        if len(klines) < limit:
            break
        params['startTime'] = int(klines[-1][0])+1
        time.sleep(0.1)
    print('Request successful. Splitting data...')

    # Pick specific data from fetched data
    # About kline[n] pos: https://developers.binance.com/docs/binance-spot-api-docs/rest-api/market-data-endpoints
    loose_data = []
    for kline in dados:
        loose_data = [[float(kline[1]), float(kline[2]), float(kline[3]), float(kline[4]), float(kline[5])] for kline in dados]

    # Creates the DataFrame
    timestamps = [datetime.datetime.fromtimestamp(int(kline[0])/1000) for kline in dados]
    hist = pd.DataFrame(loose_data, columns=['open', 'high', 'low', 'close', 'volume'], index=timestamps)
    hist = pd.concat([hist], keys=[ticker], names=['asset', 'time'])

    print('All done.')

    return hist


#------ Check for trading pairs CSV file
def check_pairs():
    """
    Description: checks if there's a trading pairs record, makes up the trading pairs from Binance toAsset and fromAsset data if none is found
    Inputs: none
    Outputs: none (generates a csv file though)
    """
    import datetime
    import requests
    import json
    import os
    import pandas as pd

    # Stores current month and year, used to validate files
    today = datetime.datetime.now().strftime('%Y%m')
    
    # Resources and backup path
    resources_dir = './resources/'
    backup_dir = 'older_versions/'

    #------ List with the 20 top market cap currencies
    def top20():
        """
        Description: get the 20 biggest market cap cryptos from the web. Needs tweaks for each source.

        input: none

        outpu: list, cryptocurrencies symbols
        """
        import requests
        from bs4 import BeautifulSoup

        cmc = 'https://crypto.com/price'
        
        print('Getting list of the 20 cryptos with the most market cap.')

        try:
            response = requests.get(cmc)
            soup = BeautifulSoup(response.text, "html.parser")

            site = soup.find_all("span", {"class": "chakra-text"})

            cryptos = []
            for cur in site:
                cryptos.append(cur.get_text())
            
            print('Done.')
        
        except:
            print(f"Error fetching biggest market cap cryptos from {cmc}")
        
        return cryptos[0:20]


    #------ Creates the buy/sell pairs from Binance endpoint
    def create_pairs_file():
        # Sets Binance endpoint and its parameter (crypto common to all trades, in this case)
        pairs_endpoint = 'https://api.binance.com/sapi/v1/convert/exchangeInfo'
        params = {'toAsset': 'USDT'}

        # Fetches the Top 20 market cap cryptos from the web to make our asset basket
        crypto_list = top20()

        # Makes the request
        print('Retrieving information about pairing trades from Binance.')
        response = requests.get(pairs_endpoint, params=params)
        
        # Changes the response into a DataFrame
        df = pd.DataFrame(json.loads(response.text))

        # Filters Binance cryptos data maintaining only the top 20 at most
        print('Filtering all assets not tradable from Binance.')
        mask = df['fromAsset'].isin(crypto_list)
        df = df.loc[mask].reset_index(drop=True)

        # Creates the buy/sell pairs
        print('Registering tradable pairs.')
        sell_pairs = df['fromAsset'] + df['toAsset']
        buy_pairs = df['toAsset'] + df['fromAsset']

        # Creates a new temp DataFrame with just the pairs
        temp = pd.concat([buy_pairs, sell_pairs], axis=1)
        temp.columns = ['Buy', 'Sell']

        # Saves to file
        print('Creating new file...')
        temp.to_csv(f'{resources_dir}pairs_{today}.csv', index=False)
        print(f"Trading pairs file created: '{resources_dir}pairs_{today}.csv'")
        print('All done.')        

    # Checks for path
    if not os.path.exists(resources_dir):
        print(f"Directory '{resources_dir}' does not exist. Let's make it, shall we?")
        os.mkdir(resources_dir)
        print(f"Done, directory '{resources_dir}' created successfully.")
    
    else:
        print('The resources directory exists, checking for trade pairs file.')

    # Check for any pair file. If it exists and is newer than a month, loads it.
    arqs = os.listdir(resources_dir)

    # Keeps only generated files, disregarding folders or other misc files
    for i in arqs:
        if 'pairs' not in str(i):
            arqs.remove(i)

    if len(arqs) == 0:
        print('Trading file not found in folder. Creating...')
        create_pairs_file()

    elif len(arqs) == 1:
        arqs = arqs[0]
        print(f"Trading pairs file '{arqs}' found, checking version.")
        file_found = arqs.split('.csv')[0]
        file_found = file_found.split('_')[1]

        if int(today) > int(file_found):
            import shutil                        
            print(f"Time to update the files! Moving current file to './{backup_dir}'.")
            shutil.move(resources_dir+arqs, resources_dir+backup_dir+arqs)
            print(f'Updating trading pairs file.')
            create_pairs_file()

        else:
            print('Trading pairs file is up to date.')

    elif len(arqs) > 1:
        print(f"WARNING: Multiple files found! Moving all trading pairs files to './{backup_dir}'.")
        for i in arqs:
            import shutil
            shutil.move(resources_dir+i, resources_dir+backup_dir+i)
        print(f'Creating valid trading pairs file.')
        create_pairs_file()
    
    else:
        print("Some kind of witchcraft error happened. This message isn't supposed to show up!")


#------ Loads the trading pairs CSV file
def get_pairs():
    """
    Description: loads the latest trading pairs file using pandas.read_csv()

    Input: none
    Output: pandas.DataFrame
    """
    import os
    import pandas as pd

    resources_dir = './resources/'

    # Check for the pairs file.
    arqs = os.listdir(resources_dir)

    # Keeps only generated files, disregarding folders or other misc files
    for i in arqs:
        if 'pairs' not in str(i):
            arqs.remove(i)

    if len(arqs) == 0:
        print('ERROR: Trading file not found in folder. Please run check_pairs() first.')

    elif len(arqs) == 1:
        arqs = arqs[0]
        pairs = pd.read_csv(f'{resources_dir}{arqs}')

    elif len(arqs) > 1:
        print(f"ERROR: Multiple files found! Please run check_pairs() first.")

    return pairs


#------ ATR indicator estimation
def atr_calc(df, length=20):
    """
    Description: function to compute Average True Range estimations.
    Important Notice: df MUST have 'high', 'low', 'close' values features.

    Input: df, OHLC pandas DataFrame
    Output: pandas Series, with standardized ATR
    """
    import pandas_ta

    atr = pandas_ta.atr(high=df['high'],
                        low=df['low'],
                        close=df['close'],
                        length=length)
    
    return atr.sub(atr.mean()).div(atr.std())


#------ MACD estimation
def macd_calc(df, length=30):
    """
    Description: custom function to estimate MACD indicator
    Inputs: df, OHLC pandas DataFrame
    Outputs: indicator estimated
    """
    import pandas_ta

    macd = pandas_ta.macd(close=df['close'], length=length)

    return macd.sub(macd.mean()).div(macd.std())


def estimate_returns(df):
    """
    Definition: estimate cumulative monthly returns (up to 6m)
    Input: DataFrame containing OHLC 'close' infos, fortnightly
    Outputs: updated DataFrame
    """
    outlier = 0.005 

    lags = [2, 4, 6, 8, 10, 12]

    for time in lags:
        df[f'return_{int(time/2)}m'] = df['close'].pct_change(time).pipe(lambda x: x.clip(lower=x.quantile(outlier), upper=x.quantile(1-outlier))).add(1).pow(1/time).sub(1)

    return df







In [2]:
#from metrics import mvcriterion, optim_mvcrit

#wallet = binance_wallet()

# First of all, fetch pairs local file in order to know what assets to perform the calculations
print('Checking for trading pairs...')
check_pairs()
print('Loading pairs.')
pairs = get_pairs()
print('Pairs successfully loaded.')


# Then, get the historical data from Binance
# Params
pool = pd.DataFrame()
past_days = 365*2
interv = '1d'

for asset in pairs['Sell']:
    temp = historical_data(ticker=asset, days=past_days, interval=interv)
    pool = pd.concat([pool, temp])
    del temp

# Up until here I've got a 'pool' object with the historical data and my trading pairs file loaded into 'pairs' object.
# Let's calculate some indicators now

print('Calculating RSI (momentum).')
# Calculating Relative Strenght Index (RSI) - momentum indicator
# The RSI indicator won't be standardized for its use in clustering
pool['rsi'] = pool.groupby(level=0)['close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))
#pool.xs('BTCUSDT', level=0)['rsi'].plot() # to check if it's worked just uncomment the beginning of this line

print('Calculating Bollinger Bands (vol).')
# Calculating Bollinger Bands - volatility indicator (overbought/oversold)
pool['bb_low'] = pool.groupby(level=0)['close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,0])
pool['bb_mid'] = pool.groupby(level=0)['close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,1])
pool['bb_high'] = pool.groupby(level=0)['close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,2])

print('Calculating ATR (vol).')
# Average True Range (ATR) - volatility indicator
# Since this function uses 3 features to compute the indicator (high, low, close), it is needed to use 'apply' instead of 'transform',
# and for that a custom function is needed (check it in Functions section).
pool['atr'] = pool.groupby(level=0, group_keys=False).apply(atr_calc)

print('Calculating MACD (momentum).')
# Moving Average Convergence-Divergence (MACD) - momentum indicator
# Same reasonig as ATR, a custom function is needed here.
pool['macd'] = pool.groupby(level=0, group_keys=False).apply(macd_calc).iloc[:,0]

print('Calculating dollar volume in millions.')
# Dollar volume (based on closing price), divided by 1mil
pool['dollar_vol'] = pool['volume']*pool['close']/1e6

print('Aggregating data to bi-weekly periods, filtering best cryptos.')
# Aggregate to bi-weekly level and filter N most market capped cryptos
indicators = [c for c in pool.columns.unique() if c not in ['dollar_vol', 'open', 'high', 'low', 'volume']]
p_dvol = pool.unstack(level=0)['dollar_vol'].resample('2W').mean().stack('asset').to_frame('dollar_vol')
p_indc = pool.unstack(level=0)[indicators].resample('2W').last().stack('asset', future_stack=True)
pool = pd.concat([p_dvol, p_indc], axis=1).dropna()

print('Creating dollar volume moving averages.')
# 13-week moving average of dollar volume for each asset
pool['dollar_vol'] = pool['dollar_vol'].unstack('asset').rolling(13).mean().stack()

print('Checking cryptos liquidity.')
# Bi-weekly rank for each asset by dollar volume (a.k.a. liquidity), smaller rank is better (most liquid)
pool['liquidity_lvl'] = pool.groupby('time')['dollar_vol'].rank(ascending=False)

print('Creating a rank for the best cryptos in the dataset.')
# Top 15 cryptos fortnightly, able to drop volume and liquidity features already
mask = pool['liquidity_lvl'] < 16
pool = pool.loc[mask].drop(['dollar_vol', 'liquidity_lvl'], axis=1)

print('Estimating returns.')
pool = pool.groupby(level='asset', group_keys=False).apply(estimate_returns).dropna()


print('All good.')

Checking for trading pairs...
The resources directory exists, checking for trade pairs file.
Trading pairs file 'pairs_202507.csv' found, checking version.
Trading pairs file is up to date.
Loading pairs.
Pairs successfully loaded.
Requesting informations from Binance.
Request successful. Splitting data...
All done.
Requesting informations from Binance.
Request successful. Splitting data...
All done.
Requesting informations from Binance.
Request successful. Splitting data...
All done.
Requesting informations from Binance.
Request successful. Splitting data...
All done.
Requesting informations from Binance.
Request successful. Splitting data...
All done.
Requesting informations from Binance.
Request successful. Splitting data...
All done.
Requesting informations from Binance.
Request successful. Splitting data...
All done.
Requesting informations from Binance.
Request successful. Splitting data...
All done.
Requesting informations from Binance.
Request successful. Splitting data...
All 

In [283]:
# Clustering
from sklearn.cluster import KMeans

def clustering(df):
    df['cluster_num'] = KMeans(n_clusters=4,
                               random_state=42,
                               init='random').fit(df).labels_
    
    return df

thiago = pool.dropna().groupby('time', group_keys=False).apply(clustering)

In [285]:
import matplotlib.pyplot as plt

#Getting unique labels
label = KMeans(n_clusters=4,
        random_state=42,
        init='random').fit_predict(pool)

u_labels = thiago['cluster_num'].unique()

for label in u_labels:
    label_{label} = thiago['cluster_num'] == label
 


SyntaxError: invalid syntax (1702371654.py, line 11)

In [None]:
#plotting the results:
 
for i in u_labels:
    plt.scatter(f'label_{i}[cols[0]]', label_0[cols[1]], label = i)    
    plt.scatter(mask[:,0], mask[:,1], label = i)
#    plt.scatter((thiago.loc[mask, 'atr']), (thiago.loc[mask, 'rsi']) , label = i)
plt.legend()
plt.show()

In [3]:
pool

Unnamed: 0_level_0,Unnamed: 1_level_0,close,rsi,bb_low,bb_mid,bb_high,atr,macd,return_1m,return_2m,return_3m,return_4m,return_5m,return_6m
time,asset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2024-07-14,ADAUSDT,0.44490000,57.66806047,0.29259639,0.33128666,0.36997693,-0.63943550,0.06562335,0.05292303,-0.02973655,-0.02462716,-0.04750671,-0.03250181,-0.01385740
2024-07-14,AVAXUSDT,27.82000000,48.48143259,3.22211077,3.32054126,3.41897176,-0.26689891,-0.46035285,-0.01148144,-0.08752286,-0.05551084,-0.08727228,-0.03393242,-0.02138707
2024-07-14,BCHUSDT,400.20000000,52.50228560,5.74885586,5.89315113,6.03744639,0.08645687,-0.66207974,-0.02350717,-0.06242117,-0.04353130,-0.02433900,0.03834540,0.04284099
2024-07-14,BNBUSDT,585.30000000,54.08449439,6.18873732,6.30009405,6.41145078,0.42638929,-1.23399503,-0.01584622,-0.00601639,-0.00536508,-0.00036247,0.03838518,0.05338959
2024-07-14,BTCUSDT,64724.14000000,56.20475310,10.91375847,10.99478429,11.07581011,-0.09412813,-0.91792020,-0.01347480,-0.02440144,-0.00529560,-0.00953493,0.01738594,0.03406049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-07-13,SUIUSDT,2.96080000,49.69348846,1.25851115,1.33277403,1.40703691,0.31544120,-0.39356411,-0.00592504,-0.06277076,0.05103281,0.02673013,0.00451430,-0.02401471
2025-07-13,TRXUSDT,0.28760000,59.83156892,0.23471817,0.24554381,0.25636944,0.04159558,0.35614476,0.02433007,0.01904055,0.02617338,0.02934072,0.01821543,0.01293364
2025-07-13,USDCUSDT,0.99990000,55.19003816,0.69283674,0.69298466,0.69313259,-0.48962708,0.45991422,0.00025012,0.00007502,-0.00001667,0.00000000,-0.00002000,-0.00001667
2025-07-13,XLMUSDT,0.29340000,63.68767289,0.19602398,0.21769785,0.23937173,-0.05156825,0.00402221,0.05903713,0.00579183,0.02661475,0.00103224,-0.00188980,-0.02700403
