# Investigating series sizes

## Testing the ICOParser class

In [1]:
import pandas as pd
from ico_parser import ICOParser
# Display the value of every expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

### Load table with information about ICOs

In [2]:
# Folder with one CSV per ICO. Files are addressed as f'{path_to_csvs}{ico}.csv',
# so the trailing slash is required.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path_to_csvs = '/home/gabriel/Documents/Repos/time_series_study/data_and_models/all_icos/'

In [3]:
# Load the ICO table (path is relative to the notebook's working directory)
# and show its last rows.
ico_table = pd.read_csv('lista_ico_2020-08-17_tab3.csv', sep=',')
ico_table.tail()

Unnamed: 0,ico,adress,fraud,start_date,market_start_date,diff_days,market_days,size_ok,price,market_cap,o_concur,biggest_holder,exchange,contract,date_analysis,code,site,social_media
253,WAX,0x39bb259f66e1c59d5abef88375979b4d20d98022,0,2017-12-19,2017-12-20,1,997,,,157.0,x,30,0,0,2018-06-18,,,
254,WAYKICHAIN,0x4f878c0852722b0976a955d68b376e4cd4ae99e5,0,2018-01-01,2018-01-03,2,983,,,162.0,x,0,0,0,2018-07-02,,,
255,ZILLIQA,0x05f4a42e251f2d52b8ed15e9fedaacfcef1fad27,0,2018-01-12,2018-01-24,12,962,,,50.0,x,7,0,1,2018-07-23,,,
256,ZPAY,0xeffea57067e02999fdcd0bb45c0f1071a29472d9,1,2018-07-25,2018-08-20,26,754,x,,,,99,0,0,2019-02-16,,,
257,ZYNECOIN,0xe65ee7c03bbb3c950cfd4895c24989afa233ef01,1,2019-01-21,2019-01-23,2,598,x,,,,15,2,0,2019-07-22,x,x,x


In [4]:
# Parse the three date columns and index the table by ICO name.
# The dtype is spelled with an explicit unit: pandas 2.x refuses to cast to the
# unit-less 'datetime64', while 'datetime64[ns]' works on old and new versions.
date_columns = ['start_date', 'market_start_date', 'date_analysis']
ico_table = (
    ico_table
    .astype({column: 'datetime64[ns]' for column in date_columns})
    .set_index('ico')
)
ico_table.dtypes

adress                       object
fraud                         int64
start_date           datetime64[ns]
market_start_date    datetime64[ns]
diff_days                     int64
market_days                   int64
size_ok                      object
price                        object
market_cap                  float64
o_concur                     object
biggest_holder                int64
exchange                      int64
contract                      int64
date_analysis        datetime64[ns]
code                         object
site                         object
social_media                 object
dtype: object

In [5]:
import pandas as pd
from datetime import datetime, timedelta
from exchange_addresses import ADRESS_LIST
import pytz
import requests
import json
import time

%%time
# Manual walk-through for one ICO: load its CSV and keep only the rows whose
# date falls inside the half-open window [ico_start_date, ico_end_date).
ico = "COMPOUND"
len_time_series = 20
dateformat = '%Y-%m-%d'
date_column = 'BLOCK_TIMESTAMP'
path_to_csv = f'{path_to_csvs}{ico}.csv'
is_fraud = ico_table.at[ico, 'fraud']

# The market start date goes through a string round-trip and ends up as a plain `date`.
ico_start_date = str(ico_table.at[ico, 'market_start_date'].date())
parsed_start = datetime.strptime(ico_start_date, dateformat)
ico_start_date = parsed_start.replace(tzinfo=pytz.UTC).date()
ico_end_date = ico_start_date + timedelta(days=len_time_series)

# Read the rows ordered by timestamp; each row counts as one transaction.
df = pd.read_csv(path_to_csv)
df = df.sort_values(by=date_column)
df['transactions'] = 1

# Reduce timestamps to calendar dates so they compare against the window bounds.
df[date_column] = pd.to_datetime(df[date_column]).dt.date

# Start date included, end date excluded.
on_or_after_start = df[date_column] >= ico_start_date
before_end = df[date_column] < ico_end_date
df = df.loc[on_or_after_start & before_end]

# Independent copy whose date column is converted back with `pd.to_datetime`;
# `df` itself is re-indexed by date.
df_for_resample = df.copy()
df_for_resample[date_column] = pd.to_datetime(df_for_resample[date_column])
df.set_index(date_column, inplace=True)

## Updated Performance

In [6]:
%%time
# Build the parser for a single ICO; the cell's `%%time` magic reports how long this takes.
ico = "COMPOUND"
ico_csv = f'{path_to_csvs}{ico}.csv'
market_start_date = str(ico_table.at[ico, 'market_start_date'].date())
is_fraud = ico_table.at[ico, 'fraud']
parser_options = dict(
    date_column='BLOCK_TIMESTAMP',
    value_column='VALUE',
    ico_start_date=market_start_date,
    fraud_flag=is_fraud,
    len_time_series=20,
)
ico_parser = ICOParser(ico_csv, **parser_options)

CPU times: user 30.6 s, sys: 99.4 ms, total: 30.7 s
Wall time: 30.7 s


In [7]:
ico_parser.pipeline()

Running method: get_newbiers_dataframe ... 
Running method: get_balance ... 
2020-06-15 2020-07-05
2020-03-04
123
Running method: get_cumsum_balance ... 
Running method: get_cumsum_daily_percentage ... 
Running method: get_daily_number_of_new_holder ... 
Running method: get_array_daily_transactions ... 
Running method: get_array_perc_new_holders ... 
Running method: get_biggest_holder_dict ... 
Running method: get_biggest_holder_array ... 
Running method: get_newbiers_ratio_dict ... 
Running method: get_newbiers_array ... 
Running method: get_gas_ratio_array ... 


In [11]:
# Run the two newbiers steps on their own. Both method names also appear in the
# `pipeline()` log, so this repeats work when the pipeline has already run.
ico_parser.get_newbiers_ratio_dict()
ico_parser.get_newbiers_array()

In [12]:
dict_arrays_20 = {}

In [13]:
# Store the five feature arrays of the current ICO as a tuple. The order is fixed:
# (daily transactions, % new holders, biggest holder, newbiers, gas ratio),
# so position 0 is `array_daily_transactions` and position 1 is `array_perc_new_holders`.
dict_arrays_20[ico] = (ico_parser.array_daily_transactions,
                              ico_parser.array_perc_new_holders,
                              ico_parser.array_biggest_holder,
                              ico_parser.array_newbiers,
                              ico_parser.array_gas_ratio)
dict_arrays_20

{'COMPOUND': ([0.0103,
   0.0434,
   0.0741,
   0.1282,
   0.1936,
   0.2521,
   0.3471,
   0.4203,
   0.4827,
   0.5362,
   0.6016,
   0.6611,
   0.716,
   0.7575,
   0.7932,
   0.8263,
   0.8594,
   0.9095,
   0.9613,
   1.0],
  [0.0251,
   0.0738,
   0.1089,
   0.1584,
   0.2125,
   0.2541,
   0.3202,
   0.3807,
   0.4406,
   0.4816,
   0.5463,
   0.6133,
   0.6804,
   0.7283,
   0.7697,
   0.8139,
   0.8577,
   0.9065,
   0.9585,
   1.0],
  [0.5045,
   0.5035,
   0.4785,
   0.1,
   0.1,
   0.1,
   0.1,
   0.1,
   0.1,
   0.1,
   0.1,
   0.08,
   0.075,
   0.075,
   0.075,
   0.075,
   0.075,
   0.075,
   0.075,
   0.075],
  [0.021,
   0.0245,
   0.0467,
   0.0595,
   0.0421,
   0.0621,
   0.0745,
   0.0792,
   0.1272,
   0.0804,
   0.1255,
   0.1486,
   0.1514,
   0.1447,
   0.1627,
   0.1632,
   0.1082,
   0.1025,
   0.1205,
   0.1246],
  [0.6901,
   0.6862,
   0.714,
   0.6837,
   0.6773,
   0.6685,
   0.5412,
   0.5631,
   0.2834,
   0.3917,
   0.3708,
   0.3194,
   0.3723,
   0

## Original Performance

In [None]:
%%time
# Same single-ICO run as the "Updated Performance" cell, but with `pipeline()`
# inside the timed cell.
ico = "COMPOUND"
ico_csv = f'{path_to_csvs}{ico}.csv'
is_fraud = ico_table.at[ico, 'fraud']
market_start_date = str(ico_table.at[ico, 'market_start_date'].date())
ico_parser = ICOParser(
    ico_csv,
    date_column='BLOCK_TIMESTAMP',
    value_column='VALUE',
    ico_start_date=market_start_date,
    fraud_flag=is_fraud,
    len_time_series=20,
)
ico_parser.pipeline()

## Creating Dataset

In [16]:
# Print every ICO name that contains a dash, an underscore or a space.
for ico in ico_table.index.to_list():
    if any(separator in ico for separator in ('-', '_', ' ')):
        print(ico)

In [9]:
list_icos = ico_table.index.to_list()

In [18]:
dict_arrays_20 = {}
list_bad_icos_20 = []

In [19]:
dict_arrays_20

{}

In [20]:
# Build the feature arrays (`len_time_series=20`) for every ICO in `list_icos`.
# An ICO whose parsing fails is reported and recorded in `list_bad_icos_20`,
# and the loop moves on to the next one.
for ico in list_icos:
    ico_csv = f'{path_to_csvs}{ico}.csv'
    market_start_date = str(ico_table.at[ico, 'market_start_date'].date())
    is_fraud = ico_table.at[ico, 'fraud']
    try:
        print(ico)
        ico_parser = ICOParser(
            ico_csv,
            date_column='BLOCK_TIMESTAMP',
            value_column='VALUE',
            ico_start_date=market_start_date,
            fraud_flag=is_fraud,
            len_time_series=20,
        )
        ico_parser.pipeline()
        # Tuple order: daily transactions, % new holders, biggest holder,
        # newbiers, gas ratio.
        dict_arrays_20[ico] = (
            ico_parser.array_daily_transactions,
            ico_parser.array_perc_new_holders,
            ico_parser.array_biggest_holder,
            ico_parser.array_newbiers,
            ico_parser.array_gas_ratio,
        )
    # `Exception` instead of a bare `except:` so that interrupting the kernel
    # (KeyboardInterrupt) stops the loop instead of skipping to the next ICO.
    except Exception as error:
        print(f'PROBLEM WITH: {ico}')
        # Keep the reason visible; otherwise failed ICOs cannot be diagnosed.
        print(f'    {type(error).__name__}: {error}')
        list_bad_icos_20.append(ico)

0X
Running method: get_newbiers_dataframe ... 
Running method: get_balance ... 
2017-08-15 2017-09-04
2017-08-11
24
Running method: get_cumsum_balance ... 
Running method: get_cumsum_daily_percentage ... 
Running method: get_daily_number_of_new_holder ... 
Running method: get_array_daily_transactions ... 
Running method: get_array_perc_new_holders ... 
Running method: get_biggest_holder_dict ... 
Running method: get_biggest_holder_array ... 
Running method: get_newbiers_ratio_dict ... 
Running method: get_newbiers_array ... 
Running method: get_gas_ratio_array ... 
0XCERT
Running method: get_newbiers_dataframe ... 
Running method: get_balance ... 
2018-07-11 2018-07-31
2018-06-26
35
Running method: get_cumsum_balance ... 
Running method: get_cumsum_daily_percentage ... 
Running method: get_daily_number_of_new_holder ... 
Running method: get_array_daily_transactions ... 
Running method: get_array_perc_new_holders ... 
Running method: get_biggest_holder_dict ... 
Running method: get_bigg

In [26]:
# Check the array sizes: print each ICO whose first array does not have 20 entries
for ico, ico_arrays in dict_arrays_20.items():
    if len(ico_arrays[0]) != 20:
        print(ico)

AIDCOIN
ALLME
BELANCE
BLOCKMALL
CRYPTONIAPOKER
DIAM
EXIMCHAIN
HONEYSHARECOIN
ICON
INDAHASH
LOLIGO
MONEYTOKEN
MULTILEVEL
PLANETMOBILETOKEN
PROMETEUS
SHARERING
SINGULARITYNET
SOCIALMEDIAPAY
SPARKSTER


In [27]:
### Saving dictionary to 

In [11]:
dict_arrays_40 = {}
list_bad_icos_40 = []

In [None]:
# Build the feature arrays (`len_time_series=40`) for every ICO in `list_icos`.
# An ICO whose parsing fails is reported and recorded in `list_bad_icos_40`,
# and the loop moves on to the next one.
for ico in list_icos:
    ico_csv = f'{path_to_csvs}{ico}.csv'
    market_start_date = str(ico_table.at[ico, 'market_start_date'].date())
    is_fraud = ico_table.at[ico, 'fraud']
    try:
        print(ico)
        ico_parser = ICOParser(
            ico_csv,
            date_column='BLOCK_TIMESTAMP',
            value_column='VALUE',
            ico_start_date=market_start_date,
            fraud_flag=is_fraud,
            len_time_series=40,
        )
        ico_parser.pipeline()
        # Tuple order: daily transactions, % new holders, biggest holder,
        # newbiers, gas ratio.
        dict_arrays_40[ico] = (
            ico_parser.array_daily_transactions,
            ico_parser.array_perc_new_holders,
            ico_parser.array_biggest_holder,
            ico_parser.array_newbiers,
            ico_parser.array_gas_ratio,
        )
    # `Exception` instead of a bare `except:` so that interrupting the kernel
    # (KeyboardInterrupt) stops the loop instead of skipping to the next ICO.
    except Exception as error:
        print(f'PROBLEM WITH: {ico}')
        # Keep the reason visible; otherwise failed ICOs cannot be diagnosed.
        print(f'    {type(error).__name__}: {error}')
        list_bad_icos_40.append(ico)

0X
Running method: get_newbiers_dataframe ... 
Running method: get_balance ... 
2017-08-15 2017-09-24
2017-08-11
44
Running method: get_cumsum_balance ... 
Running method: get_cumsum_daily_percentage ... 
Running method: get_daily_number_of_new_holder ... 
Running method: get_array_daily_transactions ... 
Running method: get_array_perc_new_holders ... 
Running method: get_biggest_holder_dict ... 
Running method: get_biggest_holder_array ... 
Running method: get_newbiers_ratio_dict ... 
Running method: get_newbiers_array ... 
Running method: get_gas_ratio_array ... 
0XCERT
Running method: get_newbiers_dataframe ... 
Running method: get_balance ... 
2018-07-11 2018-08-20
2018-06-26
55
Running method: get_cumsum_balance ... 
Running method: get_cumsum_daily_percentage ... 
Running method: get_daily_number_of_new_holder ... 
Running method: get_array_daily_transactions ... 
Running method: get_array_perc_new_holders ... 
Running method: get_biggest_holder_dict ... 
Running method: get_bigg

In [36]:
dict_arrays_40

{'0XCERT': ([0.2703,
   0.4362,
   0.4768,
   0.5006,
   0.5153,
   0.5787,
   0.608,
   0.6401,
   0.6524,
   0.6819,
   0.7276,
   0.7366,
   0.7571,
   0.7774,
   0.7928,
   0.8008,
   0.8087,
   0.8134,
   0.8208,
   0.8326,
   0.8439,
   0.8655,
   0.8798,
   0.8963,
   0.9091,
   0.9145,
   0.923,
   0.9286,
   0.9374,
   0.9481,
   0.9551,
   0.9605,
   0.9628,
   0.9646,
   0.9748,
   0.9769,
   0.9805,
   0.9866,
   0.9949,
   1.0],
  [0.4487,
   0.6278,
   0.663,
   0.6875,
   0.6974,
   0.7616,
   0.7757,
   0.7965,
   0.8073,
   0.82,
   0.8716,
   0.877,
   0.886,
   0.8951,
   0.9023,
   0.9086,
   0.9114,
   0.9154,
   0.919,
   0.9231,
   0.9299,
   0.9394,
   0.9457,
   0.9507,
   0.9548,
   0.9584,
   0.9638,
   0.9674,
   0.9729,
   0.9769,
   0.9801,
   0.9828,
   0.9837,
   0.9846,
   0.99,
   0.9919,
   0.9932,
   0.995,
   0.9977,
   1.0],
  [0.4205,
   0.4171,
   0.4155,
   0.4155,
   0.4155,
   0.4155,
   0.4155,
   0.4155,
   0.4154,
   0.4154,
   0.4154,
   0

In [30]:
len(dict_arrays_40)

252

In [6]:
dict_arrays_40

NameError: name 'dict_arrays_40' is not defined

In [None]:
dict_arrays_60 = {}

In [None]:
dict_arrays_60

In [None]:
# Fill `dict_arrays_60` (`len_time_series=60`) for every ICO in `list_icos`.
# Failures are reported per ICO and the loop continues.
for ico in list_icos:
    #print(f'{ico} size: {len(list_icos_bad_size.get(ico))}')
    ico_csv = f'{path_to_csvs}{ico}.csv'
    market_start_date = str(ico_table.at[ico, 'market_start_date'].date())
    is_fraud = ico_table.at[ico, 'fraud']
    try:
        print(ico)
        ico_parser = ICOParser(
            ico_csv,
            date_column='BLOCK_TIMESTAMP',
            value_column='VALUE',
            ico_start_date=market_start_date,
            fraud_flag=is_fraud,
            len_time_series=60,
        )
        print('Running method: define_ico_start_date ... ')
        ico_parser.define_ico_start_date()
        print('Running method: get_newbiers_dataframe ... ')
        ico_parser.get_newbiers_dataframe()
        print('Running method: get_balance ... ')
        ico_parser.get_balance()
        # NOTE(review): only three stages run here, whereas the 20- and 40-length
        # loops call `ico_parser.pipeline()`. Confirm that the `array_*`
        # attributes read below are already populated at this point.
        dict_arrays_60[ico] = (
            ico_parser.array_daily_transactions,
            ico_parser.array_perc_new_holders,
            ico_parser.array_biggest_holder,
            ico_parser.array_newbiers,
            ico_parser.array_gas_ratio,
        )
    # `Exception` instead of a bare `except:` so that interrupting the kernel
    # (KeyboardInterrupt) stops the loop instead of skipping to the next ICO.
    except Exception as error:
        print(f'PROBLEM WITH: {ico}')
        # Keep the reason visible; otherwise failed ICOs cannot be diagnosed.
        print(f'    {type(error).__name__}: {error}')
        #list_bad_icos.append(ico)


### Loading dictionary with right sizes 

In [None]:
import pickle

In [None]:
# Load the previously saved arrays from disk.
# NOTE(review): `pickle.load` can execute arbitrary code while deserializing;
# only open pickle files that come from a trusted source.
with open('ico_arrays_2020-10-12.pickle', 'rb') as handle:
    dict_array_pickle= pickle.load(handle)

In [None]:
# Despite its name this is a dict: key -> second element (`v[1]`) of each pickled
# entry, kept only when that element does not have exactly 60 items.
# NOTE(review): presumably keys are ICO names and `v[1]` is the % new holders
# array, as in the tuples built earlier in this notebook - confirm against the pickle.
list_icos_bad_size = {k:v[1] for k,v in dict_array_pickle.items() if len(v[1]) != 60}

In [None]:
list_icos_bad_size.keys()

In [None]:
# Report the actual length recorded for each wrongly sized entry.
for ico, recorded_series in list_icos_bad_size.items():
    print(f'{ico} size: {len(recorded_series)}')

### 4NEW 

In [None]:
# Select a single ICO to inspect and build the path to its CSV.
# NOTE(review): `path_to_csvs` is a machine-specific absolute path and must end with '/'.
path_to_csvs = '/home/gabriel/Documents/Repos/time_series_study/data_and_models/all_icos/'
ico_ = '4NEW'
ico_csv= f'{path_to_csvs}{ico_}.csv'

In [None]:
ico_table.loc[ico_,:]

In [None]:
market_start_date = str(ico_table.at[ ico_, 'market_start_date'].date())
market_start_date

In [None]:
is_fraud = ico_table.at[ ico_, 'fraud']
is_fraud

In [None]:
str(ico_table.at[ ico_, 'market_start_date'].date())

In [None]:
# Parser for the ICO selected in the cells above, with `len_time_series=20`.
ico_parser = ICOParser(
    ico_csv,
    date_column='BLOCK_TIMESTAMP',
    value_column='VALUE',
    ico_start_date=market_start_date,
    fraud_flag=is_fraud,
    len_time_series=20,
)

In [None]:
# Run three parser stages one at a time, announcing each before it starts so a
# failure can be attributed to the stage that raised it.
print('Running method: define_ico_start_date ... ')
ico_parser.define_ico_start_date()
print('Running method: get_newbiers_dataframe ... ')
ico_parser.get_newbiers_dataframe()
print('Running method: get_balance ... ')
ico_parser.get_balance()

In [None]:
ico_table.head(200)

In [None]:
ico_table.at['AIDCOIN', 'market_start_date']

## Validating size for bad size ICOs

In [None]:
list_bad_icos = []

In [None]:
bad_icos = ['AIDCOIN', 'ANATOMIA', 'BANKERA', 'BELANCE', 'BITCOINMAX', 'BLISSEXCHANGE', 'BUDBO', 'ETHEREUMCASHPRO', 'EXIMCHAIN', 'GADIUNTRUSTWALLET', 'HUOBI', 'ICON',  'LITECOINRED', 'MONEYTOKEN', 'PKGTOKEN', 'QUANTSTAMP', 'REMICOIN', 'SINGULARITYNET', 'SOCIALMEDIAPAY', 'SPARKSTER', 'TEFOOD', 'TIERION', 'TOMO', 'ULTRA']

In [None]:

# Re-run the first parser stages (`len_time_series=20`) for each hand-listed
# problematic ICO. ICOs that still fail are collected in `list_bad_icos`.
for ico in bad_icos:
    # `bad_icos` is written by hand, so an ICO may be absent from
    # `list_icos_bad_size`; report that instead of failing on `len(None)`.
    recorded_series = list_icos_bad_size.get(ico)
    recorded_size = 'n/a' if recorded_series is None else len(recorded_series)
    print(f'{ico} size: {recorded_size}')
    ico_csv = f'{path_to_csvs}{ico}.csv'
    market_start_date = str(ico_table.at[ico, 'market_start_date'].date())
    is_fraud = ico_table.at[ico, 'fraud']
    try:
        ico_parser = ICOParser(
            ico_csv,
            date_column='BLOCK_TIMESTAMP',
            value_column='VALUE',
            ico_start_date=market_start_date,
            fraud_flag=is_fraud,
            len_time_series=20,
        )
        print('Running method: define_ico_start_date ... ')
        ico_parser.define_ico_start_date()
        print('Running method: get_newbiers_dataframe ... ')
        ico_parser.get_newbiers_dataframe()
        print('Running method: get_balance ... ')
        ico_parser.get_balance()
    # `Exception` instead of a bare `except:` so that interrupting the kernel
    # (KeyboardInterrupt) stops the loop instead of skipping to the next ICO.
    except Exception as error:
        print(f'PROBLEM WITH: {ico}')
        # Keep the reason visible; otherwise failed ICOs cannot be diagnosed.
        print(f'    {type(error).__name__}: {error}')
        list_bad_icos.append(ico)


In [None]:
list_bad_icos

In [None]:
ico_table.loc['ANATOMIA']

In [None]:
ico_parser.pipeline()

In [None]:
len(list_icos_bad_size.get('4NEW'))

In [None]:
['AIDCOIN',
'ANATOMIA',
'BANKERA',
'BELANCE',
'BITCOINMAX',
'BLISSEXCHANGE',
'BUDBO',
'ETHEREUMCASHPRO',
'EXIMCHAIN',
'GADIUNTRUSTWALLET',
'HUOBI',
'ICON',
'LITECOINRED',
'MONEYTOKEN',
'PKGTOKEN',
'QUANTSTAMP',
'REMICOIN',
'SINGULARITYNET',
'SOCIALMEDIAPAY',
'SPARKSTER']

### Testing `ICOParser` individually

In [None]:
# Reload the ICO table from scratch and parse its three date columns.
# The dtype carries an explicit unit: pandas 2.x refuses to cast to the
# unit-less 'datetime64', while 'datetime64[ns]' works on old and new versions.
ico_table = pd.read_csv('lista_ico_2020-08-17_tab3.csv')
ico_table = ico_table.astype({
    'start_date': 'datetime64[ns]',
    'market_start_date': 'datetime64[ns]',
    'date_analysis': 'datetime64[ns]',
})


In [None]:
ico_table.dtypes

In [None]:
# Index the table by ICO name so rows can be looked up with `ico_table.at[name, column]`.
# The change is in place: re-running this cell without reloading the table raises
# a KeyError, because 'ico' is no longer a column.
ico_table.set_index('ico', inplace=True)
ico_table.head()

In [None]:
# Pick one ICO and gather the inputs the parser needs: CSV path, market start
# date (as a 'YYYY-MM-DD' string) and fraud flag.
# NOTE(review): `path_to_csvs` is a machine-specific absolute path and must end with '/'.
path_to_csvs = '/home/gabriel/Documents/Repos/time_series_study/data_and_models/all_icos/'

# Alternative ICO, currently disabled:
#ico_ = 'AMPLEFORTH'
ico_ = 'TERRAMINER'
ico_csv= f'{path_to_csvs}{ico_}.csv'
market_start_date = str(ico_table.at[ ico_, 'market_start_date'].date())
is_fraud = ico_table.at[ ico_, 'fraud']

In [None]:
str(ico_table.at[ ico_, 'market_start_date'].date())

In [None]:
# Parser for the ICO selected in the previous cell, with `len_time_series=20`.
ico_parser = ICOParser(
    ico_csv,
    date_column='BLOCK_TIMESTAMP',
    value_column='VALUE',
    ico_start_date=market_start_date,
    fraud_flag=is_fraud,
    len_time_series=20,
)

# Testing the pipeline

In [None]:
# Run only the start-date stage and the daily-transactions stage on the current
# parser, then show the resulting array.
ico_parser.define_ico_start_date()
# The intermediate stages below are disabled. They are kept as `#` comments and
# not inside a string literal: a bare string is an expression, and with
# `InteractiveShell.ast_node_interactivity = "all"` it is echoed as cell output.
# print('Running method: get_newbiers_dataframe ... ')
# ico_parser.get_newbiers_dataframe()
# print('Running method: get_balance ... ')
# ico_parser.get_balance()
# print('Running method: get_cumsum_balance ... ')
# ico_parser.get_cumsum_balance()
# print('Running method: get_cumsum_daily_percentage ... ')
# ico_parser.get_cumsum_daily_percentage()
# print('Running method: get_daily_number_of_new_holder ... ')
# ico_parser.get_daily_number_of_new_holder()
print('Running method: get_array_daily_transactions ... ')
ico_parser.get_array_daily_transactions()
ico_parser.array_daily_transactions

In [None]:
%%time
ico_parser.pipeline()

In [None]:
# Move the index of `df_resample_day` back into columns, then reduce the
# `BLOCK_TIMESTAMP` column to plain dates.
df_resample_func = ico_parser.df_resample_day.reset_index()
timestamp_dates = df_resample_func['BLOCK_TIMESTAMP'].dt.date
df_resample_func['BLOCK_TIMESTAMP'] = timestamp_dates

In [None]:
# Gas ratio per row = RECEIPT_GAS_USED / GAS, stored as a new column on the
# parser's `df_newbiers_resample` frame.
newbiers_resample = ico_parser.df_newbiers_resample
newbiers_resample['GAS_RATIO'] = (
    newbiers_resample['RECEIPT_GAS_USED'] / newbiers_resample['GAS']
)
# Keep only the last `len_time_series` values as the gas-ratio array.
gas_ratio_values = ico_parser.df_newbiers_resample.GAS_RATIO.to_list()
ico_parser.array_gas_ratio = gas_ratio_values[-ico_parser.len_time_series:]

# Array sizes

In [None]:
ico_parser.get_newbiers_ratio_dict()
ico_parser.get_newbiers_array()