In [2]:
### Requirements ### 
import pandas as pd
import numpy as np 

import seaborn as sns
import matplotlib.pyplot as plt

from pycoingecko import CoinGeckoAPI
cg = CoinGeckoAPI()

from datetime import datetime 

## cgAPI Usage test

### Command
* cg.get_coins_list()

In [2]:
# Fetch the full coin catalogue from CoinGecko. The code below relies on each
# entry being a dict with 'id' and 'symbol' keys.
all_coins = cg.get_coins_list()

# Get a list of all 'ids'
id_list = [coin['id'] for coin in all_coins]

# Get a list of all 'symbols'
symbol_list = [coin['symbol'] for coin in all_coins]

# Dataframe of both: index = 'id', single column = 'symbol'
all_tokens_raw = pd.DataFrame(symbol_list, index=id_list, columns=['symbol'])

# Remove tokens whose id is the empty string: an empty id cannot be used to
# address the per-coin endpoints. Filtering by value (instead of dropping
# whatever happens to be the first row) keeps every valid token even if the
# API stops returning the blank id first, and avoids in-place mutation.
all_tokens = all_tokens_raw.loc[all_tokens_raw.index != '']

In [3]:
all_tokens

Unnamed: 0,symbol
01coin,zoc
0-5x-long-algorand-token,algohalf
0-5x-long-altcoin-index-token,althalf
0-5x-long-ascendex-token-token,asdhalf
0-5x-long-bitcoin-cash-token,bchhalf
...,...
zynecoin,zyn
zyro,zyro
zytara-dollar,zusd
zyx,zyx


**Check index uniqueness**

In [4]:
# Occurrence counts of the ids that appear more than once in the index;
# an empty list means every id is unique.
id_counts = all_tokens.index.value_counts()
list(id_counts[id_counts > 1])

[]

**Now, we have 'id' and 'symbol' for 12,656 tradable crypto tokens.**

--- 

## Iterate for all coins 
* We want to find the **top 1000 crypto projects** in terms of twitter followers.
* To do it, we need to go over the 12k tokens: 
    * check if they have **'community_data'** available
    * check if they have **'twitter_followers'** available (is not None)
    
#### API limit considerations
* CoinGecko allows up to 50 requests per minute, meaning
    * We need to reduce **all_coins** to 253 50-sized chunks 
    * Request **get_coin_history_by_id(id='', date='today')** for each token 'id'
    * Access **request_dict['community_data']**, and check if **rqst_community_data.get('twitter_followers') is not None**
    * if it **is not None**, append 'twitter_followers' to the received df 
    * (it will take 253 minutes...)
    
---    
    
### First approach
* Chunk down **all_tokens** (index = 'id', columns = ['symbol']) in 253 slots of 50 tokens
* Define a function which 
    * takes one chunk and gets data from the API request
    * Adds it (if not None) to the received df and returns said df as 'extracted_df' 
* **run function 253 times**

### Chunking it down

In [5]:
# Maximum number of tokens per chunk. Extractor issues one API request per
# token, so this bounds the number of requests made for a single chunk.
CHUNK_SIZE = 50

# Slice by position into consecutive frames of CHUNK_SIZE rows (the last one
# may be shorter). np.array_split(all_tokens, 253) relied on a hard-coded
# chunk count and balances sizes, so whenever the row count is not an exact
# multiple it produced some chunks of 51 tokens.
chunk_list = [
    all_tokens.iloc[start:start + CHUNK_SIZE]
    for start in range(0, len(all_tokens), CHUNK_SIZE)
]

# Check chunk size on an arbitrary chunk
test_chunk = chunk_list[39]
print(f'Chunk contains {test_chunk.shape[0]} tokens')

Chunk contains 50 tokens


### Define function

In [6]:
def Extractor(df, date='17-02-2022'):
    '''
    Collect community and GitHub metrics for every token id in ``df.index``.

    One ``cg.get_coin_history_by_id`` request is made per id. A token is kept
    only when the response contains both the 'community_data' and
    'developer_data' keys and its 'twitter_followers' value is not None.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk of tokens; only its index (the token ids) is read.
    date : str, default '17-02-2022'
        Snapshot date passed to the API as ``date``, in 'dd-mm-yyyy' form as
        in the default value. The default pins the snapshot used for this
        analysis so that re-runs are comparable.

    Returns
    -------
    pandas.DataFrame
        One row per kept token with columns 'id', 'twitter_followers',
        'reddit_subs', 'forks', 'stars', 'github_subs', 'total_issues' and
        'closed_issues'. Metrics absent from the response are None/NaN.
        The frame has these columns and zero rows when no token qualifies.

    Raises
    ------
    Whatever the API client raises (e.g. HTTP 429 or connection errors);
    nothing is caught here.
    '''
    columns = ['id', 'twitter_followers', 'reddit_subs', 'forks', 'stars',
               'github_subs', 'total_issues', 'closed_issues']

    # One dict per kept token; replaces eight parallel lists that had to be
    # appended to in lock-step.
    records = []

    # 'coin_id' rather than 'id' so the builtin id() is not shadowed.
    for coin_id in df.index:

        # make request
        response = cg.get_coin_history_by_id(id=coin_id, date=date)

        # both sections must be present in the response
        if 'community_data' not in response or 'developer_data' not in response:
            continue

        # Guard against a section that is present but null: treat it as
        # empty instead of failing on None.get(...).
        community = response['community_data'] or {}
        developer = response['developer_data'] or {}

        # tokens without a twitter follower count are not of interest
        followers = community.get('twitter_followers')
        if followers is None:
            continue

        records.append({
            'id': coin_id,
            # community metrics
            'twitter_followers': followers,
            'reddit_subs': community.get('reddit_subscribers'),
            # GitHub metrics
            'forks': developer.get('forks'),
            'stars': developer.get('stars'),
            'github_subs': developer.get('subscribers'),
            'total_issues': developer.get('total_issues'),
            'closed_issues': developer.get('closed_issues'),
        })

    # Build column-wise so the result always carries every column, even when
    # no token was kept.
    return pd.DataFrame({col: [rec[col] for rec in records] for col in columns})
    

In [7]:
# Results for test chunk
Extractor(test_chunk)

Unnamed: 0,id,twitter_followers,reddit_subs,forks,stars,github_subs,total_issues,closed_issues
0,brewlabs,8437,,,,,,
1,brick-token,5134,,,,,,
2,bridge,7584,337.0,,,,,
3,bridge-mutual,45821,,,,,,
4,bright-token,9907,,,,,,
5,bright-union,28452,,,,,,
6,bring-finance,4829,,,,,,
7,brokoli,110461,,,,,,
8,brother-music-platform,5431,,,,,,
9,bscarmy,1281,,,,,,


###### **As we don't want to spend 4 hours manually running the extractor function 253 times, we should schedule the function to run on its own every x seconds**
---    
    
### Second approach
* Define another function to run the extractor function on every id of every chunk, from 0 to 252, every x seconds

##### scheduling 

In [8]:
import time

In [9]:
def executeExtractor(chunk_list, sleep):
    '''
    Run Extractor on every chunk in order, pausing between chunks.

    Parameters
    ----------
    chunk_list : list of pandas.DataFrame
        All chunked dataframes.
    sleep : int or float
        Seconds to wait after each chunk except the last one.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all Extractor results with a fresh integer
        index; an empty DataFrame when chunk_list is empty.

    Raises
    ------
    Any exception raised by Extractor propagates, and the results collected
    up to that point are lost.
    '''
    # Process every chunk that was passed in. The previous hard-coded count
    # (252) skipped the last chunk of a 253-chunk list and raised IndexError
    # on shorter lists.
    total_chunks = len(chunk_list)

    # Explicit loop (not a comprehension) because progress is reported
    # after every step.
    extracted_frames = []

    for i, chunk in enumerate(chunk_list):

        extracted_frames.append(Extractor(chunk))

        # Communicate process
        print(f'Chunk {i} successfully appended to list_of_Extracted_dfs.')

        if (i + 1) != total_chunks:
            # Communicate process, then wait before the next chunk
            print(f'Wait {sleep} seconds before processing next chunk. \n')
            time.sleep(sleep)
        else:
            print('All done.')

    # pd.concat raises ValueError on an empty list
    if not extracted_frames:
        return pd.DataFrame()

    # Concatenate and return
    return pd.concat(extracted_frames, ignore_index=True)

* With **executeExtractor**, we've automated the twitter_followers data collection.

* If we are to request **twitter_followers** from the **community_data** request, we might as well extract other things.


    * 'community_data': 
        * 'twitter_followers'
        * 'reddit_subscribers'

     * 'developer_data' | Github: 
         * 'forks'
         * 'stars'
         * 'subscribers'
         * 'total_issues'
         * 'closed_issues'
         
     * Market data
        

In [10]:
all_coins_Extracted = executeExtractor(chunk_list, 80)

HTTPError: 429 Client Error: Too Many Requests for url: https://api.coingecko.com/api/v3/coins/10084-grayton/history?date=17-02-2022

* As the cell above shows, a '429 Client Error' was raised while requesting API access to the 39th chunk. 
* This is a huge problem: over an hour into the process, we lost the information for all 38 chunks processed before. 
   * To avoid errors (of any type) breaking the collection process, let's add some exception handling.
   
   
**First Exceptions Approach**
* Communicate, set to sleep
   * Any type of Error

In [11]:
def Fix_executeExtractor(chunk_list, sleep, max_retries=10, retry_wait=60):
    '''
    Run Extractor on every chunk in order, retrying a chunk when it fails.

    Parameters
    ----------
    chunk_list : list of pandas.DataFrame
        All chunked dataframes.
    sleep : int or float
        Seconds to wait after each chunk except the last one.
    max_retries : int or None, default 10
        How many times a failing chunk is retried after its first failure.
        Once exhausted, the chunk is skipped (and reported) so that a
        deterministic error cannot loop forever and the data already
        collected is kept. None retries indefinitely.
    retry_wait : int or float, default 60
        Seconds to wait before retrying a failed chunk.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the Extractor results of all successful
        chunks with a fresh integer index; an empty DataFrame when nothing
        was collected.
    '''
    # Process every chunk that was passed in. The previous hard-coded count
    # (252) skipped the last chunk of a 253-chunk list and raised IndexError
    # on shorter lists.
    total_chunks = len(chunk_list)

    extracted_frames = []
    skipped_chunks = []

    for i, chunk in enumerate(chunk_list):
        failures = 0
        chunk_frame = None

        while chunk_frame is None:
            try:
                chunk_frame = Extractor(chunk)

            # Deliberately broad: the failures seen in practice are of many
            # kinds (HTTP 429, connection resets, read timeouts). The retry
            # budget keeps this from hiding a permanent error forever.
            except Exception as error:
                failures += 1

                if max_retries is not None and failures > max_retries:
                    print(f'{error} \n Chunk {i} skipped after {max_retries} retries. \n')
                    break

                # Communicate error and wait before trying the chunk again
                print(f'{error} \n Program will wait {retry_wait} seconds and try again. \n')
                time.sleep(retry_wait)

        if chunk_frame is None:
            skipped_chunks.append(i)
        else:
            extracted_frames.append(chunk_frame)
            # Communicate process
            print(f'Chunk {i} successfully appended to list_of_Extracted_dfs.')

        if (i + 1) != total_chunks:
            # Communicate process, then wait before the next chunk
            print(f'Wait {sleep} seconds before processing next chunk. \n')
            time.sleep(sleep)
        else:
            print('All done.')

    if skipped_chunks:
        print(f'Chunks skipped after repeated failures: {skipped_chunks}')

    # pd.concat raises ValueError on an empty list
    if not extracted_frames:
        return pd.DataFrame()

    # Concatenate and return
    return pd.concat(extracted_frames, ignore_index=True)

In [12]:
extracted = Fix_executeExtractor(chunk_list,65)

429 Client Error: Too Many Requests for url: https://api.coingecko.com/api/v3/coins/01coin/history?date=17-02-2022 
 Program will wait for a minute and try again. 

Chunk 0 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 1 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 2 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 3 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 4 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 5 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 6 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 7 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chu

Chunk 66 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

('Connection aborted.', OSError("(65, 'EHOSTUNREACH')")) 
 Program will wait for a minute and try again. 

Chunk 67 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

('Connection aborted.', OSError("(54, 'ECONNRESET')")) 
 Program will wait for a minute and try again. 

HTTPSConnectionPool(host='api.coingecko.com', port=443): Read timed out. (read timeout=120) 
 Program will wait for a minute and try again. 

Chunk 68 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 69 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 70 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 71 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 72 success

Chunk 141 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 142 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 143 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 144 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 145 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 146 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 147 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 148 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 149 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 150 successfully appended to list_of_Ext

Chunk 213 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 214 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 215 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 216 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 217 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 218 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 219 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 220 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 221 successfully appended to list_of_Extracted_dfs.
Wait 65 seconds before processing next chunk. 

Chunk 222 successfully appended to list_of_Ext

In [13]:
extracted

Unnamed: 0,id,twitter_followers,reddit_subs,forks,stars,github_subs,total_issues,closed_issues
0,10084-grayton,29577,,,,,,
1,10604-somerset,29577,,,,,,
2,10612-somerset,29577,,,,,,
3,10616-mckinney,29577,,,,,,
4,10629-mckinney,29577,,,,,,
...,...,...,...,...,...,...,...,...
4902,zionomics,2442,,,,,,
4903,zipmex-token,7037,,,,,,
4904,ziticoin,982,,,,,,
4905,zkspace,112163,,,,,,


#### Top 1000 in twitter_followers

In [36]:
# Rank tokens by Twitter audience, largest first, and keep the leading 1000 rows.
ranked_by_followers = extracted.sort_values('twitter_followers', ascending=False)
topk_TF = ranked_by_followers.head(1000)
topk_TF

Unnamed: 0,id,twitter_followers,reddit_subs,forks,stars,github_subs,total_issues,closed_issues
1593,ethdown,7701962,801536.0,,,,,
1620,ethup,7701962,801559.0,,,,,
988,coinbase-stock,4845643,200016.0,,,,,
989,coinbase-stock-bittrex,4845643,200017.0,,,,,
628,bitcoin,4694036,3914728.0,31613.0,61947.0,3905.0,6730.0,6116.0
...,...,...,...,...,...,...,...,...
2133,hydra-token,50179,521.0,2.0,15.0,4.0,1.0,0.0
1330,diamond-boyz-coin,50178,,4.0,4.0,1.0,0.0,0.0
4630,warrior-token,50177,,,,,,
354,ardana,50058,1101.0,,,,,


**Final Considerations**
* If you actually check the number of twitter followers for ethdown in the coingecko website, you'll find it links to Binance twitter account (as ethdown is a derivative housed by Binance). 

* In the next script of this workflow, we'll filter out derivative tokens (futures, options, perpetuals, swaps), as they are not crypto projects in and of themselves.