In [1]:
# Import libraries (standard library first, then third-party)
import os

# pyplot is the plotting interface; the bare `matplotlib` package exposes no
# plot()/subplots(), so aliasing it as `plt` would break any plotting call.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests as rq
import seaborn as sns
from dotenv import load_dotenv

# Load environment variables from a .env file so the key is never hard-coded
load_dotenv()

# CoinGecko API key; os.getenv returns None when the variable is not set
api_key = os.getenv("COINGECKO_API_KEY")

In [None]:
# Get the full list of Solana tokens from the CoinGecko token-list endpoint
url = "https://api.coingecko.com/api/v3/token_lists/solana/all.json"

headers = {"x-cg-demo-api-key": api_key}

# A timeout keeps the cell from hanging indefinitely on a stalled connection
response = rq.get(url, headers=headers, timeout=30)

# Stop here on a 4xx/5xx response (same check as the price request below)
# instead of parsing an error payload as if it were the token list.
response.raise_for_status()

json_data = response.json()


{'name': 'CoinGecko', 'logoURI': 'https://static.coingecko.com/gecko-new.svg', 'keywords': ['defi'], 'timestamp': '2025-09-19T18:02:36.831+00:00', 'tokens': [{'chainId': None, 'address': 'BWhsvkyrUJqVvrAKjGYLpnTuUCG4SPEh6xVKcjnYCi27', 'name': 'HmmOnSOL', 'symbol': 'HMM', 'decimals': 9, 'logoURI': 'https://assets.coingecko.com/coins/images/37221/thumb/hmm_%281%29.jpg?1713774475'}, {'chainId': None, 'address': '48TqCgU8zC2H5tWshNriY2bWHDULSTSvdgL4iP1Fpump', 'name': 'holo', 'symbol': 'HOLO', 'decimals': 6, 'logoURI': 'https://assets.coingecko.com/coins/images/68855/thumb/28wdm154h9vptrr3wc3o7wdh004p.?1756803861'}, {'chainId': None, 'address': 'D3S1AW1Tj1BbQVCo34D9frJDoD81dU8YRCPhbtUUpump', 'name': 'CAPY', 'symbol': 'CAPY', 'decimals': 6, 'logoURI': 'https://assets.coingecko.com/coins/images/56046/thumb/WhatsApp_Image_2025-04-30_at_14.27.04.jpeg?1748156602'}, {'chainId': None, 'address': 'HogxGo1jDwvseBdYNvNBM7UYpsWJPifbH7hM5nCvBWuw', 'name': 'Hog', 'symbol': 'HOG', 'decimals': 9, 'logoURI

In [5]:
# Flatten the JSON payload into a DataFrame: one row per entry of 'tokens'
# (columns prefixed with 'token_'), with the list-level fields repeated on
# every row. The list-level 'name' is renamed so it is not mistaken for a
# token name.
df = pd.json_normalize(
    json_data,
    record_path="tokens",
    meta=["name", "logoURI", "keywords", "timestamp"],
    record_prefix="token_",
).rename(columns={"name": "source_name"})

# Preview the first 5 rows
df.head()


Unnamed: 0,token_chainId,token_address,token_name,token_symbol,token_decimals,token_logoURI,source_name,logoURI,keywords,timestamp
0,,BWhsvkyrUJqVvrAKjGYLpnTuUCG4SPEh6xVKcjnYCi27,HmmOnSOL,HMM,9,https://assets.coingecko.com/coins/images/3722...,CoinGecko,https://static.coingecko.com/gecko-new.svg,[defi],2025-09-19T18:02:36.831+00:00
1,,48TqCgU8zC2H5tWshNriY2bWHDULSTSvdgL4iP1Fpump,holo,HOLO,6,https://assets.coingecko.com/coins/images/6885...,CoinGecko,https://static.coingecko.com/gecko-new.svg,[defi],2025-09-19T18:02:36.831+00:00
2,,D3S1AW1Tj1BbQVCo34D9frJDoD81dU8YRCPhbtUUpump,CAPY,CAPY,6,https://assets.coingecko.com/coins/images/5604...,CoinGecko,https://static.coingecko.com/gecko-new.svg,[defi],2025-09-19T18:02:36.831+00:00
3,,HogxGo1jDwvseBdYNvNBM7UYpsWJPifbH7hM5nCvBWuw,Hog,HOG,9,https://assets.coingecko.com/coins/images/3775...,CoinGecko,https://static.coingecko.com/gecko-new.svg,[defi],2025-09-19T18:02:36.831+00:00
4,,6fnYdoJhYkifvt52pfNtUDr31ZYXmof7JiL9SFrMpump,hit meeee upp,HMU,6,https://assets.coingecko.com/coins/images/3999...,CoinGecko,https://static.coingecko.com/gecko-new.svg,[defi],2025-09-19T18:02:36.831+00:00


In [11]:
# Columns kept for the exploratory analysis
relevant_columns = [
    "token_address",
    "token_name",
    "token_symbol",
    "token_decimals",
    "timestamp",
]

# Narrow the frame to those columns; .copy() gives an independent DataFrame
df = df.loc[:, relevant_columns].copy()

# Confirm which columns remain after filtering
print("Columns after filtering:", df.columns)

# Preview the first 5 rows of the narrowed frame
df.head()


Columns after filtering: Index(['token_address', 'token_name', 'token_symbol', 'token_decimals',
       'timestamp'],
      dtype='object')


Unnamed: 0,token_address,token_name,token_symbol,token_decimals,timestamp
0,BWhsvkyrUJqVvrAKjGYLpnTuUCG4SPEh6xVKcjnYCi27,HmmOnSOL,HMM,9,2025-09-19T18:02:36.831+00:00
1,48TqCgU8zC2H5tWshNriY2bWHDULSTSvdgL4iP1Fpump,holo,HOLO,6,2025-09-19T18:02:36.831+00:00
2,D3S1AW1Tj1BbQVCo34D9frJDoD81dU8YRCPhbtUUpump,CAPY,CAPY,6,2025-09-19T18:02:36.831+00:00
3,HogxGo1jDwvseBdYNvNBM7UYpsWJPifbH7hM5nCvBWuw,Hog,HOG,9,2025-09-19T18:02:36.831+00:00
4,6fnYdoJhYkifvt52pfNtUDr31ZYXmof7JiL9SFrMpump,hit meeee upp,HMU,6,2025-09-19T18:02:36.831+00:00


## Exploratory Data Analysis

In [12]:
# Number of records (rows) in the dataset
df.shape[0]

5148

In [13]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(5148, 5)

In [14]:
# Count missing values in each column
df.isna().sum()

token_address     0
token_name        0
token_symbol      0
token_decimals    0
timestamp         0
dtype: int64

In [15]:
# Count rows that exactly repeat an earlier row (all columns compared)

df.duplicated(keep="first").sum()

0

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; here that is only token_decimals.

df.describe()

Unnamed: 0,token_decimals
count,5148.0
mean,6.763209
std,1.520046
min,0.0
25%,6.0
50%,6.0
75%,8.0
max,18.0


### Get token prices from another endpoint

In [None]:
# Unique contract addresses from the token DataFrame, in their original order
contract_addresses = df['token_address'].drop_duplicates().tolist()

# Only the first 100 addresses are priced while testing, which keeps the
# request small. NOTE(review): batch over the full list once this is validated.
test_addresses = contract_addresses[:100]

# The endpoint takes all addresses as a single comma-separated value
addresses_param = ",".join(test_addresses)

# CoinGecko token-price endpoint for Solana. The query is passed via `params`
# so requests handles URL encoding instead of a hand-built f-string.
url = "https://api.coingecko.com/api/v3/simple/token_price/solana"
params = {
    "contract_addresses": addresses_param,
    "vs_currencies": "usd",
    "include_market_cap": "true",
    "include_24hr_vol": "true",
    "include_24hr_change": "true",
}
headers = {"x-cg-demo-api-key": api_key}

# Make the API call; the timeout prevents the cell from hanging indefinitely
response = rq.get(url, params=params, headers=headers, timeout=30)
response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
price_data = response.json()

# The response is a dict keyed by contract address, so orient='index' yields
# one row per token; the index is then promoted to a 'token_address' column
# so the frame can be merged with the token metadata.
df_prices = (
    pd.DataFrame.from_dict(price_data, orient='index')
    .rename_axis('token_address')
    .reset_index()
)

# Rename columns to be more descriptive and consistent
df_prices = df_prices.rename(columns={
    'usd': 'price_usd',
    'usd_market_cap': 'market_cap_usd',
    'usd_24h_vol': '24h_volume_usd',
    'usd_24h_change': '24h_price_change_percentage'
})

# Columns to keep, in display order
desired_cols = [
    'token_address',
    'price_usd',
    'market_cap_usd',
    '24h_volume_usd',
    '24h_price_change_percentage'
]

# reindex (rather than df_prices[desired_cols]) so that a field missing from
# the whole response, or an empty response, produces NaN columns instead of a
# KeyError.
df_prices = df_prices.reindex(columns=desired_cols)

print("\nNew price data DataFrame:")
df_prices.head()




API response received.

New price data DataFrame:


Unnamed: 0,token_address,price_usd,market_cap_usd,24h_volume_usd,24h_price_change_percentage
0,BWhsvkyrUJqVvrAKjGYLpnTuUCG4SPEh6xVKcjnYCi27,0.000106,0.0,120.0209,-2.702308
1,48TqCgU8zC2H5tWshNriY2bWHDULSTSvdgL4iP1Fpump,9.6e-05,95585.479708,1120518.0,-9.607765
2,D3S1AW1Tj1BbQVCo34D9frJDoD81dU8YRCPhbtUUpump,1.5e-05,15125.093661,48.58999,-6.191488
3,HogxGo1jDwvseBdYNvNBM7UYpsWJPifbH7hM5nCvBWuw,8e-06,0.0,1.686757,
4,6fnYdoJhYkifvt52pfNtUDr31ZYXmof7JiL9SFrMpump,3.7e-05,36701.43516,227.9902,-2.856205


In [34]:
# Join token metadata onto the price data by 'token_address'. The right join
# keeps every row of df_prices, so the result holds only the priced tokens.
df_merged = df.merge(df_prices, how='right', on='token_address')

print("\nMerged DataFrame:")
df_merged.head()


Merged DataFrame:


Unnamed: 0,token_address,token_name,token_symbol,token_decimals,timestamp,price_usd,market_cap_usd,24h_volume_usd,24h_price_change_percentage
0,BWhsvkyrUJqVvrAKjGYLpnTuUCG4SPEh6xVKcjnYCi27,HmmOnSOL,HMM,9,2025-09-19T18:02:36.831+00:00,0.000106,0.0,120.0209,-2.702308
1,48TqCgU8zC2H5tWshNriY2bWHDULSTSvdgL4iP1Fpump,holo,HOLO,6,2025-09-19T18:02:36.831+00:00,9.6e-05,95585.479708,1120518.0,-9.607765
2,D3S1AW1Tj1BbQVCo34D9frJDoD81dU8YRCPhbtUUpump,CAPY,CAPY,6,2025-09-19T18:02:36.831+00:00,1.5e-05,15125.093661,48.58999,-6.191488
3,HogxGo1jDwvseBdYNvNBM7UYpsWJPifbH7hM5nCvBWuw,Hog,HOG,9,2025-09-19T18:02:36.831+00:00,8e-06,0.0,1.686757,
4,6fnYdoJhYkifvt52pfNtUDr31ZYXmof7JiL9SFrMpump,hit meeee upp,HMU,6,2025-09-19T18:02:36.831+00:00,3.7e-05,36701.43516,227.9902,-2.856205


In [35]:
# Dimensions of the merged dataset as a (rows, columns) tuple
df_merged.shape

(97, 9)

In [None]:
# Summary statistics for the numeric columns of the merged dataset; a column's
# count falls below the row count when it contains missing values.
df_merged.describe()

Unnamed: 0,token_decimals,price_usd,market_cap_usd,24h_volume_usd,24h_price_change_percentage
count,97.0,97.0,97.0,97.0,86.0
mean,6.515464,2.928746,9121986.0,814643.3,-5.23109
std,1.568661,27.46195,48324890.0,5266136.0,6.698998
min,0.0,7.50422e-13,0.0,0.0,-34.768045
25%,6.0,1.358e-05,9237.666,8.001775,-6.599819
50%,6.0,4.671e-05,26162.22,98.88648,-5.252254
75%,6.0,0.00023925,141534.3,2641.046,-2.740782
max,9.0,270.3,406379800.0,44737270.0,23.006717
