# **Step 1 --- Fetch Data From Binance**

In [1]:
import os
import requests
#print(requests.__version__)
import pandas as pd

In [2]:
def fetch_binance(symbol="BTCUSDT", interval="1d", limit=1000, timeout=10):
    """Download kline (candlestick) rows from the Binance REST API.

    Parameters
    ----------
    symbol : str
        Trading pair sent as the ``symbol`` query parameter.
    interval : str
        Kline interval sent as the ``interval`` query parameter.
    limit : int
        Value sent as the ``limit`` query parameter.
    timeout : float
        Seconds to wait for the HTTP request before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per kline with the twelve named columns below. Values are
        left exactly as decoded from the JSON payload (no dtype conversion).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the decoded payload is not a list of rows.
    """
    columns = ["open_time", "open", "high", "low", "close", "volume",
               "close_time", "quote_asset_volume", "num_trades",
               "taker_base_volume", "taker_quote_volume", "ignore"]
    url = "https://api.binance.com/api/v3/klines"
    params = {"symbol": symbol, "interval": interval, "limit": limit}

    response = requests.get(url, params=params, timeout=timeout)
    # Surface HTTP failures here rather than as an unrelated pandas error below.
    response.raise_for_status()

    data = response.json()
    if not isinstance(data, list):
        raise ValueError(f"Unexpected response from {url}: {data!r}")

    # Passing the names to the constructor also handles an empty result,
    # which would otherwise fail when assigning twelve names to zero columns.
    return pd.DataFrame(data, columns=columns)

In [3]:
# Fetch up to 1000 daily BTCUSDT klines and preview the first rows.
df = fetch_binance(symbol="BTCUSDT", interval="1d", limit=1000)
print(df.head())

       open_time            open            high             low  \
0  1677801600000  23465.32000000  23476.95000000  21971.13000000   
1  1677888000000  22354.34000000  22410.00000000  22157.08000000   
2  1677974400000  22346.57000000  22662.09000000  22189.22000000   
3  1678060800000  22430.24000000  22602.19000000  22258.00000000   
4  1678147200000  22409.41000000  22557.91000000  21927.00000000   

            close           volume     close_time   quote_asset_volume  \
0  22354.34000000  319954.19785000  1677887999999  7167184765.74364950   
1  22346.57000000  121257.38132000  1677974399999  2706422995.68025610   
2  22430.24000000  154841.75786000  1678060799999  3473011455.18795160   
3  22410.00000000  203751.82957000  1678147199999  4569102169.18569090   
4  22197.96000000  292519.80912000  1678233599999  6517594938.24605280   

   num_trades taker_base_volume   taker_quote_volume ignore  
0     8214639   156827.31366000  3512245357.18619130      0  
1     4169260    60043

In [4]:
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() adds a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   open_time           1000 non-null   int64 
 1   open                1000 non-null   object
 2   high                1000 non-null   object
 3   low                 1000 non-null   object
 4   close               1000 non-null   object
 5   volume              1000 non-null   object
 6   close_time          1000 non-null   int64 
 7   quote_asset_volume  1000 non-null   object
 8   num_trades          1000 non-null   int64 
 9   taker_base_volume   1000 non-null   object
 10  taker_quote_volume  1000 non-null   object
 11  ignore              1000 non-null   object
dtypes: int64(3), object(9)
memory usage: 93.9+ KB
None


In [5]:
def save_raw_csv(df, symbol, interval, out_dir="data/raw"):
    """Write a DataFrame to ``<out_dir>/<symbol>_<interval>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; the index is not written.
    symbol : str
        First part of the file name.
    interval : str
        Second part of the file name.
    out_dir : str
        Target directory, created (including parents) if missing.
        Defaults to ``data/raw``.

    Returns
    -------
    None
        The destination path is printed rather than returned.
    """
    os.makedirs(out_dir, exist_ok=True)
    file_path = f"{out_dir}/{symbol}_{interval}.csv"
    df.to_csv(file_path, index=False)
    print(f"Saved: {file_path}")

# Persist the untouched API response before any cleaning.
save_raw_csv(df, symbol="BTCUSDT", interval="1d")


Saved: data/raw/BTCUSDT_1d.csv


# **Step 2 --- Data Cleaning & Basic Processing**

In [7]:
# Inspect column dtypes and non-null counts before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   open_time           1000 non-null   int64 
 1   open                1000 non-null   object
 2   high                1000 non-null   object
 3   low                 1000 non-null   object
 4   close               1000 non-null   object
 5   volume              1000 non-null   object
 6   close_time          1000 non-null   int64 
 7   quote_asset_volume  1000 non-null   object
 8   num_trades          1000 non-null   int64 
 9   taker_base_volume   1000 non-null   object
 10  taker_quote_volume  1000 non-null   object
 11  ignore              1000 non-null   object
dtypes: int64(3), object(9)
memory usage: 93.9+ KB


In [8]:
# Count how often each distinct value occurs in the 'ignore' column.
df["ignore"].value_counts()


ignore
0    1000
Name: count, dtype: int64

In [9]:
# Remove the 'ignore' column. errors="ignore" keeps this cell safe to re-run:
# a second execution finds the column already gone instead of raising KeyError.
df = df.drop(columns="ignore", errors="ignore")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   open_time           1000 non-null   int64 
 1   open                1000 non-null   object
 2   high                1000 non-null   object
 3   low                 1000 non-null   object
 4   close               1000 non-null   object
 5   volume              1000 non-null   object
 6   close_time          1000 non-null   int64 
 7   quote_asset_volume  1000 non-null   object
 8   num_trades          1000 non-null   int64 
 9   taker_base_volume   1000 non-null   object
 10  taker_quote_volume  1000 non-null   object
dtypes: int64(3), object(8)
memory usage: 86.1+ KB


In [10]:
# Show the last rows of the frame (tail() defaults to five).
df.tail()

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,num_trades,taker_base_volume,taker_quote_volume
995,1763769600000,85129.42,85620.0,83500.0,84739.74,14193.93263,1763855999999,1197657354.9690385,4697593,6688.79428,564562148.9373932
996,1763856000000,84739.75,88127.64,84667.57,86830.0,19734.46418,1763942399999,1708685030.2798696,5063493,10406.51703,901210513.5204473
997,1763942400000,86830.0,89228.0,85272.0,88300.01,24663.12795,1764028799999,2150393686.9203186,6189156,12438.87518,1085485389.0234168
998,1764028800000,88300.01,88519.99,86116.0,87369.96,19567.0411,1764115199999,1708989000.038937,4838747,9423.17985,823123525.1402558
999,1764115200000,87369.97,88224.0,86306.77,87830.16,11998.43031,1764201599999,1047400643.4027432,2952754,5878.35646,513199644.58481365


In [11]:
# Both timestamp columns are interpreted as epoch milliseconds (unit="ms")
# and replaced with datetime values.
for _time_col in ("open_time", "close_time"):
    df[_time_col] = pd.to_datetime(df[_time_col], unit="ms")

In [12]:
# Re-check dtypes after the timestamp conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open_time           1000 non-null   datetime64[ns]
 1   open                1000 non-null   object        
 2   high                1000 non-null   object        
 3   low                 1000 non-null   object        
 4   close               1000 non-null   object        
 5   volume              1000 non-null   object        
 6   close_time          1000 non-null   datetime64[ns]
 7   quote_asset_volume  1000 non-null   object        
 8   num_trades          1000 non-null   int64         
 9   taker_base_volume   1000 non-null   object        
 10  taker_quote_volume  1000 non-null   object        
dtypes: datetime64[ns](2), int64(1), object(8)
memory usage: 86.1+ KB


In [None]:
# Cast every price, volume and trade-count column to float in one call.
df = df.astype({
    "open": float,
    "high": float,
    "low": float,
    "close": float,
    "volume": float,
    "quote_asset_volume": float,
    "num_trades": float,
    "taker_base_volume": float,
    "taker_quote_volume": float,
})


In [16]:
# Confirm the dtypes after the float casts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open_time           1000 non-null   datetime64[ns]
 1   open                1000 non-null   float64       
 2   high                1000 non-null   float64       
 3   low                 1000 non-null   float64       
 4   close               1000 non-null   float64       
 5   volume              1000 non-null   float64       
 6   close_time          1000 non-null   datetime64[ns]
 7   quote_asset_volume  1000 non-null   float64       
 8   num_trades          1000 non-null   float64       
 9   taker_base_volume   1000 non-null   float64       
 10  taker_quote_volume  1000 non-null   float64       
dtypes: datetime64[ns](2), float64(9)
memory usage: 86.1 KB


In [18]:
def save_processed_csv(df, symbol, interval, out_dir="data/processed"):
    """Write a DataFrame to ``<out_dir>/<symbol>_<interval>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; the index is not written.
    symbol : str
        First part of the file name.
    interval : str
        Second part of the file name.
    out_dir : str
        Target directory, created (including parents) if missing.
        Defaults to ``data/processed``.

    Returns
    -------
    None
        The destination path is printed rather than returned.
    """
    os.makedirs(out_dir, exist_ok=True)
    file_path = f"{out_dir}/{symbol}_{interval}.csv"
    df.to_csv(file_path, index=False)
    print(f"Saved: {file_path}")

# Persist the cleaned, typed frame alongside the raw export.
save_processed_csv(df, symbol="BTCUSDT", interval="1d")

Saved: data/processed/BTCUSDT_1d.csv
