# **Step 1 --- Fetch Data From Binance**

In [2]:
import os # This library assists with OS interaction
import requests # Handles HTTP requests with ease
#print(requests.__version__) # Check the version of requests library installed
import pandas as pd

In [3]:
# Function to fetch data using the Binance API and store it in a Pandas Dataframe
def fetch_binance(symbol="BTCUSDT", interval="1d", limit=1000, timeout=10):
    """Download klines from the Binance REST API into a DataFrame.

    Parameters
    ----------
    symbol : str
        Trading pair sent as the ``symbol`` query parameter.
    interval : str
        Kline interval sent as the ``interval`` query parameter.
    limit : int
        Row limit sent as the ``limit`` query parameter.
    timeout : float
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per kline with the twelve named columns below. Values are
        left exactly as parsed from the JSON payload; no type conversion
        is done here. An empty payload yields an empty frame that still
        carries the twelve columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the decoded payload is not a list of rows.
    """
    url = "https://api.binance.com/api/v3/klines"
    params = {"symbol": symbol, "interval": interval, "limit": limit}
    column_names = ["open_time", "open", "high", "low", "close", "volume",
                    "close_time", "quote_asset_volume", "num_trades",
                    "taker_base_volume", "taker_quote_volume", "ignore"]

    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to tabulate an error body.
    response.raise_for_status()

    data = response.json()
    if not isinstance(data, list):
        raise ValueError(f"Unexpected klines payload: {data!r}")

    # Passing `columns=` (rather than assigning afterwards) also works when
    # the API returns zero rows.
    return pd.DataFrame(data, columns=column_names)

In [4]:
# Fetch daily BTCUSDT klines (limit=1000) and preview the first rows
df = fetch_binance(symbol="BTCUSDT", interval="1d", limit=1000)
print(df.head())

       open_time            open            high             low  \
0  1678406400000  20362.21000000  20367.78000000  19549.09000000   
1  1678492800000  20150.69000000  20686.51000000  19765.03000000   
2  1678579200000  20455.73000000  22150.00000000  20270.60000000   
3  1678665600000  21998.05000000  24500.00000000  21813.88000000   
4  1678752000000  24112.27000000  26386.87000000  23976.42000000   

            close           volume     close_time    quote_asset_volume  \
0  20150.69000000  618456.46710000  1678492799999  12344992788.91249570   
1  20455.73000000  427831.82133000  1678579199999   8651590672.46852580   
2  21997.11000000  430944.94288000  1678665599999   8982418413.80113910   
3  24113.48000000  687889.31259000  1678751999999  15824996078.60563840   
4  24670.41000000  699360.93423000  1678838399999  17465307097.88407330   

   num_trades taker_base_volume   taker_quote_volume ignore  
0    12106261   308155.80993000  6151475123.53551420      0  
1    10412300   

In [5]:
# Inspect column dtypes and non-null counts.
# `DataFrame.info()` prints its own report and returns None, so it must not be
# wrapped in print() -- doing so emits a stray "None" after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   open_time           1000 non-null   int64 
 1   open                1000 non-null   object
 2   high                1000 non-null   object
 3   low                 1000 non-null   object
 4   close               1000 non-null   object
 5   volume              1000 non-null   object
 6   close_time          1000 non-null   int64 
 7   quote_asset_volume  1000 non-null   object
 8   num_trades          1000 non-null   int64 
 9   taker_base_volume   1000 non-null   object
 10  taker_quote_volume  1000 non-null   object
 11  ignore              1000 non-null   object
dtypes: int64(3), object(9)
memory usage: 93.9+ KB
None


In [6]:
# Function to save the fetched data to a CSV file
def save_raw_csv(df, symbol, interval, out_dir="data/raw"):
    """Write ``df`` to ``<out_dir>/<symbol>_<interval>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; the index is not written.
    symbol : str
        First part of the file name.
    interval : str
        Second part of the file name.
    out_dir : str
        Destination directory, created if it does not exist. Defaults to
        ``"data/raw"``.

    Prints the path of the file that was written.
    """
    os.makedirs(out_dir, exist_ok=True)
    file_path = f"{out_dir}/{symbol}_{interval}.csv"
    df.to_csv(file_path, index=False)
    print(f"Saved: {file_path}")

# Persist the frame as fetched, before any cleaning is applied to it
save_raw_csv(df, "BTCUSDT", "1d")


Saved: data/raw/BTCUSDT_1d.csv


# **Step 2 --- Data Cleaning & Basic Processing**

In [7]:
# Starting point for cleaning: review the dtypes of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   open_time           1000 non-null   int64 
 1   open                1000 non-null   object
 2   high                1000 non-null   object
 3   low                 1000 non-null   object
 4   close               1000 non-null   object
 5   volume              1000 non-null   object
 6   close_time          1000 non-null   int64 
 7   quote_asset_volume  1000 non-null   object
 8   num_trades          1000 non-null   int64 
 9   taker_base_volume   1000 non-null   object
 10  taker_quote_volume  1000 non-null   object
 11  ignore              1000 non-null   object
dtypes: int64(3), object(9)
memory usage: 93.9+ KB


In [8]:
# Count the distinct values in `ignore` to see whether the column carries any information
df["ignore"].value_counts()

ignore
0    1000
Name: count, dtype: int64

In [9]:
# Drop the `ignore` column.
# `errors="ignore"` makes the cell safe to re-run: once the column is gone,
# a plain drop would raise KeyError on the second execution.
df = df.drop(columns="ignore", errors="ignore")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   open_time           1000 non-null   int64 
 1   open                1000 non-null   object
 2   high                1000 non-null   object
 3   low                 1000 non-null   object
 4   close               1000 non-null   object
 5   volume              1000 non-null   object
 6   close_time          1000 non-null   int64 
 7   quote_asset_volume  1000 non-null   object
 8   num_trades          1000 non-null   int64 
 9   taker_base_volume   1000 non-null   object
 10  taker_quote_volume  1000 non-null   object
dtypes: int64(3), object(8)
memory usage: 86.1+ KB


In [10]:
# Look at the last rows of the frame
df.tail()

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,num_trades,taker_base_volume,taker_quote_volume
995,1764374400000,90890.71,91165.65,90155.47,90802.44,7429.88291,1764460799999,673725652.9787655,1677860,3359.04932,304614219.1452583
996,1764460800000,90802.44,92000.01,90336.9,90360.0,9687.74175,1764547199999,884051773.1655338,2154030,5002.96698,456606490.1203246
997,1764547200000,90360.01,90417.0,83822.76,86286.01,34509.01227,1764633599999,2977024047.2231607,7709685,15524.33346,1338718794.0013125
998,1764633600000,86286.01,92307.65,86184.39,91277.88,28210.22732,1764719999999,2519515177.662365,6397299,13512.33949,1206199898.2941513
999,1764720000000,91277.88,93958.58,90990.23,93254.78,15820.097,1764806399999,1466982880.1122732,3997706,8161.86719,756882880.8297262


In [11]:
# Timestamps arrive as integer milliseconds; turn both into datetime columns
for time_col in ("open_time", "close_time"):
    df[time_col] = pd.to_datetime(df[time_col], unit="ms")

In [12]:
# Verify the dtypes of open_time / close_time after the datetime conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open_time           1000 non-null   datetime64[ns]
 1   open                1000 non-null   object        
 2   high                1000 non-null   object        
 3   low                 1000 non-null   object        
 4   close               1000 non-null   object        
 5   volume              1000 non-null   object        
 6   close_time          1000 non-null   datetime64[ns]
 7   quote_asset_volume  1000 non-null   object        
 8   num_trades          1000 non-null   int64         
 9   taker_base_volume   1000 non-null   object        
 10  taker_quote_volume  1000 non-null   object        
dtypes: datetime64[ns](2), int64(1), object(8)
memory usage: 86.1+ KB


In [13]:
# Cast every numeric field to float, one column at a time
float_columns = [
    "open", "high", "low", "close", "volume",
    "quote_asset_volume", "num_trades",
    "taker_base_volume", "taker_quote_volume",
]
for column in float_columns:
    df[column] = df[column].astype(float)


In [14]:
# Verify the dtypes of the numeric columns after the float conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open_time           1000 non-null   datetime64[ns]
 1   open                1000 non-null   float64       
 2   high                1000 non-null   float64       
 3   low                 1000 non-null   float64       
 4   close               1000 non-null   float64       
 5   volume              1000 non-null   float64       
 6   close_time          1000 non-null   datetime64[ns]
 7   quote_asset_volume  1000 non-null   float64       
 8   num_trades          1000 non-null   float64       
 9   taker_base_volume   1000 non-null   float64       
 10  taker_quote_volume  1000 non-null   float64       
dtypes: datetime64[ns](2), float64(9)
memory usage: 86.1 KB


In [15]:
# Function to save the cleaned (processed) data to a CSV file
def save_processed_csv(df, symbol, interval, out_dir="data/processed"):
    """Write ``df`` to ``<out_dir>/<symbol>_<interval>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; the index is not written.
    symbol : str
        First part of the file name.
    interval : str
        Second part of the file name.
    out_dir : str
        Destination directory, created if it does not exist. Defaults to
        ``"data/processed"``.

    Prints the path of the file that was written.
    """
    os.makedirs(out_dir, exist_ok=True)
    file_path = f"{out_dir}/{symbol}_{interval}.csv"
    df.to_csv(file_path, index=False)
    print(f"Saved: {file_path}")

# Persist the cleaned frame (datetime + float columns) separately from the raw dump
save_processed_csv(df, "BTCUSDT", "1d")

Saved: data/processed/BTCUSDT_1d.csv
