## Import the libraries

In [5]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf

## Download Bitcoin data ( 5 years, daily)

In [6]:
ticker = "BTC-USD"
period = "5y"
interval = "1d"

df_raw = yf.download(ticker, period=period, interval=interval, auto_adjust=False)

print("Downloaded shape:", df_raw.shape)
df_raw.head()

[*********************100%***********************]  1 of 1 completed

Downloaded shape: (1827, 6)





Price,Adj Close,Close,High,Low,Open,Volume
Ticker,BTC-USD,BTC-USD,BTC-USD,BTC-USD,BTC-USD,BTC-USD
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2021-02-19,55888.132812,55888.132812,56113.652344,50937.277344,51675.980469,63495496918
2021-02-20,56099.519531,56099.519531,57505.226562,54626.558594,55887.335938,68145460026
2021-02-21,57539.945312,57539.945312,58330.570312,55672.609375,56068.566406,51897585191
2021-02-22,54207.320312,54207.320312,57533.390625,48967.566406,57532.738281,92052420332
2021-02-23,48824.425781,48824.425781,54204.929688,45290.589844,54204.929688,106102492824


## Quick Check

In [7]:
df_raw.tail()


Price,Adj Close,Close,High,Low,Open,Volume
Ticker,BTC-USD,BTC-USD,BTC-USD,BTC-USD,BTC-USD,BTC-USD
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2026-02-15,68788.1875,68788.1875,70939.289062,68052.546875,69764.953125,40191152750
2026-02-16,68843.15625,68843.15625,70067.234375,67301.585938,68782.398438,33618145426
2026-02-17,67494.21875,67494.21875,69201.867188,66615.28125,68843.09375,34866936040
2026-02-18,66425.320312,66425.320312,68434.429688,65845.898438,67488.023438,33094301643
2026-02-19,66329.359375,66329.359375,67195.507812,65722.882812,66441.851562,33258504192


In [8]:
df_raw.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1827 entries, 2021-02-19 to 2026-02-19
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   (Adj Close, BTC-USD)  1827 non-null   float64
 1   (Close, BTC-USD)      1827 non-null   float64
 2   (High, BTC-USD)       1827 non-null   float64
 3   (Low, BTC-USD)        1827 non-null   float64
 4   (Open, BTC-USD)       1827 non-null   float64
 5   (Volume, BTC-USD)     1827 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 99.9 KB


## keep only the closing price

In [9]:
# Keep only Close
df = df_raw[["Close"]].copy()

# Make sure index is datetime
df.index = pd.to_datetime(df.index)

# Sort by date (important for time series)
df = df.sort_index()

df.head()

Price,Close
Ticker,BTC-USD
Date,Unnamed: 1_level_2
2021-02-19,55888.132812
2021-02-20,56099.519531
2021-02-21,57539.945312
2021-02-22,54207.320312
2021-02-23,48824.425781


## Data quality check
We will check missing values, Duplicated timestamps, basic stats

In [10]:
print("Missing value:", df.isna().sum())

Missing value: Price  Ticker 
Close  BTC-USD    0
dtype: int64


In [11]:
print("Duplicate date:", df.index.duplicated().sum())

Duplicate date: 0


In [13]:
print("Basic Stats:", df.describe())

Basic Stats: Price           Close
Ticker        BTC-USD
count     1827.000000
mean     55635.185006
std      29714.195527
min      15787.284180
25%      29412.204102
50%      48176.347656
75%      72870.097656
max     124752.531250


## Save the cleaned data set 

In [16]:
os.makedirs("data", exist_ok=True)
output_path = "data/btc_cleaned.csv"
df.to_csv(output_path)

print("Saved cleaned dataset to:", output_path)
df.head()


Saved cleaned dataset to: data/btc_cleaned.csv


Price,Close
Ticker,BTC-USD
Date,Unnamed: 1_level_2
2021-02-19,55888.132812
2021-02-20,56099.519531
2021-02-21,57539.945312
2021-02-22,54207.320312
2021-02-23,48824.425781


In the note book 1,
- import the libraries
- download the dataset
- quick insepct the dataset
- quality check the dataset
- Save the cleaned dataset 