# Data Preprocessing
- Data Collection
- Data Cleaning
- Data Manipulation / Transformation

In [1]:
# Importing Dependencies
import yfinance as yf
import pandas as pd
import numpy as np

In [2]:
# Pull the price history for the chosen ticker through yfinance
ticker_symbol = "BBCA.JK"
ticker = yf.Ticker(ticker_symbol)
# NOTE(review): period="5y" looks like a trailing window, so the rows
# returned presumably depend on the day the cell is run — confirm.
stock_data = ticker.history(period="5y")
stock_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-07-28 00:00:00+07:00,5417.027249,5487.838063,5363.919139,5474.561035,65279000,0.0,0.0
2020-07-29 00:00:00+07:00,5474.561449,5474.561449,5399.324954,5430.304688,50262500,0.0,0.0
2020-07-30 00:00:00+07:00,5430.304936,5523.244141,5359.494114,5523.244141,80533500,0.0,0.0
2020-08-03 00:00:00+07:00,5505.540581,5518.817608,5155.912200,5425.878418,137079000,0.0,0.0
2020-08-04 00:00:00+07:00,5448.007502,5505.541294,5425.879121,5496.689941,79357000,0.0,0.0
...,...,...,...,...,...,...,...
2025-07-22 00:00:00+07:00,8450.000000,8475.000000,8400.000000,8400.000000,77981700,0.0,0.0
2025-07-23 00:00:00+07:00,8450.000000,8500.000000,8400.000000,8425.000000,70969500,0.0,0.0
2025-07-24 00:00:00+07:00,8475.000000,8550.000000,8450.000000,8500.000000,138256800,0.0,0.0
2025-07-25 00:00:00+07:00,8500.000000,8525.000000,8450.000000,8450.000000,58682800,0.0,0.0


In [3]:
# Drop the unrelated columns; drop() hands back a new frame, so
# stock_data itself is left untouched.
df = stock_data.drop(columns=["Dividends", "Stock Splits"])
df.head()


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-07-28 00:00:00+07:00,5417.027249,5487.838063,5363.919139,5474.561035,65279000
2020-07-29 00:00:00+07:00,5474.561449,5474.561449,5399.324954,5430.304688,50262500
2020-07-30 00:00:00+07:00,5430.304936,5523.244141,5359.494114,5523.244141,80533500
2020-08-03 00:00:00+07:00,5505.540581,5518.817608,5155.9122,5425.878418,137079000
2020-08-04 00:00:00+07:00,5448.007502,5505.541294,5425.879121,5496.689941,79357000


In [4]:
# Count the missing values in each column
df.isnull().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [5]:
# Fraction of rows left after removing duplicated rows (1.0 means no duplicates).
# drop_duplicates() compares column values only; the index is not part of the comparison.
df.drop_duplicates().shape[0] / df.shape[0]

1.0

In [6]:
# Move the Date index into a regular column and reduce each timestamp
# to a plain calendar date (datetime.date objects).
df = df.reset_index().assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]).dt.date
)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2020-07-28,5417.027249,5487.838063,5363.919139,5474.561035,65279000
1,2020-07-29,5474.561449,5474.561449,5399.324954,5430.304688,50262500
2,2020-07-30,5430.304936,5523.244141,5359.494114,5523.244141,80533500
3,2020-08-03,5505.540581,5518.817608,5155.9122,5425.878418,137079000
4,2020-08-04,5448.007502,5505.541294,5425.879121,5496.689941,79357000


In [7]:
# Put the Date column back as the row index
df = df.set_index(keys="Date")
df.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-07-28,5417.027249,5487.838063,5363.919139,5474.561035,65279000
2020-07-29,5474.561449,5474.561449,5399.324954,5430.304688,50262500
2020-07-30,5430.304936,5523.244141,5359.494114,5523.244141,80533500
2020-08-03,5505.540581,5518.817608,5155.9122,5425.878418,137079000
2020-08-04,5448.007502,5505.541294,5425.879121,5496.689941,79357000


In [None]:
def dataframe(frame=None, path="BBCA.JK.csv"):
    """Write a DataFrame to a CSV file.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to export. When omitted, the notebook-level ``df`` is used,
        which keeps the original zero-argument call working unchanged.
    path : str or path-like, optional
        Destination file. Defaults to "BBCA.JK.csv".

    Returns
    -------
    None
        ``DataFrame.to_csv`` returns None when it is given a destination
        path, so there is no value to display after the call.
    """
    if frame is None:
        # Looked up at call time so a bare dataframe() exports the current global df.
        frame = df
    return frame.to_csv(path)

# Export the prepared frame to BBCA.JK.csv
dataframe()