## **Import & Setup**

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split

## **1. Load Dataset**

In [2]:
# Read the price history CSV, parse the 'date' column as datetimes,
# and order the rows chronologically with a fresh 0..n-1 index.
df = (
    pd.read_csv("data_saham_bbri_jk.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

# Quick look at the first rows
print(df.head())

        date        close         high          low         open     volume
0 2020-01-02  2960.843262  2960.843262  2927.273539  2954.129281   45886302
1 2020-01-03  2967.556885  2980.984845  2947.415125  2967.556885   91189705
2 2020-01-06  2933.987793  2947.415575  2900.418067  2927.273812   48648450
3 2020-01-07  2954.129395  2960.843375  2940.701613  2960.843375  114344885
4 2020-01-08  2940.701416  2954.129196  2913.845675  2940.701416  188929583


## **2. Data Cleaning**

In [3]:
# Columns that must hold numeric values
numeric_cols = ['open', 'high', 'low', 'close', 'volume']

# Remove rows that contain any missing value
df = df.dropna()

# Force each price/volume column to a numeric dtype;
# entries that cannot be parsed become NaN (errors='coerce')
for column_name in numeric_cols:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

# Remove rows where the coercion above produced NaN
df = df.dropna(subset=numeric_cols)

# Remove duplicated dates (the first row per date is kept), then renumber the index
df = df.drop_duplicates(subset=['date'])
df = df.reset_index(drop=True)

# Work on an independent copy from here on and persist the cleaned data
df_clean = df.copy()
df_clean.to_csv("df_clean_bbri.csv", index=False)
df_clean.head()

Unnamed: 0,date,close,high,low,open,volume
0,2020-01-02,2960.843262,2960.843262,2927.273539,2954.129281,45886302
1,2020-01-03,2967.556885,2980.984845,2947.415125,2967.556885,91189705
2,2020-01-06,2933.987793,2947.415575,2900.418067,2927.273812,48648450
3,2020-01-07,2954.129395,2960.843375,2940.701613,2960.843375,114344885
4,2020-01-08,2940.701416,2954.129196,2913.845675,2940.701416,188929583


## **3. Feature Engineering**

**3.1 Moving Average**

In [4]:
# Simple moving averages of the closing price; each output column is
# NaN until its window has been filled (rolling's default min_periods).
for ma_column, ma_window in {"MA7": 7, "MA14": 14, "MA30": 30}.items():
    df_clean[ma_column] = df_clean["close"].rolling(window=ma_window).mean()

**3.2 Return**

In [5]:
# Row-over-row percentage change of the closing price (first row is NaN)
close_prices = df_clean["close"]
df_clean["returns"] = close_prices.pct_change()

**3.3 RSI**

In [6]:
def compute_rsi(series, period=14):
    """Compute the Relative Strength Index using simple rolling means.

    Parameters
    ----------
    series : pandas.Series
        Price series (e.g. closing prices) in chronological order.
    period : int, default 14
        Number of consecutive price changes averaged for the gain and
        loss components.

    Returns
    -------
    pandas.Series
        RSI values in the range [0, 100], carrying the same index as
        ``series``. The first ``period`` entries are NaN because
        ``period`` price changes are needed before an average exists.
        A window with no losses yields 100; a window with neither gains
        nor losses yields NaN (0 / 0).
    """
    delta = series.diff()

    # Split the changes into gains and losses with index-preserving pandas
    # operations. Wrapping numpy arrays in a new Series would reset the
    # index to 0..n-1 and misalign the result when it is assigned back to
    # a frame whose index is not the default range.
    # clip() also keeps the leading NaN from diff() as NaN, so it is not
    # counted as a "zero change" inside the first rolling window.
    gain = delta.clip(lower=0)
    loss = delta.clip(upper=0).abs()

    avg_gain = gain.rolling(period).mean()
    avg_loss = loss.rolling(period).mean()

    # avg_loss == 0 gives rs == inf, which maps to an RSI of exactly 100
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

# RSI of the closing price with compute_rsi's default period
rsi_input = df_clean["close"]
df_clean["RSI14"] = compute_rsi(rsi_input)

**3.4 MACD**

In [7]:
# Exponential moving averages of the closing price (spans 12 and 26)
close_series = df_clean["close"]
ema12 = close_series.ewm(span=12, adjust=False).mean()
ema26 = close_series.ewm(span=26, adjust=False).mean()

# MACD line = EMA(12) - EMA(26); signal line = EMA(9) of the MACD line;
# histogram = MACD line - signal line
macd_line = ema12 - ema26
macd_signal_line = macd_line.ewm(span=9, adjust=False).mean()

df_clean["MACD"] = macd_line
df_clean["MACD_signal"] = macd_signal_line
df_clean["MACD_hist"] = macd_line - macd_signal_line

**3.5 Lag Features**

In [8]:
# Closing price from 1, 3 and 7 rows earlier; the leading rows of each
# column are NaN because no earlier row exists to shift from.
for lag_column, lag_steps in {"lag1": 1, "lag3": 3, "lag7": 7}.items():
    df_clean[lag_column] = df_clean["close"].shift(lag_steps)

**Final Prep**

In [10]:
# Drop every row that still contains a NaN (the warm-up rows of the
# rolling, lagged and pct_change features), then renumber the index.
rows_without_nan = df_clean.dropna()
df_prepared = rows_without_nan.reset_index(drop=True)

# Persist the feature table and preview it
df_prepared.to_csv("df_prepared_bbri.csv", index=False)
df_prepared.head()

Unnamed: 0,date,close,high,low,open,volume,MA7,MA14,MA30,returns,RSI14,MACD,MACD_signal,MACD_hist,lag1,lag3,lag7
0,2020-02-12,3041.410889,3061.552472,3034.696907,3034.696907,143167912,3046.206159,3063.950091,3050.809945,0.004435,27.659649,2.543305,13.156758,-10.613452,3027.982666,3054.838135,2994.413086
1,2020-02-13,3068.266113,3081.694075,3048.124532,3074.980094,87683284,3047.165283,3055.797468,3054.390706,0.00883,33.33331,3.724548,11.270316,-7.545768,3041.410889,3021.268799,3061.552246
2,2020-02-14,3054.838135,3081.694058,3041.410534,3041.410534,139707803,3046.206124,3051.001796,3057.300081,-0.004376,38.636324,3.536401,9.723533,-6.187132,3068.266113,3027.982666,3061.552246
3,2020-02-17,3001.126953,3048.12464,3001.126953,3007.840934,131738165,3038.533098,3043.808315,3059.538053,-0.017582,34.693868,-0.935962,7.591634,-8.527596,3054.838135,3041.410889,3054.838135
4,2020-02-18,2954.129395,3021.268662,2954.129395,3001.1269,97565635,3024.146136,3033.257865,3059.538053,-0.01566,30.357121,-8.178377,4.437632,-12.616009,3001.126953,3068.266113,3054.838135


## **4. Split (Train / Validation / Test)**

In [11]:
# Positional split in row order (no shuffling): the first 70% of rows go
# to train, the next 15% to validation, and whatever remains to test.
n_rows = len(df_prepared)
train_size = int(n_rows*0.7)
val_size = int(n_rows*0.15)
val_end = train_size + val_size

train = df_prepared.iloc[:train_size]
val = df_prepared.iloc[train_size:val_end]
test = df_prepared.iloc[val_end:]

# Write each partition to its own CSV file
for split_frame, split_path in (
    (train, "train_bbri.csv"),
    (val, "val_bbri.csv"),
    (test, "test_bbri.csv"),
):
    split_frame.to_csv(split_path, index=False)