In [1]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, cross_val_score, KFold 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from feature_engine.outliers import Winsorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
# Data Loading
df = pd.read_csv('clean_fnb_data.csv')

In [3]:
df

Unnamed: 0,emiten_code,year,date,eps,per,pbv,bvps,ebitda,roa,roe,fcf,fcf_ps,gdp_forecast,political_stability,inflation_forecast,return
0,ADES,2024,Q3,196.76,51.20,2.86,3525.89,1.498380e+11,4.69,5.58,-8.470000e+09,-14.36,5.0,17.1,2.5,-5.687204
1,ADES,2024,Q2,173.98,60.78,3.18,3329.13,1.320190e+11,4.45,5.23,3.963500e+10,67.19,5.0,17.1,2.5,15.300546
2,ADES,2024,Q1,222.76,40.85,2.88,3155.15,1.662250e+11,5.81,7.06,7.798300e+10,132.20,5.0,17.1,2.5,-5.426357
3,ADES,2023,Q4,176.16,54.92,3.30,2932.39,1.398630e+11,4.98,6.01,1.156430e+11,196.04,4.9,17.5,3.0,-7.857143
4,ADES,2023,Q3,180.55,58.85,3.85,2757.63,1.372790e+11,5.49,6.55,1.074140e+11,182.09,4.9,17.5,3.0,24.629080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814,UNVR,2016,Q1,205.77,208.61,51.20,838.45,0.000000e+00,9.43,24.54,1.053940e+12,138.13,5.1,21.4,4.0,16.216216
815,UNVR,2015,Q4,218.69,169.19,58.48,632.68,0.000000e+00,10.61,34.57,2.200957e+12,288.46,5.2,17.1,4.0,-2.631579
816,UNVR,2015,Q3,164.16,231.48,49.88,761.78,0.000000e+00,7.84,21.55,6.868200e+11,90.02,5.2,17.1,4.0,-4.761905
817,UNVR,2015,Q2,175.48,225.10,66.10,597.62,0.000000e+00,8.12,29.36,1.151308e+12,150.89,5.2,17.1,4.0,1.785714


In [4]:
# Uji Korelasi Kendall Tau fitur kategorikal
kendall_corr = df["date"].corr(df["return"], method="kendall")
print('Nilai Uji Korelasi Kendall Tau "date" : ', kendall_corr)

Nilai Uji Korelasi Kendall Tau "date" :  0.022569288384538398


In [5]:
# Uji Korelasi Pearson fitur numerikal
pearson_corr = df["eps"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "eps"                 : ', pearson_corr)
pearson_corr = df["per"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "per"                 : ', pearson_corr)
pearson_corr = df["pbv"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "pbv"                 : ', pearson_corr)
pearson_corr = df["bvps"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "bvps"                : ', pearson_corr)
pearson_corr = df["ebitda"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "ebitda"              : ', pearson_corr)
pearson_corr = df["roa"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "roa"                 : ', pearson_corr)
pearson_corr = df["roe"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "roe"                 : ', pearson_corr)
pearson_corr = df["fcf"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "fcf"                 : ', pearson_corr)
pearson_corr = df["fcf_ps"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "fcf_ps"              : ', pearson_corr)
pearson_corr = df["gdp_forecast"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "gdp_forecast"        : ', pearson_corr)
pearson_corr = df["political_stability"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "political_stability" : ', pearson_corr)
pearson_corr = df["inflation_forecast"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "inflation_forecast"  : ', pearson_corr)

Nilai Uji Korelasi Pearson "eps"                 :  0.01398432614094401
Nilai Uji Korelasi Pearson "per"                 :  0.0229013058413284
Nilai Uji Korelasi Pearson "pbv"                 :  0.00017760438345048414
Nilai Uji Korelasi Pearson "bvps"                :  -0.017062703191277454
Nilai Uji Korelasi Pearson "ebitda"              :  -0.027068489739256945
Nilai Uji Korelasi Pearson "roa"                 :  0.0009826089331792993
Nilai Uji Korelasi Pearson "roe"                 :  0.00013401714905512634
Nilai Uji Korelasi Pearson "fcf"                 :  -0.015778752063766267
Nilai Uji Korelasi Pearson "fcf_ps"              :  -0.020439812632405775
Nilai Uji Korelasi Pearson "gdp_forecast"        :  -0.006853863789062535
Nilai Uji Korelasi Pearson "political_stability" :  0.051167649940864826
Nilai Uji Korelasi Pearson "inflation_forecast"  :  0.04157307986436761


In [6]:
# Split label dan fitur yang akan digunakan dalam pembuatan model
X = df.drop(columns=['emiten_code', 'year'])
y = df['return']

In [7]:
# Split data train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)
print('Train Size: ', X_train.shape)
print('Test Size: ', X_test.shape)

Train Size:  (655, 14)
Test Size:  (164, 14)


In [8]:
# Uji Distribusi Normalitas dengan Uji Skewness
print('Nilai Normalitas Skewness eps  : ', X_train['eps'].skew())
print('Nilai Normalitas Skewness per   : ', X_train['per'].skew())
print('Nilai Normalitas Skewness pbv  : ', X_train['pbv'].skew())
print('Nilai Normalitas Skewness bvps : ', X_train['bvps'].skew())
print('Nilai Normalitas Skewness ebitda : ', X_train['ebitda'].skew())
print('Nilai Normalitas Skewness roa : ', X_train['roa'].skew())
print('Nilai Normalitas Skewness roe : ', X_train['roe'].skew())
print('Nilai Normalitas Skewness fcf : ', X_train['fcf'].skew())
print('Nilai Normalitas Skewness fcf_ps : ', X_train['fcf_ps'].skew())
print('Nilai Normalitas Skewness gdp_forecast : ', X_train['gdp_forecast'].skew())
print('Nilai Normalitas Skewness political_stability : ', X_train['political_stability'].skew())
print('Nilai Normalitas Skewness inflation_forecast : ', X_train['inflation_forecast'].skew())

Nilai Normalitas Skewness eps  :  -2.069672606499073
Nilai Normalitas Skewness per   :  4.10403244488166
Nilai Normalitas Skewness pbv  :  18.058356599712308
Nilai Normalitas Skewness bvps :  13.468256136458558
Nilai Normalitas Skewness ebitda :  3.344397212283065
Nilai Normalitas Skewness roa :  24.96038755102863
Nilai Normalitas Skewness roe :  18.577253179515594
Nilai Normalitas Skewness fcf :  2.5165792439413566
Nilai Normalitas Skewness fcf_ps :  10.453147874453505
Nilai Normalitas Skewness gdp_forecast :  -2.39154631593233
Nilai Normalitas Skewness political_stability :  1.1391031309269664
Nilai Normalitas Skewness inflation_forecast :  0.0401874773357774


In [9]:
# Handling outlier dengan Winsorizer
skew = ['eps', 'per', 'pbv', 'bvps', 'ebitda',
       'roa', 'roe', 'fcf', 'fcf_ps', 'gdp_forecast', 'political_stability', 'return']
winsorizer_1 = Winsorizer(capping_method='iqr',
                        tail='both',
                        fold=3,
                        variables=skew,
                        missing_values='ignore')

X_train_capped = winsorizer_1.fit_transform(X_train)
X_test_capped = winsorizer_1.transform(X_test)

In [10]:
# Membagi fitur kategorikal dan numerikal
categorical_columns = ['date']
numerical_columns = ['eps', 'per', 'pbv', 'bvps', 'ebitda', 'roa', 'roe', 'fcf',
       'fcf_ps', 'gdp_forecast', 'political_stability', 'inflation_forecast']

# Feature transformation dengan menggunakan robustscaler dan OHE
preprocess = ColumnTransformer([
    ("num", RobustScaler(), numerical_columns),  # Standardisasi fitur numerik
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns)  # Encoding fitur kategorikal
])

In [11]:
# Membuat pipeline preprocess untuk setiap algoritma regresi
pipe_linreg = make_pipeline(preprocess, LinearRegression())
pipe_ridge = make_pipeline(preprocess, Ridge())
pipe_lasso = make_pipeline(preprocess, Lasso())
pipe_knn = make_pipeline(preprocess, KNeighborsRegressor())
pipe_svr = make_pipeline(preprocess, SVR())
pipe_dt = make_pipeline(preprocess, DecisionTreeRegressor())
pipe_rf = make_pipeline(preprocess, RandomForestRegressor())
pipe_xgb = make_pipeline(preprocess, XGBRegressor())

In [12]:
# Membuat nama untuk masing-masing pipeline
pipelines = {
    "Linear Regression": pipe_linreg,
    "Ridge Regression": pipe_ridge,
    "Lasso Regression": pipe_lasso,
    "KNN Regressor": pipe_knn,
    "SVR": pipe_svr,
    "Decision Tree": pipe_dt,
    "Random Forest": pipe_rf,
    "XGBoost": pipe_xgb
}

# Looping training untuk masing-masing pipeline
for name, model in pipelines.items():
    model.fit(X_train, y_train)
    print(f"{name} DONE")

Linear Regression DONE
Ridge Regression DONE
Lasso Regression DONE
KNN Regressor DONE
SVR DONE
Decision Tree DONE
Random Forest DONE
XGBoost DONE


In [13]:
# Setting cross validation untuk masing-masing model pipeline
kfold = KFold(n_splits = 5)

cv_linreg_model = cross_val_score(pipe_linreg, X_train_capped, y_train, cv=kfold, 
                                  scoring="neg_mean_absolute_error")
cv_ridge_model = cross_val_score(pipe_ridge, X_train_capped, y_train, cv=kfold, 
                                 scoring="neg_mean_absolute_error")
cv_lasso_model = cross_val_score(pipe_lasso, X_train_capped, y_train, cv=kfold, 
                                 scoring="neg_mean_absolute_error")
cv_knn_model = cross_val_score(pipe_knn, X_train_capped, y_train, cv=kfold, 
                               scoring="neg_mean_absolute_error")
cv_svr_model = cross_val_score(pipe_svr, X_train_capped, y_train, cv=kfold, 
                               scoring="neg_mean_absolute_error")
cv_dt_model = cross_val_score(pipe_dt, X_train_capped, y_train, cv=kfold, 
                              scoring="neg_mean_absolute_error")
cv_rf_model = cross_val_score(pipe_rf, X_train_capped, y_train, cv=kfold, 
                              scoring="neg_mean_absolute_error")
cv_xgb_model = cross_val_score(pipe_xgb, X_train_capped, y_train, cv=kfold,
                               scoring="neg_mean_absolute_error")

In [14]:
# Membuat variabel baru model dan nama model untuk evaluasi model
models = [cv_linreg_model, cv_ridge_model, cv_lasso_model, cv_knn_model]

names = ['linreg_model', 'ridge_model', 'lasso_model', 'knn_model']

# Looping untuk mendapatkan hasil evaluasi model
for model, name in zip(models, names):
    mean_mae = -model.mean()  # Memastikan nialinya tidak negatif karena menggunakan MAE 
    std_mae = model.std()

    print(f"{name}:")
    print(f"  MAE - All - Cross Validation : {model}")
    print(f"  MAE - Mean - Cross Validation : {mean_mae:.4f}")
    print(f"  MAE - std - Cross Validation : {std_mae:.4f}")
    print(f"  MAE - Range of Test Set : {mean_mae - std_mae:.4f} - {mean_mae + std_mae:.4f}")
    print("-" * 50)

linreg_model:
  MAE - All - Cross Validation : [-12.98191939 -12.06933127 -11.37447806 -11.68438027 -14.79811075]
  MAE - Mean - Cross Validation : 12.5816
  MAE - std - Cross Validation : 1.2327
  MAE - Range of Test Set : 11.3489 - 13.8143
--------------------------------------------------
ridge_model:
  MAE - All - Cross Validation : [-12.97632802 -12.06714458 -11.37252846 -11.67986077 -14.79059216]
  MAE - Mean - Cross Validation : 12.5773
  MAE - std - Cross Validation : 1.2309
  MAE - Range of Test Set : 11.3464 - 13.8081
--------------------------------------------------
lasso_model:
  MAE - All - Cross Validation : [-12.3588975  -11.70203917 -11.24842996 -11.17824332 -14.32144678]
  MAE - Mean - Cross Validation : 12.1618
  MAE - std - Cross Validation : 1.1590
  MAE - Range of Test Set : 11.0029 - 13.3208
--------------------------------------------------
knn_model:
  MAE - All - Cross Validation : [-13.59259552 -13.83739259 -12.58889279 -13.79243326 -15.04754656]
  MAE - Mean

In [15]:
# Membuat variabel baru model dan nama model untuk evaluasi model
models = [cv_svr_model, cv_dt_model, cv_rf_model, cv_xgb_model]
names = ['svr_model', 'dt_model', 'rf_model', 'xgb_model']

# Looping untuk mendapatkan hasil evaluasi model
for model, name in zip(models, names):
    mean_mae = -model.mean()  # Memastikan nialinya tidak negatif karena menggunakan MAE 
    std_mae = model.std()

    print(f"{name}:")
    print(f"  MAE - All - Cross Validation : {model}")
    print(f"  MAE - Mean - Cross Validation : {mean_mae:.4f}")
    print(f"  MAE - std - Cross Validation : {std_mae:.4f}")
    print(f"  MAE - Range of Test Set : {mean_mae - std_mae:.4f} - {mean_mae + std_mae:.4f}")
    print("-" * 50)

svr_model:
  MAE - All - Cross Validation : [-12.29405522 -11.62355252 -10.96309434 -11.28215831 -13.98642963]
  MAE - Mean - Cross Validation : 12.0299
  MAE - std - Cross Validation : 1.0733
  MAE - Range of Test Set : 10.9565 - 13.1032
--------------------------------------------------
dt_model:
  MAE - All - Cross Validation : [-20.05429832 -20.88486246 -17.55338854 -17.77679942 -18.91081266]
  MAE - Mean - Cross Validation : 19.0360
  MAE - std - Cross Validation : 1.2849
  MAE - Range of Test Set : 17.7511 - 20.3209
--------------------------------------------------
rf_model:
  MAE - All - Cross Validation : [-13.39629699 -13.21323013 -12.70475534 -12.20620888 -14.73417749]
  MAE - Mean - Cross Validation : 13.2509
  MAE - std - Cross Validation : 0.8500
  MAE - Range of Test Set : 12.4010 - 14.1009
--------------------------------------------------
xgb_model:
  MAE - All - Cross Validation : [-15.3942474  -14.35505467 -13.77470042 -12.84789676 -15.67374608]
  MAE - Mean - Cross 

In [16]:
# Mendapatkan hasil prediksi untuk data train dan test
y_pred_train = pipe_svr.predict(X_train_capped)
y_pred_test = pipe_svr.predict(X_test)

# Evaluasi Model - Train
print("Train Set Evaluation")
print('MAE :', mean_absolute_error(y_train, y_pred_train))
print('MSE :', mean_squared_error(y_train, y_pred_train))
print('R² Score:', r2_score(y_train, y_pred_train))
print("=" * 50)

# Evaluasi Model - Test
print("Test Set Evaluation")
print('MAE :', mean_absolute_error(y_test, y_pred_test))
print('MSE :', mean_squared_error(y_test, y_pred_test))
print('R² Score:', r2_score(y_test, y_pred_test))


Train Set Evaluation
MAE : 11.95625460709745
MSE : 411.1451282084271
R² Score: -0.003677390493856736
Test Set Evaluation
MAE : 14.844313996846061
MSE : 1333.8020558380726
R² Score: -0.005694234785030616


In [17]:
# Membuat parameter SVR yang akan digunakan dalam RandomSearch
param_svr = {
    'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Jenis kernel
    'svr__C': [0.1, 1, 10, 100],  # Regularisasi
    'svr__epsilon': [0.01, 0.1, 0.5, 1],  # Margin toleransi
    'svr__gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Parameter untuk kernel 'rbf', 'poly', dan 'sigmoid'
    'svr__degree': [2, 3, 4],  # Hanya untuk kernel 'poly'
}
param_svr

{'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
 'svr__C': [0.1, 1, 10, 100],
 'svr__epsilon': [0.01, 0.1, 0.5, 1],
 'svr__gamma': ['scale', 'auto', 0.01, 0.1, 1],
 'svr__degree': [2, 3, 4]}

In [18]:
# Mengatur RandomSearchCV untuk SVR
random_search_svr = RandomizedSearchCV(pipe_svr,
                                      param_distributions=param_svr,
                                      scoring='neg_mean_absolute_error',
                                      cv=kfold,
                                      verbose=2,
                                      n_jobs=-1,
                                      random_state=42
                                      )

# Fit RandomSearchCV untuk SVR
random_search_svr.fit(X_train_capped, y_train)

# Mencari hyperparameter terbaik untuk SVR
print('Best hyperparameters for SVR :', random_search_svr.best_params_)

# Mencari nilai MAE terbaik untuk SVR
print('Best MAE for SVR :', -random_search_svr.best_score_)

# Menyimpan model terbaik kedalam variabel
best_model = random_search_svr.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters for SVR : {'svr__kernel': 'sigmoid', 'svr__gamma': 0.1, 'svr__epsilon': 0.5, 'svr__degree': 4, 'svr__C': 0.1}
Best MAE for SVR : 11.957308934913138


In [19]:
# Mendapatkan hasil prediksi untuk data train dan test
y_pred_train_tuned = best_model.predict(X_train_capped)
y_pred_test_tuned = best_model.predict(X_test)

# Evaluasi Model - Train
print("Train Set Evaluation After Tuning")
print('MAE :', mean_absolute_error(y_train, y_pred_train_tuned))
print('MSE :', mean_squared_error(y_train, y_pred_train_tuned))
print('R² Score:', r2_score(y_train, y_pred_train_tuned))
print("=" * 50)

# Evaluasi Model - Test
print("Test Set Evaluation After Tuning")
print('MAE :', mean_absolute_error(y_test, y_pred_test_tuned))
print('MSE :', mean_squared_error(y_test, y_pred_test_tuned))
print('R² Score:', r2_score(y_test, y_pred_test_tuned))

Train Set Evaluation After Tuning
MAE : 11.98043661854867
MSE : 409.68545753627967
R² Score: -0.00011407829413490056
Test Set Evaluation After Tuning
MAE : 14.912853814923537
MSE : 1341.945015763789
R² Score: -0.011834072263563034


In [20]:
# Saving model
with open('best_fnb_svr_model.pkl', 'wb') as model_file:
  pickle.dump(best_model, model_file)