In [1]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
import json

from sklearn.model_selection import train_test_split, cross_val_score, KFold 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from feature_engine.outliers import Winsorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Data loading
df = pd.read_csv('clean_financial_data.csv')

In [3]:
df

Unnamed: 0,emiten_code,year,date,eps,per,pbv,bvps,ebitda,roa,roe,fcf,fcf_ps,gdp_forecast,political_stability,inflation_forecast,return
0,AGRO,2024,Q3,0.56,478.57,1.92,139.72,-3.264659e+10,0.11,0.40,-1.862482e+11,-7.53,5.0,17.1,2.5,19.469027
1,AGRO,2024,Q2,0.44,513.64,1.63,138.28,4.654269e+10,0.08,0.32,3.193126e+11,12.91,5.0,17.1,2.5,-22.602740
2,AGRO,2024,Q1,0.37,783.78,2.09,138.52,3.736836e+10,0.07,0.27,-2.206736e+11,-8.92,5.0,17.1,2.5,-5.806452
3,AGRO,2023,Q4,0.39,794.87,2.24,138.25,7.749860e+08,0.08,0.28,5.784586e+11,23.38,4.9,17.5,3.0,1.973684
4,AGRO,2023,Q3,0.22,1381.82,2.21,137.48,2.462051e+10,0.05,0.16,-4.312402e+11,-17.43,4.9,17.5,3.0,-20.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1438,SDRA,2016,Q1,9.64,123.34,1.48,805.96,0.000000e+00,0.24,1.20,1.312118e+12,251.78,5.1,21.4,4.0,9.090647
1439,SDRA,2015,Q4,7.99,136.30,1.37,793.64,0.000000e+00,0.21,1.01,2.212736e+12,424.60,5.2,17.1,4.0,-7.562988
1440,SDRA,2015,Q3,9.30,126.77,1.51,781.05,0.000000e+00,0.25,1.19,5.498790e+11,105.52,5.2,17.1,4.0,-0.418310
1441,SDRA,2015,Q2,14.24,83.15,1.52,777.13,0.000000e+00,0.41,1.83,-1.703234e+12,-326.83,5.2,17.1,4.0,3.913027


In [4]:
# Uji Korelasi Kendall Tau fitur kategorikal
kendall_corr = df["date"].corr(df["return"], method="kendall")
print('Nilai Uji Korelasi Kendall Tau "date" : ', kendall_corr)

Nilai Uji Korelasi Kendall Tau "date" :  0.06338270365470382


In [5]:
# Uji Korelasi Kendall Tau fitur numerikal
pearson_corr = df["eps"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "eps"                 : ', pearson_corr)
pearson_corr = df["per"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "per"                 : ', pearson_corr)
pearson_corr = df["pbv"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "pbv"                 : ', pearson_corr)
pearson_corr = df["bvps"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "bvps"                : ', pearson_corr)
pearson_corr = df["ebitda"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "ebitda"              : ', pearson_corr)
pearson_corr = df["roa"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "roa"                 : ', pearson_corr)
pearson_corr = df["roe"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "roe"                 : ', pearson_corr)
pearson_corr = df["fcf"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "fcf"                 : ', pearson_corr)
pearson_corr = df["fcf_ps"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "fcf_ps"              : ', pearson_corr)
pearson_corr = df["gdp_forecast"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "gdp_forecast"        : ', pearson_corr)
pearson_corr = df["political_stability"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "political_stability" : ', pearson_corr)
pearson_corr = df["inflation_forecast"].corr(df["return"], method="pearson")
print('Nilai Uji Korelasi Pearson "inflation_forecast"  : ', pearson_corr)


Nilai Uji Korelasi Pearson "eps"                 :  0.008373430120997252
Nilai Uji Korelasi Pearson "per"                 :  -0.0030479092585721547
Nilai Uji Korelasi Pearson "pbv"                 :  0.006748924397411563
Nilai Uji Korelasi Pearson "bvps"                :  -0.009784192033968043
Nilai Uji Korelasi Pearson "ebitda"              :  -0.014699433008967953
Nilai Uji Korelasi Pearson "roa"                 :  0.006973208205588449
Nilai Uji Korelasi Pearson "roe"                 :  0.016771124727668195
Nilai Uji Korelasi Pearson "fcf"                 :  0.002539784912515581
Nilai Uji Korelasi Pearson "fcf_ps"              :  0.07579499099629268
Nilai Uji Korelasi Pearson "gdp_forecast"        :  -0.04460734326540409
Nilai Uji Korelasi Pearson "political_stability" :  0.062325801758398865
Nilai Uji Korelasi Pearson "inflation_forecast"  :  0.04179958496479531


In [6]:
# Split label dan fitur yang akan digunakan dalam pembuatan model
X = df.drop(columns=['emiten_code', 'year'])
y = df['return']

In [7]:
# Split data train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)
print('Train Size: ', X_train.shape)
print('Test Size: ', X_test.shape)

Train Size:  (1154, 14)
Test Size:  (289, 14)


In [8]:
# Uji Distribusi Normalitas dengan Uji Skewness
print('Nilai Normalitas Skewness eps  : ', X_train['eps'].skew())
print('Nilai Normalitas Skewness per   : ', X_train['per'].skew())
print('Nilai Normalitas Skewness pbv  : ', X_train['pbv'].skew())
print('Nilai Normalitas Skewness bvps : ', X_train['bvps'].skew())
print('Nilai Normalitas Skewness ebitda : ', X_train['ebitda'].skew())
print('Nilai Normalitas Skewness roa : ', X_train['roa'].skew())
print('Nilai Normalitas Skewness roe : ', X_train['roe'].skew())
print('Nilai Normalitas Skewness fcf : ', X_train['fcf'].skew())
print('Nilai Normalitas Skewness fcf_ps : ', X_train['fcf_ps'].skew())
print('Nilai Normalitas Skewness gdp_forecast : ', X_train['gdp_forecast'].skew())
print('Nilai Normalitas Skewness political_stability : ', X_train['political_stability'].skew())
print('Nilai Normalitas Skewness inflation_forecast : ', X_train['inflation_forecast'].skew())

Nilai Normalitas Skewness eps  :  -19.62240112863765
Nilai Normalitas Skewness per   :  33.97031345614334
Nilai Normalitas Skewness pbv  :  17.41468742136172
Nilai Normalitas Skewness bvps :  11.74462265005161
Nilai Normalitas Skewness ebitda :  4.214567415380729
Nilai Normalitas Skewness roa :  -7.258436333866704
Nilai Normalitas Skewness roe :  -19.91874116514267
Nilai Normalitas Skewness fcf :  2.1358239901181593
Nilai Normalitas Skewness fcf_ps :  -21.942704517254676
Nilai Normalitas Skewness gdp_forecast :  -2.4185861595068263
Nilai Normalitas Skewness political_stability :  1.144354533591244
Nilai Normalitas Skewness inflation_forecast :  0.047539968645291655


In [9]:
# Handling outlier dengan Winsorizer
skew = ['eps', 'per', 'pbv', 'bvps', 'ebitda',
       'roa', 'roe', 'fcf', 'fcf_ps', 'gdp_forecast', 'political_stability']
winsorizer_1 = Winsorizer(capping_method='iqr',
                        tail='both',
                        fold=3,
                        variables=skew,
                        missing_values='ignore')

X_train_capped = winsorizer_1.fit_transform(X_train)
X_test_capped = winsorizer_1.transform(X_test)

In [10]:
# Membagi fitur kategorikal dan numerikal
categorical_columns = ['date']
numerical_columns = ['eps', 'per', 'pbv', 'bvps', 'ebitda', 'roa', 'roe', 'fcf',
       'fcf_ps', 'gdp_forecast', 'political_stability', 'inflation_forecast']

# Feature transformation dengan menggunakan robustscaler dan OHE
preprocess = ColumnTransformer([
    ("num", RobustScaler(), numerical_columns),  # Standardisasi fitur numerik
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns)
])

In [11]:
# Membuat pipeline preprocess untuk setiap algoritma regresi
pipe_linreg = make_pipeline(preprocess, LinearRegression())
pipe_ridge = make_pipeline(preprocess, Ridge())
pipe_lasso = make_pipeline(preprocess, Lasso())
pipe_knn = make_pipeline(preprocess, KNeighborsRegressor())
pipe_svr = make_pipeline(preprocess, SVR())
pipe_dt = make_pipeline(preprocess, DecisionTreeRegressor())
pipe_rf = make_pipeline(preprocess, RandomForestRegressor())
pipe_xgb = make_pipeline(preprocess, XGBRegressor())


In [12]:
# Membuat nama untuk masing-masing pipeline
pipelines = {
    "Linear Regression": pipe_linreg,
    "Ridge Regression": pipe_ridge,
    "Lasso Regression": pipe_lasso,
    "KNN Regressor": pipe_knn,
    "SVR": pipe_svr,
    "Decision Tree": pipe_dt,
    "Random Forest": pipe_rf,
    "XGBoost": pipe_xgb
}

# Looping training untuk masing-masing pipeline
for name, model in pipelines.items():
    model.fit(X_train, y_train)
    print(f"{name} DONE")


Linear Regression DONE
Ridge Regression DONE
Lasso Regression DONE
KNN Regressor DONE
SVR DONE


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Decision Tree DONE
Random Forest DONE
XGBoost DONE


In [13]:
# Setting cross validation untuk masing-masing model pipeline
kfold = KFold(n_splits = 5, shuffle=True, random_state=42)

cv_linreg_model = cross_val_score(pipe_linreg, X_train_capped, y_train, cv=kfold, 
                                  scoring="neg_mean_absolute_error")
cv_ridge_model = cross_val_score(pipe_ridge, X_train_capped, y_train, cv=kfold, 
                                 scoring="neg_mean_absolute_error")
cv_lasso_model = cross_val_score(pipe_lasso, X_train_capped, y_train, cv=kfold, 
                                 scoring="neg_mean_absolute_error")
cv_knn_model = cross_val_score(pipe_knn, X_train_capped, y_train, cv=kfold, 
                               scoring="neg_mean_absolute_error")
cv_svr_model = cross_val_score(pipe_svr, X_train_capped, y_train, cv=kfold, 
                               scoring="neg_mean_absolute_error")
cv_dt_model = cross_val_score(pipe_dt, X_train_capped, y_train, cv=kfold, 
                              scoring="neg_mean_absolute_error")
cv_rf_model = cross_val_score(pipe_rf, X_train_capped, y_train, cv=kfold, 
                              scoring="neg_mean_absolute_error")
cv_xgb_model = cross_val_score(pipe_xgb, X_train_capped, y_train, cv=kfold,
                               scoring="neg_mean_absolute_error")

In [14]:
# Membuat variabel baru model dan nama model untuk evaluasi model
models = [cv_linreg_model, cv_ridge_model, cv_lasso_model, cv_knn_model]
names = ['linreg_model', 'ridge_model', 'lasso_model', 'knn_model']

# Looping untuk mendapatkan hasil evaluasi model
for model, name in zip(models, names):
    mean_mae = -model.mean()  # Memastikan nialinya tidak negatif karena menggunakan MAE 
    std_mae = model.std()

    print(f"{name}:")
    print(f"  MAE - All - Cross Validation : {model}")
    print(f"  MAE - Mean - Cross Validation : {mean_mae:.4f}")
    print(f"  MAE - std - Cross Validation : {std_mae:.4f}")
    print(f"  MAE - Range of Test Set : {mean_mae - std_mae:.4f} - {mean_mae + std_mae:.4f}")
    print("-" * 50)

linreg_model:
  MAE - All - Cross Validation : [-22.42311917 -18.29160381 -16.80006508 -18.40272793 -16.20755667]
  MAE - Mean - Cross Validation : 18.4250
  MAE - std - Cross Validation : 2.1707
  MAE - Range of Test Set : 16.2543 - 20.5958
--------------------------------------------------
ridge_model:
  MAE - All - Cross Validation : [-22.41948985 -18.28746507 -16.79458124 -18.39838627 -16.19893014]
  MAE - Mean - Cross Validation : 18.4198
  MAE - std - Cross Validation : 2.1721
  MAE - Range of Test Set : 16.2477 - 20.5918
--------------------------------------------------
lasso_model:
  MAE - All - Cross Validation : [-21.98913156 -18.26684929 -16.22767268 -17.92252812 -15.64845434]
  MAE - Mean - Cross Validation : 18.0109
  MAE - std - Cross Validation : 2.2208
  MAE - Range of Test Set : 15.7901 - 20.2318
--------------------------------------------------
knn_model:
  MAE - All - Cross Validation : [-24.35147307 -20.44905977 -18.16771418 -19.41412913 -17.31298247]
  MAE - Mean

In [15]:
# Membuat variabel baru model dan nama model untuk evaluasi model
models = [cv_svr_model, cv_dt_model, cv_rf_model, cv_xgb_model]
names = ['svr_model', 'dt_model', 'rf_model', 'xgb_model']

# Looping untuk mendapatkan hasil evaluasi model
for model, name in zip(models, names):
    mean_mae = -model.mean()  # Memastikan nialinya tidak negatif karena menggunakan MAE 
    std_mae = model.std()

    print(f"{name}:")
    print(f"  MAE - All - Cross Validation : {model}")
    print(f"  MAE - Mean - Cross Validation : {mean_mae:.4f}")
    print(f"  MAE - std - Cross Validation : {std_mae:.4f}")
    print(f"  MAE - Range of Test Set : {mean_mae - std_mae:.4f} - {mean_mae + std_mae:.4f}")
    print("-" * 50)

svr_model:
  MAE - All - Cross Validation : [-21.58149928 -16.97119193 -14.3717285  -16.35937202 -12.34363199]
  MAE - Mean - Cross Validation : 16.3255
  MAE - std - Cross Validation : 3.0892
  MAE - Range of Test Set : 13.2363 - 19.4147
--------------------------------------------------
dt_model:
  MAE - All - Cross Validation : [-26.51374421 -25.59643326 -22.23874713 -20.23844274 -26.59438734]
  MAE - Mean - Cross Validation : 24.2364
  MAE - std - Cross Validation : 2.5523
  MAE - Range of Test Set : 21.6841 - 26.7886
--------------------------------------------------
rf_model:
  MAE - All - Cross Validation : [-21.17750402 -19.5932188  -16.44102658 -17.10571063 -16.52869811]
  MAE - Mean - Cross Validation : 18.1692
  MAE - std - Cross Validation : 1.8914
  MAE - Range of Test Set : 16.2779 - 20.0606
--------------------------------------------------
xgb_model:
  MAE - All - Cross Validation : [-22.94880755 -20.91380851 -17.29099318 -19.36932884 -18.75036972]
  MAE - Mean - Cross 

In [16]:
# Mendapatkan hasil prediksi untuk data train dan test
y_pred_train = pipe_svr.predict(X_train_capped)
y_pred_test = pipe_svr.predict(X_test)

# Evaluasi Model - Train
print("Train Set Evaluation")
print('MAE :', mean_absolute_error(y_train, y_pred_train))
print('MSE :', mean_squared_error(y_train, y_pred_train))
print('R² Score:', r2_score(y_train, y_pred_train))
print("=" * 50)

# Evaluasi Model - Test
print("Test Set Evaluation")
print('MAE :', mean_absolute_error(y_test, y_pred_test))
print('MSE :', mean_squared_error(y_test, y_pred_test))
print('R² Score:', r2_score(y_test, y_pred_test))


Train Set Evaluation
MAE : 16.590629822996714
MSE : 1327.491275595089
R² Score: -0.01333692606127923
Test Set Evaluation
MAE : 17.22907831026692
MSE : 3027.275979559826
R² Score: -0.01467911209849504


In [17]:
# Membuat parameter SVR yang akan digunakan dalam RandomSearch
param_svr = {
    'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Jenis kernel
    'svr__C': [0.1, 1, 10, 100],  # Regularisasi
    'svr__epsilon': [0.01, 0.1, 0.5, 1],  # Margin toleransi
    'svr__gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Parameter untuk kernel 'rbf', 'poly', dan 'sigmoid'
    'svr__degree': [2, 3, 4],  # Hanya untuk kernel 'poly'
}
param_svr

{'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
 'svr__C': [0.1, 1, 10, 100],
 'svr__epsilon': [0.01, 0.1, 0.5, 1],
 'svr__gamma': ['scale', 'auto', 0.01, 0.1, 1],
 'svr__degree': [2, 3, 4]}

In [18]:
# Mengatur RandomSearchCV untuk SVR
random_search_svr = RandomizedSearchCV(pipe_svr,
                                      param_distributions=param_svr,
                                      scoring='neg_mean_absolute_error',
                                      cv=kfold,
                                      verbose=2,
                                      n_jobs=-1,
                                      random_state=42
                                      )

# Fit RandomSearchCV untuk SVR
random_search_svr.fit(X_train_capped, y_train)

# Mencari hyperparameter terbaik untuk SVR
print('Best hyperparameters for SVR :', random_search_svr.best_params_)

# Mencari nilai MAE terbaik untuk SVR
print('Best MAE for SVR :', -random_search_svr.best_score_)

# Menyimpan model terbaik kedalam variabel
best_model = random_search_svr.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters for SVR : {'svr__kernel': 'rbf', 'svr__gamma': 0.01, 'svr__epsilon': 0.1, 'svr__degree': 3, 'svr__C': 1}
Best MAE for SVR : 16.36642716982707


In [19]:
# Mendapatkan hasil prediksi untuk data train dan test
y_pred_train_tuned = best_model.predict(X_train_capped)
y_pred_test_tuned = best_model.predict(X_test)

# Evaluasi Model - Train
print("Train Set Evaluation After Tuning")
print('MAE :', mean_absolute_error(y_train, y_pred_train_tuned))
print('MSE :', mean_squared_error(y_train, y_pred_train_tuned))
print('R² Score:', r2_score(y_train, y_pred_train_tuned))
print("=" * 50)

# Evaluasi Model - Test
print("Test Set Evaluation After Tuning")
print('MAE :', mean_absolute_error(y_test, y_pred_test_tuned))
print('MSE :', mean_squared_error(y_test, y_pred_test_tuned))
print('R² Score:', r2_score(y_test, y_pred_test_tuned))


Train Set Evaluation After Tuning
MAE : 16.218593625537846
MSE : 1305.2522892069644
R² Score: 0.003639144907743641
Test Set Evaluation After Tuning
MAE : 17.07458705048571
MSE : 3009.3642111323284
R² Score: -0.008675464790868448


In [20]:
# Menyimpan kolom numerikal
with open('list_cat_cols.txt', 'w') as file_1:
  json.dump(categorical_columns, file_1)
# Menyimpan kolom kategorikal yang belum di encode (encode melalui pipeline scaler)
with open('list_num_cols.txt', 'w') as file_2:
  json.dump(numerical_columns, file_2) 
# Menyimpan model terbaik setelah hyperparameter tuning
with open('best_finance_svr_model.pkl', 'wb') as model_file:
  pickle.dump(best_model, model_file)