## Load Data and preprocess

In [11]:
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold

# Load the raw dataset; "." is the decimal mark and "," the thousands separator.
df = pd.read_csv('Data/Dataset.csv', decimal =".", thousands=",")
# Remove the columns that are not fed to the models.
df = df.drop(columns=['Kota','Paslon 1','Paslon 2','Total'])

# Rescale every remaining column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before cross-validation,
# so every training fold sees min/max statistics of its own test fold. Consider
# fitting the scaler inside each fold if that leakage matters for the study.
scaler = MinMaxScaler()
df_minmax = pd.DataFrame(scaler.fit_transform(df.values), columns=df.columns, index=df.index)

# Features are all scaled columns except the last one; dfY is the last scaled column.
# NOTE(review): this split is positional and presumably relies on 'Partisipasi'
# being the last column -- TODO confirm against the CSV layout.
dfX, dfY = df_minmax.iloc[:, :-1], df_minmax.iloc[:, [-1]]

# 10-fold cross-validation repeated 3 times with a fixed seed, shared by every model.
kf = RepeatedKFold(n_splits=10, n_repeats = 3 , random_state=1)

# Target passed to the evaluate_* functions: the unscaled 'Partisipasi' column.
y = df['Partisipasi']


## Metrics

3 metrik akan digunakan untuk mengevaluasi model yang telah dibuat, yaitu:
1. Mean Absolute Error (MAE)
- MAE merupakan rata-rata dari selisih absolut antara nilai prediksi dengan nilai aktual. MAE merupakan metrik yang paling mudah diinterpretasikan karena memiliki satuan yang sama dengan variabel target. MAE yang semakin kecil menunjukkan model yang semakin baik.
2. Root Mean Squared Error (RMSE)
- RMSE merupakan akar kuadrat dari rata - rata dari selisih kuadrat antara nilai prediksi dengan nilai aktual. RMSE yang semakin kecil menunjukkan model yang semakin baik.
3. R-Squared (R2)
- R2 merupakan koefisien determinasi yang menunjukkan seberapa baik model yang dibuat dapat menjelaskan variabilitas dari data yang diamati. R2 yang semakin besar menunjukkan model yang semakin baik.

In [12]:
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

def error_rate(methods,preds,vals):
  """Score a set of predictions against the actual values.

  Args:
    methods: Label of the model being scored; returned unchanged.
    preds: Predicted values.
    vals: Actual values.

  Returns:
    Tuple ``(methods, rmse, mae, r2)``.
  """
  mse = mean_squared_error(vals, preds)
  scores = (
      math.sqrt(mse),                    # RMSE is the square root of the MSE
      mean_absolute_error(vals, preds),  # MAE
      r2_score(vals, preds),             # R2
  )
  return (methods, *scores)

## Modelling

Sebelum seleksi fitur, akan dilakukan pemodelan dengan menggunakan semua fitur yang ada. Pemodelan dilakukan terlebih dahulu untuk mencari model terbaik yang akan digunakan pada seleksi fitur.  
Pemodelan dilakukan dengan menggunakan 7 algoritma, yaitu:
1. Linear Regression
2. SVR
3. Decision Tree
4. Random Forest
5. XGBoost
6. LightGBM
7. CatBoost

## Flatten

Flatten merupakan sebuah fungsi utilitas untuk mengubah list bersarang (list yang berisi list/array hasil tiap fold) menjadi sebuah list 1 dimensi. Fungsi ini akan digunakan untuk menggabungkan hasil prediksi dan nilai aktual dari setiap fold menjadi satu list agar dapat digunakan pada metrik evaluasi.

In [13]:
def flatten(l):
    """Flatten one level of nesting: an iterable of iterables becomes a single list.

    Args:
        l: Iterable whose elements are themselves iterable.

    Returns:
        A new list with the items of every inner iterable, in order.
    """
    flat = []
    for chunk in l:
        flat.extend(chunk)
    return flat

## Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression
import pandas as pd

def evaluate_linreg(dfX,y,kf):
    """Cross-validate a LinearRegression model and score the pooled predictions.

    For every (train, test) split produced by ``kf.split(dfX)`` the model is
    refitted on the training rows and used to predict the test rows. Predictions
    and actual values of all splits are pooled and scored once, not per fold.

    Args:
        dfX: Feature DataFrame, indexed positionally with ``.iloc``.
        y: Target Series aligned row-for-row with ``dfX``.
        kf: Splitter object exposing ``split(dfX)``.

    Returns:
        The ``error_rate`` result, i.e. ``("Linear Regression", rmse, mae, r2)``.
    """
    model = LinearRegression()
    fold_predictions, fold_actuals = [], []

    for fit_rows, holdout_rows in kf.split(dfX):
        model.fit(dfX.iloc[fit_rows], y.iloc[fit_rows])
        fold_predictions.append(model.predict(dfX.iloc[holdout_rows]))
        fold_actuals.append(y.iloc[holdout_rows])

    pooled = pd.DataFrame({
        'Prediction': flatten(fold_predictions),
        'Actual Values': flatten(fold_actuals),
    })
    return error_rate("Linear Regression", pooled['Prediction'], pooled['Actual Values'])

evaluate_linreg(dfX, y, kf)

('Linear Regression',
 0.041151108859387615,
 0.03339395849373654,
 0.2803149971364153)

## SVR

In [17]:
from sklearn.svm import SVR

def evaluate_svr(dfX,y,kf):
    """Cross-validate an SVR model (library defaults) and score the pooled predictions.

    Args:
        dfX: Feature DataFrame, indexed positionally with ``.iloc``.
        y: Target Series aligned row-for-row with ``dfX``.
        kf: Splitter object exposing ``split(dfX)``.

    Returns:
        The ``error_rate`` result, i.e. ``("SVR", rmse, mae, r2)``.
    """
    # NOTE(review): SVR() is used with default hyperparameters; the default
    # epsilon tube may be wide relative to the spread of the target -- confirm
    # whether tuning epsilon/C is intended before comparing against other models.
    regressor = SVR()
    per_split = []  # one (predictions, actual values) pair per split

    for train_rows, test_rows in kf.split(dfX):
        regressor.fit(dfX.iloc[train_rows], y.iloc[train_rows])
        per_split.append((regressor.predict(dfX.iloc[test_rows]), y.iloc[test_rows]))

    predicted = flatten([pair[0] for pair in per_split])
    actual = flatten([pair[1] for pair in per_split])
    results = pd.DataFrame({'Prediction': predicted, 'Actual Values': actual})
    return error_rate("SVR", results['Prediction'], results['Actual Values'])

evaluate_svr(dfX, y, kf)

('SVR', 0.04773394043795352, 0.03884803361188745, 0.03164646558058193)

## Decision Tree

In [21]:
from sklearn.tree import DecisionTreeRegressor

def evaluate_dt(dfX,y,kf):
    """Cross-validate a DecisionTreeRegressor and score the pooled predictions.

    For every (train, test) split produced by ``kf.split(dfX)`` the tree is
    refitted on the training rows and used to predict the test rows. Predictions
    and actual values of all splits are pooled and scored once, not per fold.

    Args:
        dfX: Feature DataFrame, indexed positionally with ``.iloc``.
        y: Target Series aligned row-for-row with ``dfX``.
        kf: Splitter object exposing ``split(dfX)``.

    Returns:
        The ``error_rate`` result, i.e. ``("Decision Tree", rmse, mae, r2)``.
    """
    # A fixed seed makes the reported metrics reproducible between runs; without
    # it, ties between equally good splits are broken randomly. The value 0
    # matches the seed used for the Random Forest model in this notebook.
    dt_reg = DecisionTreeRegressor(random_state=0)
    vals = []
    preds = []

    for train_index, test_index in kf.split(dfX):
        X_train, X_test = dfX.iloc[train_index], dfX.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        dt_reg.fit(X_train, y_train)

        preds.append(dt_reg.predict(X_test))
        vals.append(y_test)

    dt_df = pd.DataFrame({'Prediction':flatten(preds),'Actual Values':flatten(vals)})
    return error_rate("Decision Tree",dt_df['Prediction'],dt_df['Actual Values'])
  
evaluate_dt(dfX,y,kf)


('Decision Tree',
 0.05523462757282909,
 0.04503630844654088,
 -0.2965886883579478)

In [22]:
from sklearn.ensemble import RandomForestRegressor

def evaluate_rf(dfX,y,kf):
    """Cross-validate a 100-tree RandomForestRegressor (seed 0) and score the pooled predictions.

    Args:
        dfX: Feature DataFrame, indexed positionally with ``.iloc``.
        y: Target Series aligned row-for-row with ``dfX``.
        kf: Splitter object exposing ``split(dfX)``.

    Returns:
        The ``error_rate`` result, i.e. ``("Random Forest", rmse, mae, r2)``.
    """
    forest = RandomForestRegressor(n_estimators= 100, random_state = 0)
    predicted_per_split = []
    actual_per_split = []

    for train_pos, test_pos in kf.split(dfX):
        features_fit = dfX.iloc[train_pos]
        target_fit = y.iloc[train_pos]
        features_eval = dfX.iloc[test_pos]
        target_eval = y.iloc[test_pos]

        # The same estimator object is refitted on every split.
        forest.fit(features_fit, target_fit)
        predicted_per_split.append(forest.predict(features_eval))
        actual_per_split.append(target_eval)

    columns = {
        'Prediction': flatten(predicted_per_split),
        'Actual Values': flatten(actual_per_split),
    }
    scored = pd.DataFrame(columns)
    return error_rate("Random Forest", scored['Prediction'], scored['Actual Values'])

evaluate_rf(dfX,y,kf)

('Random Forest',
 0.04015124271912773,
 0.03232011222889938,
 0.31486311058980954)

In [23]:
from xgboost import XGBRegressor

def evaluate_xgboost(dfX,y,kf):
    """Cross-validate an XGBRegressor (library defaults) and score the pooled predictions.

    Args:
        dfX: Feature DataFrame, indexed positionally with ``.iloc``.
        y: Target Series aligned row-for-row with ``dfX``.
        kf: Splitter object exposing ``split(dfX)``.

    Returns:
        The ``error_rate`` result, i.e. ``("XGBoost", rmse, mae, r2)``.
    """
    booster = XGBRegressor()
    outputs, targets = [], []

    for split in kf.split(dfX):
        train_part, test_part = split
        booster.fit(dfX.iloc[train_part], y.iloc[train_part])
        outputs.append(booster.predict(dfX.iloc[test_part]))
        targets.append(y.iloc[test_part])

    # Pool the per-split results into one table before scoring.
    table = pd.DataFrame({'Prediction': flatten(outputs), 'Actual Values': flatten(targets)})
    prediction_col = table['Prediction']
    actual_col = table['Actual Values']
    return error_rate("XGBoost", prediction_col, actual_col)

evaluate_xgboost(dfX,y,kf)

('XGBoost', 0.04671284884163465, 0.03846689642337487, 0.07263206290551683)

In [29]:
import lightgbm as lgb  

def evaluate_lightgbm(dfX,y,kf=None):
    """Cross-validate a LightGBM regressor and score the pooled predictions.

    For every (train, test) split a new booster is trained with ``lgb.train`` on
    the training rows and used to predict the test rows. Predictions and actual
    values of all splits are pooled and scored once, not per fold.

    Args:
        dfX: Feature DataFrame, indexed positionally with ``.iloc``.
        y: Target Series aligned row-for-row with ``dfX``.
        kf: Optional splitter object exposing ``split(dfX)``. When omitted, the
            notebook-level ``kf`` variable is used, which keeps existing
            two-argument calls working.

    Returns:
        The ``error_rate`` result, i.e. ``("LightGBM", rmse, mae, r2)``.
    """
    # Prefer an explicitly passed splitter, like the other evaluate_* functions;
    # fall back to the notebook-level one only for backward compatibility.
    splitter = kf if kf is not None else globals()['kf']

    params = {
        'task': 'train',
        'boosting': 'gbdt',
        'objective': 'regression',
        'num_leaves': 10,
        'learning_rate': 0.05,
        'metric': {'l2','l1'},
        'verbose': -1
    }

    vals = []
    preds = []

    for train_index, test_index in splitter.split(dfX):
        X_train, X_test = dfX.iloc[train_index], dfX.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_reg = lgb.train(params, train_set=lgb_train)

        preds.append(lgb_reg.predict(X_test))
        vals.append(y_test)

    lgb_df = pd.DataFrame({'Prediction':flatten(preds),'Actual Values':flatten(vals)})
    return error_rate("LightGBM",lgb_df['Prediction'],lgb_df['Actual Values'])
evaluate_lightgbm(dfX,y)

('LightGBM', 0.04281453667792947, 0.034679282965554566, 0.22095621803714005)

In [32]:
from catboost import CatBoostRegressor
from catboost import Pool

def evaluate_catboost(dfX,y,kf=None):
    """Cross-validate a CatBoostRegressor (library defaults) and score the pooled predictions.

    For every (train, test) split the model is refitted on a ``Pool`` built from
    the training rows and used to predict a ``Pool`` built from the test rows.
    Predictions and actual values of all splits are pooled and scored once,
    not per fold.

    Args:
        dfX: Feature DataFrame, indexed positionally with ``.iloc``.
        y: Target Series aligned row-for-row with ``dfX``.
        kf: Optional splitter object exposing ``split(dfX)``. When omitted, the
            notebook-level ``kf`` variable is used, which keeps existing
            two-argument calls working.

    Returns:
        The ``error_rate`` result, i.e. ``("Catboost", rmse, mae, r2)``.
    """
    # Prefer an explicitly passed splitter, like the other evaluate_* functions;
    # fall back to the notebook-level one only for backward compatibility.
    splitter = kf if kf is not None else globals()['kf']

    vals = []
    preds = []

    cbr = CatBoostRegressor()
    for train_index, test_index in splitter.split(dfX):
        X_train, X_test = dfX.iloc[train_index], dfX.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        train_pool = Pool(data=X_train, label=y_train)
        test_pool = Pool(data=X_test, label=y_test)

        # verbose=False silences CatBoost's per-iteration training log.
        cbr.fit(train_pool, verbose=False)

        preds.append(cbr.predict(test_pool))
        vals.append(y_test)

    cat_df = pd.DataFrame({'Prediction':flatten(preds),'Actual Values':flatten(vals)})
    return error_rate("Catboost",cat_df['Prediction'],cat_df['Actual Values'])

evaluate_catboost(dfX,y)

('Catboost', 0.04001594428986717, 0.0326771489029961, 0.3194727692532964)

## Takeaway

Pada modelling pertama, terlihat metode ensemble learning seperti Random Forest dan CatBoost memiliki performa terbaik (R2 sekitar 0,31–0,32). Namun, tidak semua metode gradient boosting unggul: LightGBM (R2 sekitar 0,22) dan XGBoost (R2 sekitar 0,07) tanpa tuning masih berada di bawah Linear Regression (R2 sekitar 0,28). Selain itu, linear regression juga mempunyai performa yang cukup baik dengan waktu training yang sangat cepat.  
Oleh karena itu, feature selection akan dilakukan dengan menggunakan metode - metode tersebut.
