In [1]:
#  plot
%matplotlib inline
%matplotlib widget
from matplotlib import pyplot as plt
import matplotlib

#   basic packages
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm
import math

#   acceleratioin
from numba import jit, njit

#   learning packages
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor as knn
from sklearn.preprocessing import normalize
from lightgbm import LGBMModel, LGBMClassifier, LGBMRegressor, plot_importance
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor
# from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor


In [2]:
# load preprocessed data
with open('preprocessed_data/efedericisentence_bert_base/train_feature.json') as f:
    training_data = json.load(f)


In [3]:
training_id = [int(data['id']) for data in training_data]
X = [data['feature'] for data in training_data]
X = np.asarray(X, dtype=np.float32)
Y = np.asarray([int(data['label']) for data in training_data])


In [4]:
with open('preprocessed_data/efedericisentence_bert_base/test_feature.json') as f:
    testing_data = json.load(f)


In [5]:
testing_id = [int(data['id']) for data in testing_data]
X_test = [data['feature'] for data in testing_data]
X_test = np.asarray(X_test, dtype=np.float32)

In [6]:
X_train, X_eval, Y_train, Y_eval = train_test_split(X, Y, test_size=0.33, random_state=8787)


In [13]:
X_test_train, X_test_eval = train_test_split(X_test, test_size=0.33, random_state=8787)
print(X_test_train.shape, X_test_eval.shape)

(4231, 5393) (2084, 5393)


In [7]:
print(X.shape)
print(X_test.shape)
print(X_train.shape, X_eval.shape)

(17170, 5393)
(6315, 5393)
(11503, 5393) (5667, 5393)


# LightGBM Random Forest

In [8]:
lgbm_rf_model = LGBMRegressor(n_estimators=25000,
                            verbose=1,
                            n_jobs=10,
                            feature_fraction=0.3,
                            bagging_fraction=0.4,
                            bagging_freq=10,
                            random_state=1234,
                            boosting_type='rf',
                            metric='mae',)
lgbm_rf_model.fit(X_train,Y_train)

# with open(f'model/lgbm_rf_model_1.975083351596476.pickle', 'rb') as f:
#     lgbm_rf_model = pickle.load(f)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1088804
[LightGBM] [Info] Number of data points in the train set: 11503, number of used features: 5393
[LightGBM] [Info] Start training from score 4.578719


LGBMRegressor(bagging_fraction=0.4, bagging_freq=10, boosting_type='rf',
              feature_fraction=0.3, metric='mae', n_estimators=25000, n_jobs=10,
              random_state=1234, verbose=1)

In [10]:
# Validation random forest
rf_pred = lgbm_rf_model.predict(X_eval)
print(rf_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(rf_pred - Y_eval))
print(np.mean(np.abs(rf_pred - Y_eval)))

with open(f'model/lgbm_rf_model_{mae}.pickle', 'wb') as f:
    pickle.dump(lgbm_rf_model, f)

[5.91697832 4.85502455 3.21674401 5.19528259 5.36363094 5.09930997
 4.87604682 2.98759929 3.48252934 4.79766436]
[6 8 5 6 5 7 2 0 6 5]
1.9750910403054105


# AdaBoost

In [90]:
ada_model = AdaBoostClassifier()
ada_model = ada_model.fit(X_train, Y_train)

In [91]:
# Validation
ada_pred = ada_model.predict(X_eval)
print(ada_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(ada_pred - Y_eval))
print(np.mean(np.abs(ada_pred - Y_eval)))

[7 2 6 2 8 6 1 0 3 9]
[6 8 5 6 5 7 2 0 6 5]
1.892182812775719


In [16]:
with open(f'model/ensemble_adaboost_model_{mae}.pickle', 'wb') as f:
    pickle.dump(ada_model, f)

# LightGBM dart

In [17]:
lgbm_dart_model = LGBMRegressor(n_estimators=25000,
                            verbose=1,
                            n_jobs=10,
                            feature_fraction=0.3,
                            bagging_fraction=0.4,
                            bagging_freq=10,
                            random_state=1234,
                            boosting_type='dart',
                            metric='mae',)

lgbm_dart_model.fit(X_train,Y_train)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1088804
[LightGBM] [Info] Number of data points in the train set: 11503, number of used features: 5393
[LightGBM] [Info] Start training from score 4.578719


LGBMRegressor(bagging_fraction=0.4, bagging_freq=10, boosting_type='dart',
              feature_fraction=0.3, metric='mae', n_estimators=25000, n_jobs=10,
              random_state=1234, verbose=1)

In [18]:
# Validation dart
dart_pred = lgbm_dart_model.predict(X_eval)
print(dart_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(dart_pred - Y_eval))
print(np.mean(np.abs(dart_pred - Y_eval)))

[ 5.93984977  4.65687972  5.29374439  4.21730308  5.45037259  5.5643454
  3.05679004 -0.11307005  3.99976695  5.53339056]
[6 8 5 6 5 7 2 0 6 5]
1.5032024608614731


In [19]:
with open(f'model/lgbm_dart_model_{mae}.pickle', 'wb') as f:
    pickle.dump(lgbm_dart_model, f)

# LightGBM gbdt

In [20]:
lgbm_gbdt_model = LGBMRegressor(n_estimators=2500,
                            verbose=1,
                            n_jobs=10,
                            feature_fraction=0.3,
                            bagging_fraction=0.4,
                            random_state=1234,
                            boosting_type='gbdt',
                            metric='mae',
                            )

lgbm_gbdt_model.fit(X_train,Y_train)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1088804
[LightGBM] [Info] Number of data points in the train set: 11503, number of used features: 5393
[LightGBM] [Info] Start training from score 4.578719


LGBMRegressor(bagging_fraction=0.4, feature_fraction=0.3, metric='mae',
              n_estimators=2500, n_jobs=10, random_state=1234, verbose=1)

In [21]:
# Validation
gbdt_pred = lgbm_gbdt_model.predict(X_eval)
print(gbdt_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(gbdt_pred - Y_eval))
print(np.mean(np.abs(gbdt_pred - Y_eval)))

[ 6.73205006  4.53371631  3.97131426  4.49344836  5.18928407  5.4346011
  3.53314585 -0.1132879   3.20598694  6.43975914]
[6 8 5 6 5 7 2 0 6 5]
1.5325363633330564


In [98]:
with open(f'model/lgbm_gbdt_model_{mae}.pickle', 'wb') as f:
    pickle.dump(lgbm_gbdt_model, f)

In [47]:
lgbm_gbdt_model2 = LGBMRegressor(n_estimators=25000,
                            verbose=1,
                            n_jobs=10,
                            feature_fraction=0.3,
                            bagging_fraction=0.4,
                            bagging_freq=10,
                            random_state=1234,
                            boosting_type='gbdt',
                            metric='mae',
                            )

lgbm_gbdt_model2.fit(X_train,Y_train)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1088804
[LightGBM] [Info] Number of data points in the train set: 11503, number of used features: 5393
[LightGBM] [Info] Start training from score 4.578719


LGBMRegressor(bagging_fraction=0.4, bagging_freq=10, feature_fraction=0.3,
              metric='mae', n_estimators=25000, n_jobs=10, random_state=1234,
              verbose=1)

In [48]:
# Validation
gbdt2_pred = lgbm_gbdt_model2.predict(X_eval)
print(gbdt2_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(gbdt2_pred - Y_eval))
print(np.mean(np.abs(gbdt2_pred - Y_eval)))

[ 5.98833594  5.3094955   4.76288748  3.82490299  3.01964699  5.60997923
  3.28107687 -0.47853178  2.87270623  5.83664299]
[6 8 5 6 5 7 2 0 6 5]
1.6056601414281035


In [49]:
with open(f'model/lgbm_gbdt_bag_model_{mae}.pickle', 'wb') as f:
    pickle.dump(lgbm_gbdt_model2, f)

In [50]:
lgbm_gbdt_model3 = LGBMRegressor(n_estimators=2500,
                            verbose=1,
                            n_jobs=10,
                            feature_fraction=0.3,
                            bagging_fraction=0.4,
                            bagging_freq=10,
                            random_state=1234,
                            boosting_type='gbdt',
                            metric='mae',
                            )

lgbm_gbdt_model3.fit(X_train,Y_train)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1088804
[LightGBM] [Info] Number of data points in the train set: 11503, number of used features: 5393
[LightGBM] [Info] Start training from score 4.578719


LGBMRegressor(bagging_fraction=0.4, bagging_freq=10, feature_fraction=0.3,
              metric='mae', n_estimators=2500, n_jobs=10, random_state=1234,
              verbose=1)

In [51]:
# Validation
gbdt3_pred = lgbm_gbdt_model3.predict(X_eval)
print(gbdt3_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(gbdt3_pred - Y_eval))
print(np.mean(np.abs(gbdt3_pred - Y_eval)))

[ 5.98416481  5.31289515  4.76109461  3.82458854  3.01767715  5.6030891
  3.28373797 -0.47352973  2.86887975  5.83252205]
[6 8 5 6 5 7 2 0 6 5]
1.6056904021808203


# blend with small part test

In [86]:
usecol = ['Danceability']
df = pd.read_csv('./html2023-spring-final-project/test_predic_label.csv', usecols=usecol)

Y_test = df['Danceability'].astype(int).to_numpy()

X_test_train, X_test_eval, Y_test_train, Y_test_eval, id_test_train, id_test_eval = train_test_split(X_test, Y_test, testing_id,  test_size=0.66, random_state=8787)
print(X_test_train.shape, X_test_eval.shape, Y_test_train.shape, Y_test_eval.shape)

X_blend_train = np.concatenate([X_train, X_test_train])
Y_blend_train = np.concatenate([Y_train, Y_test_train])
print(X_blend_train.shape, Y_blend_train.shape)


(2147, 5393) (4168, 5393) (2147,) (4168,)
(11503, 5393) (13650, 5393) (13650,)


In [92]:
# Adaboost
ada_blend_model = AdaBoostClassifier()
ada_blend_model = ada_blend_model.fit(X_blend_train, Y_blend_train)

# Validation
ada_blend_pred = ada_blend_model.predict(X_eval)
print(ada_blend_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(ada_blend_pred - Y_eval))
print(np.mean(np.abs(ada_blend_pred - Y_eval)))


[7 7 4 6 7 6 4 0 3 9]
[6 8 5 6 5 7 2 0 6 5]
1.8459502382212811


In [99]:
with open(f'model/ensemble_adaboost_blend_model_{mae}.pickle', 'wb') as f:
    pickle.dump(ada_blend_model, f)

In [101]:
# LigthGBM dart
lgbm_dart_blend_model = LGBMRegressor(n_estimators=25000,
                            verbose=1,
                            n_jobs=10,
                            feature_fraction=0.3,
                            bagging_fraction=0.4,
                            bagging_freq=10,
                            random_state=1234,
                            boosting_type='dart',
                            metric='mae',)

lgbm_dart_blend_model.fit(X_blend_train,Y_blend_train)

# Validation dart
dart_blend_pred = lgbm_dart_blend_model.predict(X_eval)
print(dart_blend_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(dart_blend_pred - Y_eval))
print(np.mean(np.abs(dart_blend_pred - Y_eval)))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1093869
[LightGBM] [Info] Number of data points in the train set: 13650, number of used features: 5393
[LightGBM] [Info] Start training from score 4.548205
[ 5.9479592   4.5484604   5.32569816  4.10126705  5.51138259  5.46663252
  3.10486665 -0.01848302  3.52725796  5.6550889 ]
[6 8 5 6 5 7 2 0 6 5]
1.5016537392633487


In [102]:
with open(f'model/lgbm_dart_blend_model_{mae}.pickle', 'wb') as f:
    pickle.dump(lgbm_dart_blend_model, f)

In [94]:
# LigthGBM gbdt
lgbm_gbdt_blend_model = LGBMRegressor(n_estimators=2500,
                            verbose=1,
                            n_jobs=10,
                            feature_fraction=0.3,
                            bagging_fraction=0.4,
                            random_state=1234,
                            boosting_type='gbdt',
                            metric='mae',
                            )

lgbm_gbdt_blend_model.fit(X_blend_train,Y_blend_train)

# Validation
gbdt_blend_pred = lgbm_gbdt_blend_model.predict(X_eval)
print(gbdt_blend_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(gbdt_blend_pred - Y_eval))
print(np.mean(np.abs(gbdt_blend_pred - Y_eval)))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1093869
[LightGBM] [Info] Number of data points in the train set: 13650, number of used features: 5393
[LightGBM] [Info] Start training from score 4.548205
[ 5.99474503  3.10608888  4.95041417  4.439354    5.47372553  5.79711771
  2.54150961 -0.48164055  3.03441591  5.25761051]
[6 8 5 6 5 7 2 0 6 5]
1.5209615179935414


In [95]:
# Validation
gbdt_blend_pred = lgbm_gbdt_blend_model.predict(X_eval)
print(gbdt_blend_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(gbdt_blend_pred - Y_eval))
print(np.mean(np.abs(gbdt_blend_pred - Y_eval)))

[ 5.99474503  3.10608888  4.95041417  4.439354    5.47372553  5.79711771
  2.54150961 -0.48164055  3.03441591  5.25761051]
[6 8 5 6 5 7 2 0 6 5]
1.5209615179935414


In [100]:
with open(f'model/lgbm_gbdt_blend_model_{mae}.pickle', 'wb') as f:
    pickle.dump(lgbm_gbdt_blend_model, f)

In [None]:
lgbm_rf_blend_model = LGBMRegressor(n_estimators=25000,
                            verbose=1,
                            n_jobs=10,
                            feature_fraction=0.3,
                            bagging_fraction=0.4,
                            bagging_freq=10,
                            random_state=1234,
                            boosting_type='rf',
                            metric='mae',)
lgbm_rf_blend_model.fit(X_train,Y_train)

# Validation random forest
rf_blend_pred = lgbm_rf_model.predict(X_eval)
print(rf_blend_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(rf_blend_pred - Y_eval))
print(np.mean(np.abs(rf_blend_pred - Y_eval)))

In [None]:
with open(f'model/lgbm_rf_model_{mae}.pickle', 'wb') as f:
    pickle.dump(lgbm_rf_blend_model, f)

# Linear Search for best weights combo

In [30]:

def postClean(label):
    for i, label in enumerate(label):
        if label[i] < 0:
            label[i] = 0
        elif label[i] > 9:
            label[i] = 9
        
        label[i] = np.round(label[i])
    return label
@jit
def LinearSearch(pred1, pred2, pred3, Y_eval, if_clean:dict):

    # init search grid
    model1_w = np.arange(1, 20, 1)
    model2_w = np.arange(1, 20, 1)
    model3_w = np.arange(1, 20, 1)
    
    
    if if_clean['model1']:
        for i, label in enumerate(pred1):
            if pred1[i] < 0:
                pred1[i] = 0
            elif pred1[i] > 9:
                pred1[i] = 9
            
            if if_clean['model1_round']:
                pred1[i] = np.round(pred1[i])
                
    if if_clean['model2']:
        for i, label in enumerate(pred2):
            if pred2[i] < 0:
                pred2[i] = 0
            elif pred2[i] > 9:
                pred2[i] = 9
            
            if if_clean['model2_round']:
                pred2[i] = np.round(pred2[i])
                
    if if_clean['model3']:
        for i, label in enumerate(pred3):
            if pred3[i] < 0:
                pred3[i] = 0
            elif pred3[i] > 9:
                pred3[i] = 9
            
            if if_clean['model3_round']:
                pred3[i] = np.round(pred3[i])

    if_post_clean = [True, False]

    best_mae = 10
    best_weights = {
        'weight1': 0,
        'weight2': 0,
        'weight3': 0,
        'post_clean': False,
    }

    for w1 in model1_w:
        for w2 in model2_w:
            for w3 in model3_w:
                for post_clean in if_post_clean:
                    ensemble_pred = (w1*pred1 + w2*pred2 + w3*pred3) / np.sum([w1, w2, w3])
                    if post_clean:
                        for i, label in enumerate(ensemble_pred):
                            if ensemble_pred[i] < 0:
                                ensemble_pred[i] = 0
                            elif ensemble_pred[i] > 9:
                                ensemble_pred[i] = 9
                            
                            ensemble_pred[i] = np.round(ensemble_pred[i])
                            
                    mae = np.mean(np.abs(ensemble_pred - Y_eval))
                    if mae < best_mae:
                        best_mae = mae
                        best_weights['weight1'] = w1
                        best_weights['weight2'] = w2
                        best_weights['weight3'] = w3
                        best_weights['post_clean'] = post_clean
                        print(best_mae, best_weights)

                    
    return best_mae, best_weights

In [28]:
# gbdt_pred = lgbm_gbdt_model.predict(X_eval)
# dart_pred = lgbm_dart_model.predict(X_eval)
# ada_pred = ada_model.predict(X_eval)
if_clean = {
    'model1': False,
    'model1_round': False,
    'model2': False,
    'model2_round': False,
    'model3': False,
    'model3_round': False,
}
mae, best_weights = LinearSearch(gbdt_pred, dart_pred, ada_pred, Y_eval, if_clean)

Compilation is falling back to object mode WITH looplifting enabled because Function "LinearSearch" failed type inference due to: non-precise type pyobject
During: typing of argument at /var/folders/n2/87d8k7hn75z50bb14qb5p1wh0000gn/T/ipykernel_2171/4275379307.py (5)

File "../../../../../../../../../../../var/folders/n2/87d8k7hn75z50bb14qb5p1wh0000gn/T/ipykernel_2171/4275379307.py", line 5:
<source missing, REPL/exec in use?>

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "LinearSearch" failed type inference due to: cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "../../../../../../../../../../../var/folders/n2/87d8k7hn75z50bb14qb5p1wh0000gn/T/ipykernel_2171/4275379307.py", line 12:
<source missing, REPL/exec in use?>

  @jit

File "../../../../../../../../../../../var/folders/n2/87d8k7hn75z50bb14qb5p1wh0000gn/T/ipykernel_2171/4275379307.py", line 5:
<source missing, REPL/exec in use?>

  state.func_ir.lo

1.4928533615669666 {'weight1': 1, 'weight2': 1, 'weight3': 1, 'post_clean': True}
1.489853538027175 {'weight1': 1, 'weight2': 2, 'weight3': 1, 'post_clean': True}
1.488912416916652 {'weight1': 1, 'weight2': 4, 'weight3': 1, 'post_clean': False}
1.487030174695606 {'weight1': 1, 'weight2': 4, 'weight3': 2, 'post_clean': True}
1.486696860968963 {'weight1': 1, 'weight2': 7, 'weight3': 1, 'post_clean': False}
1.4861478736544909 {'weight1': 1, 'weight2': 7, 'weight3': 3, 'post_clean': True}
1.4859714134462678 {'weight1': 1, 'weight2': 9, 'weight3': 1, 'post_clean': True}
1.485955371609157 {'weight1': 1, 'weight2': 9, 'weight3': 1, 'post_clean': False}
1.4842068113640374 {'weight1': 1, 'weight2': 10, 'weight3': 1, 'post_clean': True}
1.4831480501146992 {'weight1': 1, 'weight2': 11, 'weight3': 1, 'post_clean': True}
1.4829715899064762 {'weight1': 1, 'weight2': 12, 'weight3': 1, 'post_clean': True}
1.482265749073584 {'weight1': 1, 'weight2': 13, 'weight3': 1, 'post_clean': True}
1.4820892888653

In [109]:
# 更換為Y_test_eval調參
mae_test, best_weights_test = LinearSearch(gbdt_pred, dart_pred, ada_pred, Y_test_eval, if_clean)

ValueError: operands could not be broadcast together with shapes (5667,) (4168,) 

In [104]:
# blend
mae_blend, best_weights_blend = LinearSearch(gbdt_blend_pred, dart_blend_pred, ada_blend_pred, Y_eval, if_clean)

1.4995588494794423 {'weight1': 1, 'weight2': 1, 'weight3': 1, 'post_clean': True}
1.4914416799011823 {'weight1': 1, 'weight2': 2, 'weight3': 1, 'post_clean': True}
1.481912828657138 {'weight1': 1, 'weight2': 3, 'weight3': 1, 'post_clean': True}
1.478383624492677 {'weight1': 2, 'weight2': 3, 'weight3': 1, 'post_clean': True}
1.474854420328216 {'weight1': 2, 'weight2': 4, 'weight3': 1, 'post_clean': True}
1.4743250397035468 {'weight1': 3, 'weight2': 4, 'weight3': 1, 'post_clean': True}
1.4699135344979708 {'weight1': 3, 'weight2': 5, 'weight3': 1, 'post_clean': True}
1.4693841538733017 {'weight1': 4, 'weight2': 5, 'weight3': 1, 'post_clean': True}
1.4685018528321863 {'weight1': 4, 'weight2': 6, 'weight3': 1, 'post_clean': True}
1.4679724722075171 {'weight1': 5, 'weight2': 7, 'weight3': 1, 'post_clean': True}
1.467443091582848 {'weight1': 6, 'weight2': 8, 'weight3': 1, 'post_clean': True}


# Predcit test

In [37]:
gbdt_pred_Test = lgbm_gbdt_model.predict(X_test)
dart_pred_Test = lgbm_dart_model.predict(X_test)
ada_pred_Test = ada_model.predict(X_test)

In [43]:
# weight
w1 = 1
w2 = 9
w3 = 1
ensemble_pred_Test = (w1 * gbdt_pred_Test + w2 * dart_pred_Test + w3 * ada_pred_Test) / (w1+w2+w3)

# print(best_weights)
# ensemble_pred_Test = (best_weights['weight1'] * gbdt_pred_Test + best_weights['weight2'] * dart_pred_Test + best_weights['weight3'] * ada_pred_Test) / (best_weights['weight1']+best_weights['weight2']+best_weights['weight3'])

# 清outliner和4捨5入
# ensemble_pred_Test = postClean(ensemble_pred_Test)
for i, label in enumerate(ensemble_pred_Test):
    if ensemble_pred_Test[i] < 0:
        ensemble_pred_Test[i] = 0
    elif ensemble_pred_Test[i] > 9:
        ensemble_pred_Test[i] = 9
    
    ensemble_pred_Test[i] = np.round(ensemble_pred_Test[i])

In [46]:
ensemble_pred = (w1 * gbdt_pred + w2 * dart_pred + w3 * ada_pred) / (w1+w2+w3)
for i, label in enumerate(ensemble_pred):
    if ensemble_pred[i] < 0:
        ensemble_pred[i] = 0
    elif ensemble_pred[i] > 9:
        ensemble_pred[i] = 9
    
    ensemble_pred[i] = np.round(ensemble_pred[i])
mae = np.mean(np.abs(ensemble_pred - Y_eval))
mae

1.4859714134462678

In [44]:
test_df = pd.DataFrame(columns=['id', 'Danceability'])
test_df['id'] = testing_id
test_df['Danceability'] = ensemble_pred_Test

test_df.to_csv('submission.csv', index=False)

In [105]:
gbdt_pred_Test_blend = lgbm_gbdt_blend_model.predict(X_test)
dart_pred_Test_blend = lgbm_dart_blend_model.predict(X_test)
ada_pred_Test_blend = ada_blend_model.predict(X_test)

In [107]:
w1 = 6
w2 = 8
w3 = 1
ensemble_pred_Test_blend = (w1 * gbdt_pred_Test_blend + w2 * dart_pred_Test_blend + w3 * ada_pred_Test_blend) / (w1+w2+w3)

for i, label in enumerate(ensemble_pred_Test_blend):
    if ensemble_pred_Test_blend[i] < 0:
        ensemble_pred_Test_blend[i] = 0
    elif ensemble_pred_Test_blend[i] > 9:
        ensemble_pred_Test_blend[i] = 9
    
    ensemble_pred_Test_blend[i] = np.round(ensemble_pred_Test_blend[i])

mae = np.mean(np.abs(ensemble_pred_Test_blend - Y_test))
mae

0.32636579572446556

In [108]:
test_df = pd.DataFrame(columns=['id', 'Danceability'])
test_df['id'] = testing_id
test_df['Danceability'] = ensemble_pred_Test_blend

test_df.to_csv('submission.csv', index=False)