In [1]:
#  plot
%matplotlib inline
%matplotlib widget
from matplotlib import pyplot as plt
import matplotlib

#   basic packages
import numpy as np
import pandas as pd
import json
import pickle
from tqdm import tqdm
import math

#   acceleratioin
from numba import jit, njit

#   learning packages
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor as knn
from sklearn.preprocessing import normalize
from lightgbm import LGBMModel, LGBMClassifier, LGBMRegressor, plot_importance
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor
# from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor


In [2]:
"""
load preprocessed data (train_feature_sklearn didn't perform well)
"""

with open('newModel/sentence-t5-large_train_feature.json') as f:
    training_data = json.load(f)
# with open('preprocessed_data/efedericisentence_bert_base/train_feature_sklearn.json') as f:
#     training_data = json.load(f)

training_id = [int(data['id']) for data in training_data]
X = [data['feature'] for data in training_data]
X = np.asarray(X, dtype=np.float32)
Y = np.asarray([int(data['label']) for data in training_data])


partial_ans = pd.read_csv('html2023-spring-final-project/test_partial_answer.csv')


with open('newModel/sentence-t5-large_test_feature.json') as f:
    testing_data = json.load(f)
# with open('preprocessed_data/efedericisentence_bert_base/test_feature_sklearn.json') as f:
#     testing_data = json.load(f)


testing_id = []
X_test = []
test_partial_id = []
X_test_partial = []
Y_test_partial = np.array(partial_ans['Danceability'])
id_list = np.array(partial_ans['id'])

for data in testing_data:
    if data['id'] not in id_list:
        testing_id += [int(data['id'])]
        X_test.append(data['feature'])
    else:
        X_test_partial.append(data['feature'])
 
testing_id = np.asarray(testing_id)       
X_test = np.asarray(X_test, dtype=np.float32)
X_test_partial = np.array(X_test_partial, dtype=np.float32)


In [3]:
std_list = []
mean_list = []

for i in range(10):
    label_idx = np.where(Y == i)
    temp = X[label_idx]
    
    std_dict = {}
    mean_dict = {}
    for j in range(14):
        mean_dict[j] = np.mean(temp[:,j])
        std_dict[j] = np.std(temp[:,j], ddof=1)
    std_list.append(std_dict)
    mean_list.append(mean_dict)

In [4]:
# normalize data
X_norm = normalize(X, axis=0, norm='max')
X_test_norm = normalize(X_test, axis=0, norm='max')
X_test_partial_norm = normalize(X_test_partial, axis=0, norm='max')

# splt a validation set from the training set
X_train, X_eval, Y_train, Y_eval = train_test_split(X_norm, Y, test_size=0.33, random_state=8787, shuffle=True)

# Adaboost

In [5]:
ada_model = AdaBoostClassifier()
ada_model = ada_model.fit(X_train, Y_train)

# Validation
ada_pred = ada_model.predict(X_eval)
print(ada_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(ada_pred - Y_eval))
print(np.mean(np.abs(ada_pred - Y_eval)))


ada_pred = ada_model.predict(X_test_partial_norm)
mae_test = np.mean(np.abs(ada_pred - Y_test_partial))
print(np.mean(np.abs(ada_pred - Y_test_partial)))

[9 6 6 1 9 6 2 0 3 9]
[6 8 5 6 5 7 2 0 6 5]
1.8787718369507675
2.8003169572107764


# LightGBM dart

In [6]:
lgbm_dart_model = LGBMRegressor(n_estimators=25000,
                            verbose=1,
                            n_jobs=10,
                            feature_fraction=0.3,
                            bagging_fraction=0.4,
                            bagging_freq=10,
                            random_state=1234,
                            boosting_type='dart',
                            metric='mae',)

lgbm_dart_model = lgbm_dart_model.fit(X_train,Y_train)

# Validation dart
dart_pred = lgbm_dart_model.predict(X_eval)
print(dart_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(dart_pred - Y_eval))
print(np.mean(np.abs(dart_pred - Y_eval)))

dart_pred = lgbm_dart_model.predict(X_test_partial_norm)
mae_test = np.mean(np.abs(dart_pred - Y_test_partial))
print(np.mean(np.abs(dart_pred - Y_test_partial)))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095677
[LightGBM] [Info] Number of data points in the train set: 11503, number of used features: 5393
[LightGBM] [Info] Start training from score 4.578719
[ 5.52147362  4.26601329  5.41100772  3.61140726  5.64775795  5.62819548
  2.93867741 -0.40017103  3.68136911  5.65649044]
[6 8 5 6 5 7 2 0 6 5]
1.4523977011281604
2.2585640067510226


# LightGBM gbdt

In [7]:
lgbm_model = LGBMRegressor(n_estimators=2500,
                            verbose=1,
                            n_jobs=10,
                            feature_fraction=0.3,
                            bagging_fraction=0.4,
                            random_state=1234,
                            boosting_type='gbdt',
                            metric='mae',
                            )

lgbm_model = lgbm_model.fit(X_train,Y_train)

# Validation
pred = lgbm_model.predict(X_eval)
mae = np.mean(np.abs(pred - Y_eval))
print(np.mean(np.abs(pred - Y_eval)))

pred = lgbm_model.predict(X_test_partial_norm)
mae_test = np.mean(np.abs(pred - Y_test_partial))
print(np.mean(np.abs(pred - Y_test_partial)))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095677
[LightGBM] [Info] Number of data points in the train set: 11503, number of used features: 5393
[LightGBM] [Info] Start training from score 4.578719
1.4932472700182868
2.2578649281935617


# GridSearch

In [8]:
# @jit
# def LinearSearch(pred1, pred2, pred3, pred4, Y_eval, if_clean:dict):
@jit
def LinearSearch(pred1, pred2, pred3, Y_eval, if_clean:dict):
    # init search grid
    model1_w = np.arange(1, 20, 1)
    model2_w = np.arange(1, 20, 1)
    model3_w = np.arange(1, 20, 1)
    
    if if_clean['model1']:
        for i, label in enumerate(pred1):
            if pred1[i] < 0:
                pred1[i] = 0
            elif pred1[i] > 9:
                pred1[i] = 9
            
            if if_clean['model1_round']:
                pred1[i] = np.round(pred1[i])
                
    if if_clean['model2']:
        for i, label in enumerate(pred2):
            if pred2[i] < 0:
                pred2[i] = 0
            elif pred2[i] > 9:
                pred2[i] = 9
            
            if if_clean['model2_round']:
                pred2[i] = np.round(pred2[i])
                
    if if_clean['model3']:
        for i, label in enumerate(pred3):
            if pred3[i] < 0:
                pred3[i] = 0
            elif pred3[i] > 9:
                pred3[i] = 9
            
            if if_clean['model3_round']:
                pred3[i] = np.round(pred3[i])

    if_post_clean = [True, False]

    best_mae = 10
    best_weights = {
        'weight1': 0,
        'weight2': 0,
        'weight3': 0,
        'post_clean': False,
    }

    for w1 in model1_w:
        for w2 in model2_w:
            for w3 in model3_w:
                # for w4 in model4_w:
                for post_clean in if_post_clean:
                    # ensemble_pred = (w1*pred1 + w2*pred2 + w3*pred3 + w4*pred4) / np.sum([w1, w2, w3, w4])
                    ensemble_pred = (w1*pred1 + w2*pred2 + w3*pred3) / np.sum([w1, w2, w3])
                    if post_clean:
                        for i, label in enumerate(ensemble_pred):
                            if ensemble_pred[i] < 0:
                                ensemble_pred[i] = 0
                            elif ensemble_pred[i] > 9:
                                ensemble_pred[i] = 9
                            
                            ensemble_pred[i] = np.round(ensemble_pred[i])
                            
                    mae = np.mean(np.abs(ensemble_pred - Y_eval))
                    if mae < best_mae:
                        best_mae = mae
                        best_weights['weight1'] = w1
                        best_weights['weight2'] = w2
                        best_weights['weight3'] = w3
                        # best_weights['weight4'] = w4
                        best_weights['post_clean'] = True
                        print(best_mae, best_weights)

                    
    return best_mae, best_weights

In [9]:
# lgbm_pred = lgbm_model.predict(X_eval)
# dart_pred = lgbm_dart_model.predict(X_eval)
# ada_pred = ada_model.predict(X_eval)
# if_clean = {
#     'model1': False,
#     'model1_round': False,
#     'model2': False,
#     'model2_round': False,
#     'model3': False,
#     'model3_round': False,
# }
# mae, best_weights = LinearSearch(ada_pred, lgbm_pred, dart_pred, Y_eval, if_clean)


# feature shift

In [43]:
try:
    del lgbm_model_2
    del lgbm_pred2
except:
    pass

lgbm_pred = lgbm_model.predict(X_test_partial_norm)
lgbm_pred2 = lgbm_model.predict(X_test_norm)
dart_pred = lgbm_dart_model.predict(X_test_partial_norm)
ada_pred = ada_model.predict(X_test_partial_norm)

X_test_partial_tmp = X_test_partial_norm.copy()
X_test_norm_tmp = X_test_norm.copy()

mae_gbdt = np.mean(np.abs(lgbm_pred - Y_test_partial))
mae_dart = np.mean(np.abs(dart_pred - Y_test_partial))
mae_ada = np.mean(np.abs(ada_pred - Y_test_partial))

print(np.mean(np.abs(lgbm_pred - Y_test_partial)))
print(np.mean(np.abs(dart_pred - Y_test_partial)))
print(np.mean(np.abs(ada_pred - Y_test_partial)))

for _ in range(10):

    try:
        lgbm_pred = lgbm_model_2.predict(X_test_partial_tmp)
        lgbm_pred2 = lgbm_model_2.predict(X_test_norm_tmp)
        # dart_pred = lgbm_dart_model_2.predict(X_test_partial_tmp)
        # ada_pred = ada_model_2.predict(X_test_partial_tmp)
    except:
        pass
    
    for i, label in enumerate(lgbm_pred):
        if lgbm_pred[i] < 0:
            lgbm_pred[i] = 0
        elif lgbm_pred[i] > 9:
            lgbm_pred[i] = 9
            
    for i in range(len(lgbm_pred)):
        for j in range(14):
            random_noise_level = np.random.rand() - 0.5
            # if  dart_pred[i] - mae_dart <= 0:
            #     random_jump = np.random.randint(0, mae_dart, size=1)
            # elif dart_pred[i] + mae_dart >= 9:
            #     random_jump = -np.random.randint(0, mae_dart, size=1)
            # else:
            #     random_jump = np.random.randint(-mae_dart, mae_dart, size=1)
            # X_test_partial_tmp[i][j] = mean_list[int(np.round(dart_pred[i]))][j]
            X_test_partial_tmp[i][j] = mean_list[int(np.round(lgbm_pred[i]))][j] + random_noise_level * std_list[int(np.round(lgbm_pred[i]))][j]
    try:
        for i, label in enumerate(lgbm_pred2):
            if lgbm_pred2[i] < 0:
                lgbm_pred2[i] = 0
            elif lgbm_pred2[i] > 9:
                lgbm_pred2[i] = 9
        for i in range(len(lgbm_pred2)):
            for j in range(14):
                random_noise_level = np.random.rand() - 0.5
                X_test_norm_tmp[i][j] = mean_list[int(np.round(lgbm_pred2[i]))][j] + random_noise_level * std_list[int(np.round(lgbm_pred2[i]))][j]
        print('in')
    except:
        pass    
    
    try:
        lgbm_pred = lgbm_model_2.predict(X_test_partial_tmp)
        # dart_pred = lgbm_dart_model_2.predict(X_test_partial_tmp)
        # ada_pred = ada_model_2.predict(X_test_partial_tmp)
    except:
        lgbm_pred = lgbm_model.predict(X_test_partial_tmp)
        # dart_pred = lgbm_dart_model.predict(X_test_partial_tmp)
        # ada_pred = ada_model.predict(X_test_partial_tmp)
        
    print('test round') # actually unfair to evaluate by this (you cannot trust this result)
    print(np.mean(np.abs(lgbm_pred - Y_test_partial)))
    # print(np.mean(np.abs(dart_pred - Y_test_partial)))
    # print(np.mean(np.abs(ada_pred - Y_test_partial)))
        
    seed = np.random.randint(low=0, high=len(X_test_partial_tmp))
    X_test_train, X_test_eval, Y_test_train, Y_test_eval = train_test_split(X_test_partial_tmp, Y_test_partial, test_size=0.25, random_state=seed)
    X_test_p_train, X_test_p_eval, Y_test_p_train, Y_test_p_eval = train_test_split(X_test_norm_tmp, lgbm_pred2, test_size=0.95, random_state=seed)
    
    # X_blend_train = np.concatenate([X_train, X_test_train])
    # Y_blend_train = np.concatenate([Y_train, Y_test_train])
    X_blend_train = np.concatenate([X_train, X_test_train, X_test_p_train])
    Y_blend_train = np.concatenate([Y_train, Y_test_train, Y_test_p_train])
    
    lgbm_model_2 = LGBMRegressor(n_estimators=2500,
                                verbose=1,
                                n_jobs=10,
                                feature_fraction=0.3,
                                bagging_fraction=0.4,
                                random_state=1234,
                                boosting_type='gbdt',
                                metric='mae',
                                )

    # lgbm_dart_model_2 = LGBMRegressor(n_estimators=25000,
    #                             verbose=1,
    #                             n_jobs=10,
    #                             feature_fraction=0.3,
    #                             bagging_fraction=0.4,
    #                             bagging_freq=10,
    #                             random_state=1234,
    #                             boosting_type='dart',
    #                             metric='mae',)
    # ada_model_2 = AdaBoostClassifier()

    lgbm_model_2 = lgbm_model_2.fit(X_blend_train, Y_blend_train)
    # lgbm_dart_model_2 = lgbm_dart_model_2.fit(X_blend_train, Y_blend_train)
    # ada_model_2 = ada_model_2.fit(X_blend_train, Y_blend_train)

    lgbm_pred = lgbm_model_2.predict(X_test_eval)
    # dart_pred = lgbm_dart_model_2.predict(X_test_eval)
    # ada_pred = ada_model_2.predict(X_test_eval)

    print('retrain blend result')
    print(np.mean(np.abs(lgbm_pred - Y_test_eval)))
    # print(np.mean(np.abs(dart_pred - Y_test_eval)))
    # print(np.mean(np.abs(ada_pred - Y_test_eval)))
    
    for i, label in enumerate(lgbm_pred):
        if lgbm_pred[i] < 0:
            lgbm_pred[i] = 0
        elif lgbm_pred[i] > 9:
            lgbm_pred[i] = 9

    for i in range(len(lgbm_pred)):
        for j in range(14):
            random_noise_level = np.random.rand() - 0.5
            X_test_eval[i][j] = mean_list[int(np.round(lgbm_pred[i]))][j] + random_noise_level * std_list[int(np.round(lgbm_pred[i]))][j]
    lgbm_pred = lgbm_model_2.predict(X_test_eval)
    print('feature shift')
    print(np.mean(np.abs(lgbm_pred - Y_test_eval)))

2.2578649281935617
2.2585640067510226
2.8003169572107764
in
test round
2.3492125797930403
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1111977
[LightGBM] [Info] Number of data points in the train set: 12260, number of used features: 5393
[LightGBM] [Info] Start training from score 4.570052
retrain blend result
2.223756794605872
feature shift
2.199860665164271
in
test round
0.7690133166024722
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1111908
[LightGBM] [Info] Number of data points in the train set: 12260, number of used features: 5393
[LightGBM] [Info] Start training from score 4.567805
retrain blend result
1.5390891571676715
feature shift
1.8880973429953196
in
test round
0.9816241189632776
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1111620
[LightGBM] [Info] Number of data points in the train set: 12260, number of used features: 5393
[LightGBM] [Info] Start 

In [56]:
lgbm_model_2 = LGBMRegressor(n_estimators=2500,
                                verbose=1,
                                n_jobs=10,
                                feature_fraction=0.3,
                                bagging_fraction=0.4,
                                random_state=1234,
                                boosting_type='gbdt',
                                metric='mae',
                                )
lgbm_model_2 = lgbm_model_2.fit(X_test_partial_tmp, Y_test_partial)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 874706
[LightGBM] [Info] Number of data points in the train set: 631, number of used features: 5393
[LightGBM] [Info] Start training from score 4.521395


In [57]:
lgbm_model_3 = lgbm_model_2

In [58]:
# Validation
tmp_pred = lgbm_model_3.predict(X_eval)

for i, label in enumerate(tmp_pred):
    if tmp_pred[i] < 0:
        tmp_pred[i] = 0
    elif tmp_pred[i] > 9:
        tmp_pred[i] = 9
    tmp_pred[i] = np.round(tmp_pred[i])

print(tmp_pred[:10])
print(Y_eval[:10])
mae = np.mean(np.abs(tmp_pred - Y_eval))
print(np.mean(np.abs(tmp_pred - Y_eval)))

[3. 3. 2. 4. 3. 4. 3. 2. 3. 4.]
[6 8 5 6 5 7 2 0 6 5]
2.455620257631904


In [59]:
label1 = lgbm_model_3.predict(X_test_norm_tmp)
label2 = lgbm_model_3.predict(X_test_partial_tmp)


for i, label in enumerate(label1):
    if label1[i] < 0:
        label1[i] = 0
    elif label1[i] > 9:
        label1[i] = 9
    label1[i] = np.round(label1[i])

for i, label in enumerate(label2):
    if label2[i] < 0:
        label2[i] = 0
    elif label2[i] > 9:
        label2[i] = 9
    label2[i] = np.round(label2[i])

In [60]:
final_label = np.concatenate([label1, label2])
final_label.shape

(6315,)

In [61]:
tt = np.concatenate([testing_id, id_list])
tt.shape

(6315,)

In [62]:
test_df = pd.DataFrame(columns=['id', 'Danceability'])
test_df['id'] = tt
test_df['Danceability'] = final_label

final_df = test_df.sort_values(by=['id'])

In [63]:
final_df

Unnamed: 0,id,Danceability
0,17170,4.0
1,17171,2.0
2,17172,3.0
3,17173,5.0
4,17174,4.0
...,...,...
5681,23480,5.0
5682,23481,8.0
6313,23482,3.0
5683,23483,7.0


In [64]:
final_df.to_csv('submission_stage2.csv', index=False)

In [65]:
np.mean(np.abs(label2 - Y_test_partial))

0.0

In [34]:
# lgbm_pred = lgbm_model.predict(X_test_partial_norm)
# dart_pred = lgbm_dart_model.predict(X_test_partial_norm)
# ada_pred = ada_model.predict(X_test_partial_norm)

# X_test_partial_tmp = X_test_partial_norm.copy()

# test_pred = lgbm_model_2.predict(X_test_partial_tmp)

# for i, label in enumerate(test_pred):
#     if test_pred[i] < 0:
#         test_pred[i] = 0
#     elif test_pred[i] > 9:
#         test_pred[i] = 9

# for i in range(len(test_pred)):
#     for j in range(14):
#         random_noise_level = np.random.rand() - 0.5
#         X_test_partial_tmp[i][j] = mean_list[int(np.round(test_pred[i]))][j] + random_noise_level * std_list[int(np.round(test_pred[i]))][j]


# Y_test_partial = np.array(partial_ans['Danceability'])
# lgbm_pred = lgbm_model_2.predict(X_test_partial_tmp)

# print('test round')
# print(np.mean(np.abs(lgbm_pred - Y_test_partial)))

In [35]:
# for i, label in enumerate(lgbm_pred):
#     if lgbm_pred[i] < 0:
#         lgbm_pred[i] = 0
#     elif lgbm_pred[i] > 9:
#         lgbm_pred[i] = 9

# print(np.round(lgbm_pred[:20]))
# print(Y_test_partial[:20])

In [36]:
# print(np.mean(np.abs(lgbm_pred - Y_test_partial)))

In [37]:
# print(len(lgbm_pred))
# print(len(Y_test_partial))