In [11]:
!pip install catboost
!pip install ipywidgets
!pip install shap
!jupyter nbextension enable --py widgetsnbextension



In [1]:
from catboost import Pool, CatBoostClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

In [2]:
with open('./dataset/' + 'linebot_train' + '_user_features.pickle', 'rb') as handle:
    X_train = pickle.load(handle)
with open('./dataset/' + 'linebot_test' + '_user_features.pickle', 'rb') as handle:
    X_test = pickle.load(handle)

In [3]:
X_train_features = X_train['cat_train_features']
y_train_scores = X_train['y_train_scores']

print("Train data")
print("User Features shape:", X_train_features.shape)
print("scores shape:", y_train_scores.shape)

Train data
User Features shape: (4280, 140)
scores shape: (4280,)


In [4]:
X_test_features = X_test['cat_test_features']
y_test_scores = X_test['y_test_scores']

print("Test data")
print("User Features shape:", X_test_features.shape)
print("scores shape:", y_test_scores.shape)

Test data
User Features shape: (1070, 140)
scores shape: (1070,)


In [5]:
from catboost import Pool, CatBoostClassifier

In [6]:
X_train_features[:, 15:106] = X_train_features[:, 15:106].astype(str)
X_test_features[:, 15:106] = X_test_features[:, 15:106].astype(str)

In [7]:
feature_importances = [6, 7, 12, 4, 13, 10, 11, 88, 106, 9, 14, 5, 8, 2, 117, 0, 132, 118, 121, 131, 100, 122, 129, 130, 124, 120, 20, 135, 133, 107, 123, 119, 139, 108, 112, 136, 3, 105, 90, 104, 127, 138, 134, 111, 126, 125, 110, 44, 137, 114, 61, 128, 16, 1, 115, 48, 73, 59, 109, 96, 84, 113, 54, 102, 65, 38, 58, 37, 75, 35, 66, 39, 116, 30, 60, 41, 85, 83, 70, 21, 94, 52, 99, 64, 27, 69, 22, 24, 31, 32, 95, 63, 97, 42, 93, 101, 17, 80, 87, 98, 40, 19, 77, 71, 34, 18, 55, 33, 103, 57, 47, 86, 53, 46, 91, 23, 36, 45, 25, 29, 82, 28, 79, 72, 50, 74, 56, 67, 68, 81, 15, 26, 43, 49, 51, 62, 76, 78, 89, 92]

In [8]:
X_train_features = X_train_features[:, feature_importances[:]]
X_test_features = X_test_features[:, feature_importances[:]]

In [9]:
# Time_cat 0
# DBA 1
# In_Outside 2
# Gender 3
# Age
# Education 5
# Occupation 6
# BSRS5
# OpenMindedness_BFI
# Conscientiousness_BFI
# Extraversion_BFI
# Agreeableness_BFI
# NegativeEmotionality_BFI
# NSSNoiseSensitivityScale
# Location 14
# Audio_class_cat_... 15~105
print(X_train_features.shape, X_test_features.shape)
X_train_features[:5]

(4280, 140) (1070, 140)


array([['學生', 4, 2.5, 23.0, 2.6, 2, 3, '0.0', 67.9288210808362, 3.5,
        '偏吵雜的工作場所', '大學', 2, '室外', 50.47718917846747, '4',
        72.08562990809602, 56.6434097018443, 67.92935292827207,
        75.34290144810711, '1.0', 65.22791204544602, 72.38151349689876,
        72.85390115510484, 69.46026722766617, 66.52866200074193, '0.0',
        66.55622533661031, 70.5402945204378, 72.81385013104897,
        65.65043589943046, 62.50552414571024, 1631906318.4540648,
        76.15121688768451, 0.0, 66.14869839844009, '女性', '0.0', '0.0',
        '0.0', 72.88845539575964, 13963788628.545378, 69.23927063716944,
        78.48290734391462, 71.70660051022645, 69.52068482708732,
        85.6451980468612, '0.0', 942437433.320309, 0.0, '0.0',
        71.20127419734547, '0.0', 81.3, 511.0, '0.0', '0.0', '0.0',
        82.77402241184832, '0.0', '0.0', 0.0, '0.0', '0.0', '0.0', '0.0',
        '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', 999.0, '0.0', '0.0',
        '0.0', '0.0', '0.0', '0.0', '0.0', '0.0',

In [10]:
cat_features = []
for idx, value in enumerate(X_train_features[0][:]):
    if isinstance(value, str):
        cat_features.append(idx)
train_dataset = Pool(data=X_train_features[:, :],
                     label=y_train_scores,
                     cat_features=cat_features)

eval_dataset = Pool(data=X_test_features[:, :],
                    label=y_test_scores,
                    cat_features=cat_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=4000,
                           learning_rate=0.03,
                           l2_leaf_reg=7,
                           depth=8,
                           use_best_model=True,
                           early_stopping_rounds=300,
                           task_type="GPU",
                           devices='0:1')

# summary = model.select_features(train_dataset,
#                                 eval_set=eval_dataset,
#                                 features_for_select='0-139',
#                                 num_features_to_select=50,
#                                 steps=5,
#                                 train_final_model=True,
#                                 logging_level='Verbose',
#                                 plot=True)
# summary


# Fit model

model.fit(train_dataset, eval_set=eval_dataset, early_stopping_rounds=300, verbose=100, plot=True)
# Get predicted classes
preds_class = model.predict(eval_dataset)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_dataset)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_dataset,
                          prediction_type='RawFormulaVal')


#best params: 'params': {'depth': 10, 'l2_leaf_reg': 7, 'learning_rate': 0.1}
# 'params': {'depth': 8, 'l2_leaf_reg': 7, 'learning_rate': 0.1},
# grid = {'learning_rate': [0.03, 0.1, 0.3],
#         'depth': [8, 10, 12],
#         'l2_leaf_reg': [5, 7, 9]}

# grid_search_result = model.grid_search(grid,
#                                        X=train_dataset,
#                                        verbose=100,
#                                        plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6840115	test: 0.6847848	best: 0.6847848 (0)	total: 48.1ms	remaining: 3m 12s
100:	learn: 0.4380679	test: 0.4698643	best: 0.4698643 (100)	total: 7.89s	remaining: 5m 4s
200:	learn: 0.3882488	test: 0.4441182	best: 0.4441182 (200)	total: 15.4s	remaining: 4m 51s
300:	learn: 0.3581375	test: 0.4341302	best: 0.4341302 (300)	total: 22.8s	remaining: 4m 39s
400:	learn: 0.3339239	test: 0.4298701	best: 0.4297795 (394)	total: 30.3s	remaining: 4m 31s
500:	learn: 0.3158470	test: 0.4288289	best: 0.4285768 (492)	total: 37.9s	remaining: 4m 24s
600:	learn: 0.3012966	test: 0.4274577	best: 0.4274567 (598)	total: 45.3s	remaining: 4m 16s
700:	learn: 0.2887463	test: 0.4270916	best: 0.4268574 (684)	total: 52.7s	remaining: 4m 7s
800:	learn: 0.2766477	test: 0.4267174	best: 0.4265769 (723)	total: 1m	remaining: 4m
900:	learn: 0.2641444	test: 0.4262805	best: 0.4262186 (886)	total: 1m 7s	remaining: 3m 53s
1000:	learn: 0.2562110	test: 0.4256173	best: 0.4253554 (986)	total: 1m 15s	remaining: 3m 46s
1100:	lea

In [None]:
import shap
shap.initjs()
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(train_dataset)
# shap.force_plot(explainer.expected_value, shap_values[0,:], X_train_features[0, :9])

In [None]:
features_names = [
'Time_cat', 
'DBA', 
'In_Outside', 
'Gender', 
'Age', 
'Education', 
'Occupation', 
'BSRS5', 
'OpenMindedness_BFI', 
'Conscientiousness_BFI', 
'Extraversion_BFI', 
'Agreeableness_BFI', 
'NegativeEmotionality_BFI', 
'NSSNoiseSensitivityScale', 
'Location', 
'Audio_class_cat_交通噪音', 
'Audio_class_cat_公共汽車', 
'Audio_class_cat_公共空間', 
'Audio_class_cat_剪刀', 
'Audio_class_cat_劈啪作響聲', 
'Audio_class_cat_動物叫聲', 
'Audio_class_cat_卡車', 
'Audio_class_cat_叮噹作響', 
'Audio_class_cat_吹口哨', 
'Audio_class_cat_呼吸聲', 
'Audio_class_cat_咀嚼', 
'Audio_class_cat_咳嗽', 
'Audio_class_cat_哭聲', 
'Audio_class_cat_喇叭聲', 
'Audio_class_cat_喧嘩嘈雜聲', 
'Audio_class_cat_嗡嗡聲', 
'Audio_class_cat_嘎嘎聲', 
'Audio_class_cat_噪音', 
'Audio_class_cat_噴嚏', 
'Audio_class_cat_囓齒動物', 
'Audio_class_cat_垃圾車', 
'Audio_class_cat_城市的或人造的', 
'Audio_class_cat_大房間或大廳', 
'Audio_class_cat_安靜', 
'Audio_class_cat_室內', 
'Audio_class_cat_室外', 
'Audio_class_cat_對話', 
'Audio_class_cat_小房間', 
'Audio_class_cat_尖叫聲', 
'Audio_class_cat_工地噪音', 
'Audio_class_cat_廣播', 
'Audio_class_cat_引擎', 
'Audio_class_cat_心跳聲', 
'Audio_class_cat_戶外', 
'Audio_class_cat_打印機', 
'Audio_class_cat_打嗝', 
'Audio_class_cat_打字聲', 
'Audio_class_cat_拉門聲', 
'Audio_class_cat_掌聲', 
'Audio_class_cat_摩托車', 
'Audio_class_cat_摩擦聲', 
'Audio_class_cat_放屁', 
'Audio_class_cat_敲擊聲', 
'Audio_class_cat_書寫聲', 
'Audio_class_cat_機動車（公路）', 
'Audio_class_cat_機械聲', 
'Audio_class_cat_機械風扇', 
'Audio_class_cat_水上交通工具', 
'Audio_class_cat_沙沙聲', 
'Audio_class_cat_沸騰', 
'Audio_class_cat_流水聲', 
'Audio_class_cat_海浪聲', 
'Audio_class_cat_消防車', 
'Audio_class_cat_滑鼠聲', 
'Audio_class_cat_滴答聲', 
'Audio_class_cat_爆炸聲', 
'Audio_class_cat_球類', 
'Audio_class_cat_環境噪音', 
'Audio_class_cat_白噪聲', 
'Audio_class_cat_直升機', 
'Audio_class_cat_空調', 
'Audio_class_cat_笑聲', 
'Audio_class_cat_箭', 
'Audio_class_cat_粉紅噪音', 
'Audio_class_cat_聲納', 
'Audio_class_cat_腳步聲', 
'Audio_class_cat_自行車', 
'Audio_class_cat_蛙鳴', 
'Audio_class_cat_蟲鳴', 
'Audio_class_cat_說話', 
'Audio_class_cat_警報聲', 
'Audio_class_cat_警笛', 
'Audio_class_cat_跳動聲', 
'Audio_class_cat_車輛', 
'Audio_class_cat_道路噪音', 
'Audio_class_cat_鄉村或自然', 
'Audio_class_cat_鈴聲', 
'Audio_class_cat_鍋碗瓢盆', 
'Audio_class_cat_鐘聲', 
'Audio_class_cat_鐵路交通', 
'Audio_class_cat_鑽頭', 
'Audio_class_cat_雨聲', 
'Audio_class_cat_雷聲', 
'Audio_class_cat_電動工具', 
'Audio_class_cat_電視', 
'Audio_class_cat_音樂', 
'Audio_class_cat_風噪聲（麥克風）', 
'Audio_class_cat_風聲', 
'Audio_class_cat_飛機', 
'Audio_class_cat_馬達聲', 
'Audio_class_cat_鳥類叫聲', 
'Min', 
'LA90', 
'LA50', 
'LA10', 
'Max', 
'Leq', 
'< 45 dBA', 
'45~55dBA', 
'55~65dBA', 
'65~75dBA', 
'> 75dBA', 
'100Hz', 
'125Hz', 
'160Hz', 
'200Hz', 
'250Hz', 
'315Hz', 
'400Hz', 
'500Hz', 
'630Hz', 
'800Hz', 
'1000Hz', 
'1250Hz', 
'1600Hz', 
'2000Hz', 
'2500Hz', 
'3150Hz', 
'4000Hz', 
'5000Hz', 
'6300Hz',
'8000Hz', 
'< 200Hz', 
'200~4000Hz', 
'> 4000Hz' ]
len(features_names)

In [None]:
print(model.feature_importances_)
shap.summary_plot(shap_values, X_train_features[:, :])

In [None]:
user_importances = []
class_importances = []
audio_importances = []
for i, importance in sorted(enumerate(model.feature_importances_), key=(lambda k: k[1]), reverse=True):
    print(features_names[i])
    if i < 15:
        user_importances.append(features_names[i])
    elif i < 106:
        class_importances.append(features_names[i])
    else:
        audio_importances.append(features_names[i])

In [34]:
s = ''
for i, label in enumerate(audio_importances):
    s += (', ' + label[:])
    if i >= 19:
        break
print(s)

, Min, 100Hz, 3150Hz, 125Hz, 250Hz, 2500Hz, 315Hz, 1600Hz, 2000Hz, 500Hz, 200Hz, 6300Hz, 4000Hz, LA90, 400Hz, 160Hz, > 4000Hz, LA50, < 45 dBA, 8000Hz


In [10]:
preds_class

array([0., 1., 1., ..., 0., 0., 0.])

In [11]:
preds_proba

array([[0.65061826, 0.34938174],
       [0.10433375, 0.89566625],
       [0.42647838, 0.57352162],
       ...,
       [0.95972733, 0.04027267],
       [0.72525193, 0.27474807],
       [0.64318813, 0.35681187]])

In [12]:
preds_raw

array([-0.62175794,  2.14997292,  0.29623395, ..., -3.17097604,
       -0.97066452, -0.58922858])

In [12]:
from sklearn import metrics

def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train_scores,m.predict_proba(train)[:,1]),
                            metrics.roc_auc_score(y_test_scores,m.predict_proba(test)[:,1]))

In [13]:
auc(model, train_dataset, eval_dataset)

(0.9712075951064377, 0.8903857458780058)

In [14]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

print(accuracy_score(y_test_scores, preds_class))
print(f1_score(y_test_scores, preds_class, average="macro"))
print(precision_score(y_test_scores, preds_class, average="macro"))
print(recall_score(y_test_scores, preds_class, average="macro")) 

0.8
0.7999154153734306
0.7999643635753815
0.800426196008174
