In [1]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import pickle
from sklearn.preprocessing import OneHotEncoder

In [2]:
basepath = './dataset/audio_wav'
pd_random_seed = 1

In [3]:
class_df = pd.read_csv(os.path.join(basepath, 'linebot_data_class.csv'))
class_df = class_df[['file_name', 'new_cate_2']]
class_df.rename({'file_name': 'AudioFile_name'}, axis=1, inplace=True)
class_df.rename({'new_cate_2': 'Audio_class_cat'}, axis=1, inplace=True)
print(len(class_df))
class_df.head(10)

32382


Unnamed: 0,AudioFile_name,Audio_class_cat
0,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-06-01 1...,車輛
1,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-06-09 1...,車輛
2,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-05-27 0...,音樂
3,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-05-26 1...,說話
4,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-06-01 1...,說話
5,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-06-04 0...,室內
6,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-06-08 1...,車輛
7,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-05-21 1...,音樂
8,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-06-02 1...,大房間或大廳
9,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-06-11 1...,海浪聲


In [4]:
cat, cnt = np.unique(np.asarray(class_df['Audio_class_cat']), return_counts=True)
print(cat)
print(cnt)
print(len(cnt))
print(cnt.sum())

['交通噪音' '公共汽車' '公共空間' '剪刀' '劈啪作響聲' '動物叫聲' '卡車' '叮噹作響' '吹口哨' '呼吸聲' '咀嚼'
 '咳嗽' '哭聲' '喇叭聲' '喧嘩嘈雜聲' '嗡嗡聲' '嘎嘎聲' '噪音' '噴嚏' '囓齒動物' '垃圾車' '城市的或人造的'
 '大房間或大廳' '安靜' '室內' '室外' '對話' '小房間' '尖叫聲' '工地噪音' '廣播' '引擎' '心跳聲' '戶外'
 '打印機' '打嗝' '打字聲' '拉門聲' '掌聲' '摩托車' '摩擦聲' '放屁' '敲擊聲' '書寫聲' '機動車（公路）' '機械聲'
 '機械風扇' '水上交通工具' '沙沙聲' '沸騰' '流水聲' '海浪聲' '消防車' '滑鼠聲' '滴答聲' '爆炸聲' '球類'
 '環境噪音' '白噪聲' '直升機' '空調' '笑聲' '箭' '粉紅噪音' '聲納' '腳步聲' '自行車' '蛙鳴' '蟲鳴' '說話'
 '警報聲' '警笛' '跳動聲' '車輛' '道路噪音' '鄉村或自然' '鈴聲' '鍋碗瓢盆' '鐘聲' '鐵路交通' '鑽頭' '雨聲'
 '雷聲' '電動工具' '電視' '音樂' '風噪聲（麥克風）' '風聲' '飛機' '馬達聲' '鳥類叫聲']
[  57  463  135    2   12 2605   13   66    4  246   12   31   31   10
   32  184  194  154   11   81   16  129  412 2596 4208   10  244 3780
    2   90   26   17  215  684   52   22   25  104    2   62    3    3
   50  215  788  337  505   55  416   23  174   45    1   15   26   29
   17   10  563    4   62   43   27   13   10   34    4   27  124 3981
   82   26   36 2609   67  557    8   27   85  488    3  204   13   53
  406 1412  236  

In [5]:
def one_hot_encode(df, column_name):
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    df_encoded = pd.DataFrame(encoder.fit_transform(df[[column_name]]))
    df_encoded.columns = encoder.get_feature_names_out([column_name])
    df.drop([column_name] ,axis=1, inplace=True)
    df = pd.concat([df, df_encoded], axis=1)
    return df

In [6]:
class_df = one_hot_encode(class_df, 'Audio_class_cat')
class_df = class_df.dropna()
print(len(class_df))
class_df.head()

32382


Unnamed: 0,AudioFile_name,Audio_class_cat_交通噪音,Audio_class_cat_公共汽車,Audio_class_cat_公共空間,Audio_class_cat_剪刀,Audio_class_cat_劈啪作響聲,Audio_class_cat_動物叫聲,Audio_class_cat_卡車,Audio_class_cat_叮噹作響,Audio_class_cat_吹口哨,...,Audio_class_cat_雨聲,Audio_class_cat_雷聲,Audio_class_cat_電動工具,Audio_class_cat_電視,Audio_class_cat_音樂,Audio_class_cat_風噪聲（麥克風）,Audio_class_cat_風聲,Audio_class_cat_飛機,Audio_class_cat_馬達聲,Audio_class_cat_鳥類叫聲
0,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-06-01 1...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-06-09 1...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-05-27 0...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-05-26 1...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-06-01 1...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
print(len(class_df))
class_df = class_df.groupby('AudioFile_name').max().reset_index()
print(len(class_df))
class_df.head()

32382
10969


Unnamed: 0,AudioFile_name,Audio_class_cat_交通噪音,Audio_class_cat_公共汽車,Audio_class_cat_公共空間,Audio_class_cat_剪刀,Audio_class_cat_劈啪作響聲,Audio_class_cat_動物叫聲,Audio_class_cat_卡車,Audio_class_cat_叮噹作響,Audio_class_cat_吹口哨,...,Audio_class_cat_雨聲,Audio_class_cat_雷聲,Audio_class_cat_電動工具,Audio_class_cat_電視,Audio_class_cat_音樂,Audio_class_cat_風噪聲（麥克風）,Audio_class_cat_風聲,Audio_class_cat_飛機,Audio_class_cat_馬達聲,Audio_class_cat_鳥類叫聲
0,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-05-19 1...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-05-20 1...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-05-21 0...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-05-21 1...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,U02c91f8f2ecb32c96a13d9c83f69b0f9(2022-05-21 1...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Features preprocessing description
- total 38 features:
    - changed features:
        - Audio_class: one hot encoded to 91 columns -> 38 to (38 - 1 + 91) = 128 columns
        - Feel_score: change value [1, 2, 3] to 0 and value [4, 5] to 1
        - 時: rename to 'Time_cat' and do binning
        - 周末: rename to is_weekend and change its value
            - if value == 'y': value = 1
            - else: value = 0
        - gender: change its value
            - if value == '男性': value = 1
            - else: value = 0

In [8]:
df = pd.read_excel(os.path.join(basepath, 'linebot_data20221205.xlsx'))
# 'Audio_class' will be merge from linebot_data_class.csv
df = df[['AudioFile_name', 'DBA', 'Feel_score', 'Feel_health', '時', '周末',
         'Min', 'LA90', 'LA50', 'LA10', 'Max', 'Leq', 'LA10_LA90', 'MAX_MIN',
         '大於75dB', 'F65_75dB', 'F55_65dB', 'F45_55dB', '小於45dB',
         '大於4kHz', 'F200_4kHz', '小於200H', 'age', 'gender', 'educode', 'BSRS5', 'Noise_exposure',
         'Open Mindedness_BFI', 'Conscientiousness_BFI', 'Extraversion_BFI', 'Agreeableness_BFI',
         'Negative Emotionality_BFI', 'NSS(Noise Sensitivity Scale)', 'Max_LP', 'd_type', 'site_location2', 'ses_level']]
df = df.dropna()

print(len(df))
df = df[abs(df['Feel_score']-df['Feel_health'])<3]
print(len(df))

df['Feel_score'] = np.where((df['Feel_score']==1) | (df['Feel_score']==2) | (df['Feel_score']==3), 0, df['Feel_score'])
df['Feel_score'] = np.where((df['Feel_score']==4) | (df['Feel_score']==5), 1, df['Feel_score'])

df['Feel_health'] = np.where((df['Feel_health']==1) | (df['Feel_health']==2) | (df['Feel_health']==3), 0, df['Feel_health'])
df['Feel_health'] = np.where((df['Feel_health']==4) | (df['Feel_health']==5), 1, df['Feel_health'])

df.rename({'時': 'Time_cat'}, axis=1, inplace=True)
df = df.assign(Time_cat=pd.cut(df['Time_cat'], 
                               bins=[0, 6, 9, 18, 24], 
                               labels=['1', '2', '3', '4']))

df.rename({'周末': 'is_weekend'}, axis=1, inplace=True)
df['is_weekend'] = np.where(df['is_weekend']!='y', 0, df['is_weekend'])
df['is_weekend'] = np.where(df['is_weekend']=='y', 1, df['is_weekend'])

df['gender'] = np.where(df['gender']!='男性', 0, df['gender'])
df['gender'] = np.where(df['gender']=='男性', 1, df['gender'])

df = pd.merge(df, class_df, on="AudioFile_name", how="inner")

df = df.dropna().sample(frac=1, random_state=pd_random_seed).reset_index(drop=True)
print(df.shape)
df.head(10)

10166
10106
(9834, 128)


Unnamed: 0,AudioFile_name,DBA,Feel_score,Feel_health,Time_cat,is_weekend,Min,LA90,LA50,LA10,...,Audio_class_cat_雨聲,Audio_class_cat_雷聲,Audio_class_cat_電動工具,Audio_class_cat_電視,Audio_class_cat_音樂,Audio_class_cat_風噪聲（麥克風）,Audio_class_cat_風聲,Audio_class_cat_飛機,Audio_class_cat_馬達聲,Audio_class_cat_鳥類叫聲
0,U1ec34c20faa76a50df1ec15876e6b498(2022-05-22 0...,50.17,0,0,2,1,42.077072,45.047828,47.255137,48.914119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Uc8d7b09fd377e7e83046b4593ae235db(2022-07-12 1...,35.11,1,1,4,0,23.673985,27.113053,31.312686,34.988202,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,U81dd3f16fcb0d67c002bcf6aacc3cba0(2022-06-29 1...,59.73,1,0,3,0,47.333809,51.870583,55.926191,60.521378,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Uf30819db1dfab2a5acc766a08eda454b(2022-05-26 2...,64.02,0,0,4,0,26.878681,57.593943,60.233606,63.424884,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,U1ec34c20faa76a50df1ec15876e6b498(2022-05-27 0...,41.09,1,1,2,0,20.854745,26.779183,32.933182,41.166755,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,U5bf5273eb94dccdede5f5fd9ef509b06(2022-06-25 2...,35.46,1,1,4,1,26.481646,29.025916,31.300616,34.499085,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,U343746fde21a7d04c724ab9878b8233e(2022-07-16 2...,56.18,0,1,4,1,17.896923,44.976383,47.406677,55.282706,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,U716f3681d442c3320de2cd6c2c86024a(2022-06-11 1...,44.85,0,1,3,1,18.059136,37.548871,39.950587,43.90267,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,U7f2bc803739d1599ad7ef8475aca38f5(2022-05-26 2...,31.9,0,1,4,0,14.514427,20.570001,28.683383,30.68884,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Ufcb4342e83f09cb9fcca1437c9b03fb8(2022-05-28 1...,63.57,0,0,3,1,50.760723,55.822287,60.018101,62.97179,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
for c in df.columns:
    print(c)

AudioFile_name
DBA
Feel_score
Feel_health
Time_cat
is_weekend
Min
LA90
LA50
LA10
Max
Leq
LA10_LA90
MAX_MIN
大於75dB
F65_75dB
F55_65dB
F45_55dB
小於45dB
大於4kHz
F200_4kHz
小於200H
age
gender
educode
BSRS5
Noise_exposure
Open Mindedness_BFI
Conscientiousness_BFI
Extraversion_BFI
Agreeableness_BFI
Negative Emotionality_BFI
NSS(Noise Sensitivity Scale)
Max_LP
d_type
site_location2
ses_level
Audio_class_cat_交通噪音
Audio_class_cat_公共汽車
Audio_class_cat_公共空間
Audio_class_cat_剪刀
Audio_class_cat_劈啪作響聲
Audio_class_cat_動物叫聲
Audio_class_cat_卡車
Audio_class_cat_叮噹作響
Audio_class_cat_吹口哨
Audio_class_cat_呼吸聲
Audio_class_cat_咀嚼
Audio_class_cat_咳嗽
Audio_class_cat_哭聲
Audio_class_cat_喇叭聲
Audio_class_cat_喧嘩嘈雜聲
Audio_class_cat_嗡嗡聲
Audio_class_cat_嘎嘎聲
Audio_class_cat_噪音
Audio_class_cat_噴嚏
Audio_class_cat_囓齒動物
Audio_class_cat_垃圾車
Audio_class_cat_城市的或人造的
Audio_class_cat_大房間或大廳
Audio_class_cat_安靜
Audio_class_cat_室內
Audio_class_cat_室外
Audio_class_cat_對話
Audio_class_cat_小房間
Audio_class_cat_尖叫聲
Audio_class_cat_工地噪音
Audio_clas

In [10]:
print(len(df))
g = df.groupby('Feel_score')
print(g.size())
# df = g.apply(lambda x: x.sample(g.size().min())).reset_index(drop=True)
df0 = df.loc[df['Feel_score'] == 0.0][:g.size().min()]
df1 = df.loc[df['Feel_score'] == 1.0][:g.size().min()]
df = pd.concat([df0, df1])
df

9834
Feel_score
0    7020
1    2814
dtype: int64


Unnamed: 0,AudioFile_name,DBA,Feel_score,Feel_health,Time_cat,is_weekend,Min,LA90,LA50,LA10,...,Audio_class_cat_雨聲,Audio_class_cat_雷聲,Audio_class_cat_電動工具,Audio_class_cat_電視,Audio_class_cat_音樂,Audio_class_cat_風噪聲（麥克風）,Audio_class_cat_風聲,Audio_class_cat_飛機,Audio_class_cat_馬達聲,Audio_class_cat_鳥類叫聲
0,U1ec34c20faa76a50df1ec15876e6b498(2022-05-22 0...,50.17,0,0,2,1,42.077072,45.047828,47.255137,48.914119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Uf30819db1dfab2a5acc766a08eda454b(2022-05-26 2...,64.02,0,0,4,0,26.878681,57.593943,60.233606,63.424884,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,U343746fde21a7d04c724ab9878b8233e(2022-07-16 2...,56.18,0,1,4,1,17.896923,44.976383,47.406677,55.282706,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,U716f3681d442c3320de2cd6c2c86024a(2022-06-11 1...,44.85,0,1,3,1,18.059136,37.548871,39.950587,43.902670,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,U7f2bc803739d1599ad7ef8475aca38f5(2022-05-26 2...,31.90,0,1,4,0,14.514427,20.570001,28.683383,30.688840,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9815,U92b984697b6e402ed024c6d17925eda8(2022-06-24 1...,31.67,1,1,3,0,8.245280,9.911901,11.931190,22.885744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9818,U92ab75d4e115573d8ccb35a0d0c76a34(2022-05-22 1...,46.02,1,1,3,1,11.384272,13.661894,15.263457,25.761169,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9823,U4485aa5166cf4ddfef47f62b9224ee32(2022-05-27 2...,35.82,1,1,4,0,18.188685,20.112204,22.976200,35.107213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9825,U55ba52f16342c523fb361acd5aa09319(2022-06-19 1...,33.42,1,1,3,1,13.346287,28.644071,30.119144,31.688725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df = df.sample(frac=1, random_state=pd_random_seed).reset_index(drop=True)
df

Unnamed: 0,AudioFile_name,DBA,Feel_score,Feel_health,Time_cat,is_weekend,Min,LA90,LA50,LA10,...,Audio_class_cat_雨聲,Audio_class_cat_雷聲,Audio_class_cat_電動工具,Audio_class_cat_電視,Audio_class_cat_音樂,Audio_class_cat_風噪聲（麥克風）,Audio_class_cat_風聲,Audio_class_cat_飛機,Audio_class_cat_馬達聲,Audio_class_cat_鳥類叫聲
0,U985fa4d251a0b428e624aa3b0141dd08(2022-05-30 1...,41.39,1,1,3,0,28.414896,30.736440,32.393545,35.008557,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,U61cef6fdfd197d5df99408ca4a2142ec(2022-06-02 0...,48.15,0,0,2,0,13.137367,31.849239,39.720913,49.190985,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,U6b3e72c9846bb4cc2532c6906e9e6b14(2022-05-27 1...,59.83,0,0,3,0,28.661609,53.984254,56.711833,58.516686,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,U8494d640671b2b7c9582a793f098c25e(2022-06-25 1...,55.10,1,1,3,1,39.399548,50.230903,51.762734,52.943789,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Uc8d7b09fd377e7e83046b4593ae235db(2022-07-28 0...,55.38,0,0,2,0,42.523831,47.063245,51.617696,54.855630,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5623,Uf4854464d72b7fd781a3ab036eb5f3d6(2022-07-21 1...,47.22,0,0,3,0,18.963946,25.498632,29.381929,47.136110,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5624,U45c45ee0d2f6e1de8adba632802eb1c0(2022-05-24 1...,33.96,1,0,3,0,18.254663,28.382460,30.642455,32.761166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5625,U5bf5273eb94dccdede5f5fd9ef509b06(2022-06-15 0...,25.48,1,1,1,0,18.269646,20.108467,21.502478,24.461239,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5626,Ua5db6a146523bbb41b720313ed9901da(2022-06-12 2...,35.24,0,0,4,1,10.267532,23.493298,25.428293,32.478577,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
print(df.shape)

(5628, 128)


In [39]:
x = df.iloc[0, :]
list(x[4:])

['3',
 0,
 28.414896,
 30.73644026,
 32.39354523,
 35.00855734,
 58.12608633,
 37.40803977,
 4.272117085,
 29.71119033,
 0.0,
 0.0,
 0.001964637,
 0.023575639,
 0.974459725,
 0.131035666,
 0.755652218,
 0.113312116,
 22,
 1,
 3,
 1,
 1.19047619,
 2.0,
 3.0,
 2.5,
 3.0,
 3.5,
 2.9,
 2,
 2,
 '室內_住家或住宿',
 4,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [42]:
def next_record(df):
    for index, row in df.iterrows():
        filename = row['AudioFile_name']
        feel_score = row['Feel_score']
        feel_health = row['Feel_health']
        feature = [row[1]]+list(row[4:])
        yield feel_score, feel_health, filename, feature

In [43]:
# for catboost
feel_scores = []
feel_healths = []
filenames = []

features = []

print(len(df))
i = 0
data_generator = next_record(df)
for feel_score, feel_health, filename, feature in data_generator:
    feel_scores.append(feel_score)
    feel_healths.append(feel_health)
    filenames.append(filename)
    features.append(feature)
    i += 1
    if i%500==0:
        print(f"dataset prepared: {i}")
print(len(features[0]))

5628
dataset prepared: 500
dataset prepared: 1000
dataset prepared: 1500
dataset prepared: 2000
dataset prepared: 2500
dataset prepared: 3000
dataset prepared: 3500
dataset prepared: 4000
dataset prepared: 4500
dataset prepared: 5000
dataset prepared: 5500
125


In [44]:
# for catboost
filenames = np.asarray(filenames)
feel_scores = np.asarray(feel_scores)
feel_healths = np.asarray(feel_healths)

features = np.asarray(features)

In [47]:
# for catboost
print("Training data shape")
print(feel_scores.shape)
print(feel_healths.shape)
print(features.shape)

Training data shape
(5628,)
(5628,)
(5628, 125)


In [48]:
# train test split
test_size_percentage = 0.2
test_size = int(test_size_percentage * features.shape[0])
test_size

1125

In [50]:
# for catboost
cat_X_train = {'y_train_scores': feel_scores[:-test_size],
               'cat_train_features': features[:-test_size]}

cat_X_test = {'y_test_scores': feel_scores[-test_size:],
              'cat_test_features': features[-test_size:]}

In [51]:
# for catboost
with open('./dataset/' + 'linebot_train' + '_cat_data.pickle', 'wb') as handle:
    pickle.dump(cat_X_train, handle, protocol=4)
with open('./dataset/' + 'linebot_test'+ '_cat_data.pickle', 'wb') as handle:
    pickle.dump(cat_X_test, handle, protocol=4)