# Import thư viện

In [19]:
import numpy as np 
import pandas as pd
import re
from itables import show
from scipy.stats import skew,boxcox_normmax, zscore
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, FunctionTransformer, OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import itables.options as opt
import os
# NOTE(review): presumably raises the itables size limit (in bytes) so the
# frames passed to show() later are not truncated - confirm against itables docs.
opt.maxBytes = 5000000

# Nạp dữ liệu

In [20]:
# Read the raw train/test splits from the data directory next to the notebook folder.
train_csv, test_csv = '../data/train.csv', '../data/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Tiền xử lý dữ liệu

## Drop các đặc trưng không cần thiết trong mô hình

In [21]:
# Columns removed from both splits before modelling.
unused_cols = ['Id', 'Artist Name', 'Track Name']

# Reassign instead of mutating in place; the resulting frames are the same.
train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)

## Xử lý dữ liệu thiếu:

In [22]:
# Display the columns left in the training frame after the drop above.
train.columns

Index(['Popularity', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_in min/ms', 'time_signature', 'Class'],
      dtype='object')

In [23]:
# Columns that are imputed with a median.
missing_feats = ['Popularity','key','instrumentalness']

# Medians come from the training split only and are reused for the test split,
# so no statistic is computed from test data.
train_medians = {col: train[col].median() for col in missing_feats}

for col, fill_value in train_medians.items():
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

In [24]:
# Count remaining missing values per column in the training frame.
train.isna().sum()

Popularity            0
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
duration_in min/ms    0
time_signature        0
Class                 0
dtype: int64

In [25]:
# Count remaining missing values per column in the test frame.
test.isna().sum()

Popularity            0
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
duration_in min/ms    0
time_signature        0
dtype: int64

# Feature Engineering

In [26]:
# Features that receive a log1p transform (they are standard-scaled afterwards too).
log_feats = [
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
]

# Features passed to StandardScaler: the plain numeric ones followed by the
# log-transformed ones, in that order.
StandardScaler_feats = [
    'Popularity',
    'tempo',
    'danceability',
    'valence',
    'energy',
    *log_feats,
]

# One-hot encoding candidates (currently disabled):
# Cat_feats = ['key','mode','time_signature']

In [27]:
# Create the transformers; every one of them is fitted on the training split only.
scaler_std = StandardScaler()
# Yeo-Johnson estimates its transform parameter (lambda) from the data and
# also accepts negative inputs, which is why it is used for 'loudness'.
pt_loudness = PowerTransformer(method='yeo-johnson')
scaler_robust = RobustScaler()

# (transformer, columns) pairs, applied in this order after the log1p step.
scaling_steps = [
    (scaler_std, StandardScaler_feats),
    (pt_loudness, ['loudness']),
    (scaler_robust, ['duration_in min/ms']),
]


def _scale_split(frame, fit):
    """Apply log1p and then every scaling step to `frame`, overwriting its columns.

    Args:
        frame: DataFrame containing the columns named in `log_feats` and
            `scaling_steps`; it is modified in place.
        fit: when True the transformers are fitted on `frame` (training split);
            when False the already-fitted transformers are only applied.
    """
    frame[log_feats] = np.log1p(frame[log_feats])
    for transformer, cols in scaling_steps:
        run = transformer.fit_transform if fit else transformer.transform
        frame[cols] = run(frame[cols])


# NOTE: the columns of `train`/`test` are overwritten, so running this cell a
# second time would transform values that were already transformed.

# =============== Train split: fit + transform ===============
_scale_split(train, fit=True)

# =============== Test split: reuse what was fitted on train ===============
_scale_split(test, fit=False)

## Concat các file embedding của Artist Name đã tạo:

In [28]:
# Load the precomputed LSTM embedding tables (one per split).
df_train_emb = pd.read_csv('../exps/Preproccessed/train_embeddings_lstm.csv')
df_test_emb = pd.read_csv("../exps/Preproccessed/test_embeddings_lstm.csv")

# pd.concat(axis=1) aligns rows on the index and outer-joins by default, so an
# embeddings file with a different row count or index would silently yield
# NaN-padded rows. Fail loudly instead.
for split_name, features, embeddings in (
    ('train', train, df_train_emb),
    ('test', test, df_test_emb),
):
    if not features.index.equals(embeddings.index):
        raise ValueError(
            f"{split_name}: feature rows ({len(features)}) and embedding rows "
            f"({len(embeddings)}) do not share the same index; "
            "column-wise concat would misalign them"
        )

# Append the embedding columns to the right of the preprocessed features.
train_final = pd.concat([train, df_train_emb], axis=1)
test_final = pd.concat([test, df_test_emb], axis=1)

# Render both frames as interactive itables tables.
show(train_final)
show(test_final)

0
Loading ITables v2.5.2 from the internet...  (need help?)


0
Loading ITables v2.5.2 from the internet...  (need help?)


# Lưu lại dữ liệu

In [29]:
# Root folder for experiment artefacts and the sub-folder for preprocessed data.
exp_dir = "../exps"
save_dir = f"{exp_dir}/Preproccessed"

# makedirs creates missing parent folders (including exp_dir) and, with
# exist_ok=True, does nothing when the folder is already there.
os.makedirs(save_dir, exist_ok=True)

In [30]:
# Destination path -> frame to write; the index is not saved.
outputs = {
    f'{save_dir}/exp2_trainWithName.csv': train_final,
    f'{save_dir}/exp2_testWithName.csv': test_final,
}
for csv_path, frame in outputs.items():
    frame.to_csv(csv_path, index=False)