# Import thư viện

In [73]:
import numpy as np 
import pandas as pd
from itables import show
from scipy.stats import skew,boxcox_normmax, zscore
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, FunctionTransformer, OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import itables.options as opt
import os
opt.maxBytes = 5000000

# Nạp dữ liệu

In [74]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Tiền xử lý dữ liệu

## Drop các đặc trưng không cần thiết trong mô hình

In [75]:
train.drop(['Id', 'Artist Name', 'Track Name'], axis=1, inplace=True)
test.drop(['Id', 'Artist Name', 'Track Name'], axis=1, inplace=True)

## Xử lý dữ liệu thiếu:

In [76]:
train.columns

Index(['Popularity', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_in min/ms', 'time_signature', 'Class'],
      dtype='object')

In [77]:
missing_feats = ['Popularity','key','instrumentalness']

for feat in missing_feats:
    global_median = train[feat].median()
    train[feat] = train[feat].fillna(global_median)
    test[feat] = test[feat].fillna(global_median)

In [78]:
train.isna().sum()

Popularity            0
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
duration_in min/ms    0
time_signature        0
Class                 0
dtype: int64

In [79]:
test.isna().sum()

Popularity            0
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
duration_in min/ms    0
time_signature        0
dtype: int64

# Feature Engineering

In [80]:
#StandardScaler
StandardScaler_feats = ['Popularity','danceability','valence',
                        'energy','speechiness','acousticness',
                        'instrumentalness','liveness']

#Log transformer
log_feats = ['speechiness','acousticness',
            'instrumentalness','liveness']

#One-hot
Cat_feats = ['key','mode','time_signature']

In [81]:
# =============== Dữ liệu train ===============
#Log transform
train[log_feats] = np.log1p(train[log_feats])

#StandardScaler
train[StandardScaler_feats] = StandardScaler().fit_transform(train[StandardScaler_feats])

#Power Transformer
#Nó tự động tìm ra lambda tốt nhất:
#Cần log thì log
#Cần bình phương thì sẽ bình phương
#Xử lý cả âm
train['loudness'] = PowerTransformer(method='yeo-johnson').fit_transform(train[['loudness']])

# =============== Dữ liệu test ===============
test[log_feats] = np.log1p(test[log_feats])
test[StandardScaler_feats] = StandardScaler().fit_transform(test[StandardScaler_feats])
test['loudness'] = PowerTransformer(method='yeo-johnson').fit_transform(test[['loudness']])

**Gộp train và test để cả 2 đều động bộ**

In [82]:
target = train['Class']
df = pd.concat([train.drop('Class',axis=1),test])
print("\n train",train.shape)
print("\n test",test.shape)
print("\nall data",df.shape)


 train (14396, 15)

 test (3600, 14)

all data (17996, 14)


In [83]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_feats = encoder.fit_transform(df[Cat_feats])
feature_names = encoder.get_feature_names_out(Cat_feats)
df_encoded = pd.DataFrame(encoded_feats, columns=feature_names, index=df.index)
df = df.drop(columns=Cat_feats).join(df_encoded)

In [84]:
df.shape

(25196, 28)

In [85]:
show(df)

0
Loading ITables v2.5.2 from the internet...  (need help?)


# Lưu lại dữ liệu

In [86]:
exp_dir = "../exps"
if os.path.exists(exp_dir) == False:
    os.makedirs(exp_dir,exist_ok=True)

save_dir = f"{exp_dir}/Preproccessed"
os.makedirs(save_dir,exist_ok=True)

In [87]:
train = df.iloc[:14396,:]
test = df.iloc[14396:,:]

In [88]:
train.to_csv(f'{save_dir}/exp1_train.csv',index=False)
test.to_csv(f'{save_dir}/exp1_test.csv',index=False)