# Baseline

In [2]:
import os
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

In [3]:
def seed_everything(seed):
    """Seed Python's and NumPy's global random number generators.

    Args:
        seed: Integer seed passed to ``random.seed`` and ``np.random.seed``;
            its string form is also written to the ``PYTHONHASHSEED``
            environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [4]:
# Load the training and test tables from CSV files in the working directory.
train, test = pd.read_csv("./train.csv"), pd.read_csv("./test.csv")

In [5]:
# Display the training DataFrame (the rendered output is truncated to the first and last rows).
train

Unnamed: 0,ID,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration,genre
0,TRAIN_00000,0.3341,0.9231,10,0.8540,0.3797,0.0080,0.0002,0.7697,0.7902,0.5391,175,trap
1,TRAIN_00001,0.6344,0.5955,10,0.2826,0.1378,0.0000,0.7137,0.1597,0.4162,0.3043,513,techno
2,TRAIN_00002,0.3983,0.1107,8,0.2459,0.0247,0.7740,0.0000,0.5105,0.4291,0.2324,151,Trap Metal
3,TRAIN_00003,0.3139,0.9702,2,0.9252,0.9590,0.0371,0.3015,0.2539,0.0577,0.5406,246,trap
4,TRAIN_00004,0.0735,0.9452,5,0.7702,0.1397,0.2583,0.0015,0.1774,0.1128,0.3176,229,Dark Trap
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25378,TRAIN_25378,0.8814,0.3398,6,0.4602,0.3825,0.4505,0.0000,0.4222,0.8753,0.4081,88,Underground Rap
25379,TRAIN_25379,0.7505,0.5657,0,0.6682,0.1038,0.1281,0.0000,0.0596,0.9583,0.4273,123,Trap Metal
25380,TRAIN_25380,0.2520,0.7617,8,0.4928,0.0410,0.0003,0.8395,0.0843,0.1372,0.7828,353,dnb
25381,TRAIN_25381,0.6202,0.3455,0,0.6499,0.7960,0.4818,0.0000,0.2007,0.0194,0.9756,120,Dark Trap


In [6]:
# Display the test DataFrame; it has the same feature columns as train but no "genre" target.
test

Unnamed: 0,ID,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration
0,TEST_00000,0.8861,0.6821,2,0.7241,0.3103,0.1451,0.0000,0.5087,0.8863,0.9648,279
1,TEST_00001,0.8495,0.3883,1,0.4495,0.4661,0.6234,0.0004,0.1347,0.4062,0.9051,151
2,TEST_00002,0.2479,0.7549,3,0.7683,0.3131,0.5718,0.0000,0.1250,0.6357,0.3234,196
3,TEST_00003,0.6344,0.9804,1,0.6599,0.0931,0.0020,0.6016,0.8799,0.8790,0.2946,207
4,TEST_00004,0.6237,0.6926,6,0.6716,0.0269,0.0283,0.0001,0.1580,0.9288,0.3642,196
...,...,...,...,...,...,...,...,...,...,...,...,...
16917,TEST_16917,0.5461,0.8686,1,0.6351,0.0656,0.0010,0.8888,0.0047,0.8250,0.2672,161
16918,TEST_16918,0.8443,0.4194,4,0.5065,0.4315,0.1637,0.0241,0.0749,0.9482,0.9186,289
16919,TEST_16919,0.3373,0.9202,10,0.7272,0.4502,0.0791,0.0001,0.1881,0.0992,0.5953,212
16920,TEST_16920,0.4017,0.9741,9,0.6272,0.0768,0.0001,0.8718,0.6649,0.4463,0.4841,617


In [7]:
# Print column names, non-null counts and dtypes to check for missing values and string columns.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25383 entries, 0 to 25382
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                25383 non-null  object 
 1   danceability      25383 non-null  float64
 2   energy            25383 non-null  float64
 3   key               25383 non-null  int64  
 4   loudness          25383 non-null  float64
 5   speechiness       25383 non-null  float64
 6   acousticness      25383 non-null  float64
 7   instrumentalness  25383 non-null  float64
 8   liveness          25383 non-null  float64
 9   valence           25383 non-null  float64
 10  tempo             25383 non-null  float64
 11  duration          25383 non-null  int64  
 12  genre             25383 non-null  object 
dtypes: float64(9), int64(2), object(2)
memory usage: 2.5+ MB


In [8]:
# X holds the independent variables: drop the target ("genre") and the
# string identifier column ("ID").
X = train.drop(columns=["ID", "genre"])
# y is the dependent variable. Select it as a 1-D Series rather than a
# one-column DataFrame: scikit-learn estimators expect a 1-D target and emit a
# DataConversionWarning otherwise.
y = train["genre"]

# As with train, drop the string identifier column from the test features.
# `test` is rebound to its own result, so errors="ignore" keeps this cell safe
# to re-run (a second execution would otherwise raise KeyError for "ID").
test = test.drop(columns=["ID"], errors="ignore")

In [9]:
# Split into training and validation sets (80% / 20%).
# An explicit random_state pins the split, so re-running this cell or running
# cells out of order does not change it through NumPy's global RNG state.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# Baseline classifier: a random forest whose only explicit setting is random_state=42.
model = RandomForestClassifier(random_state = 42)

In [11]:
# Fit the model on the training split.
model.fit(X_train,y_train)

RandomForestClassifier(random_state=42)

In [12]:
val_pred = model.predict(X_valid)

# f1_score's signature is (y_true, y_pred), so the ground-truth labels go first.
# average="macro" takes the unweighted mean of the per-class F1 scores.
valid_f1 = f1_score(y_valid, val_pred, average="macro")
print("현재 Macro F1 Score의 검증 점수는 {}입니다.".format(valid_f1))

현재 Macro F1 Score의 검증 점수는 0.6261751859544912입니다.


In [13]:
# Run the fitted model on the test features to produce the final predictions.
pred = model.predict(test)

In [14]:
# Load the sample submission file and set its "genre" column to the predictions.
submission = pd.read_csv("./sample_submission.csv").assign(genre=pred)

In [15]:
submission

Unnamed: 0,ID,genre
0,TEST_00000,Hiphop
1,TEST_00001,Dark Trap
2,TEST_00002,Emo
3,TEST_00003,techhouse
4,TEST_00004,Pop
...,...,...
16917,TEST_16917,techhouse
16918,TEST_16918,Hiphop
16919,TEST_16919,hardstyle
16920,TEST_16920,psytrance


In [16]:
# Write the submission to CSV without the index column; download this file and submit it.
submission.to_csv("./baseline_submit.csv", index = False)