# Classification 01_25_2nd

In [1]:
import os
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [3]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [4]:
train

Unnamed: 0,ID,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration,genre
0,TRAIN_00000,0.3341,0.9231,10,0.8540,0.3797,0.0080,0.0002,0.7697,0.7902,0.5391,175,trap
1,TRAIN_00001,0.6344,0.5955,10,0.2826,0.1378,0.0000,0.7137,0.1597,0.4162,0.3043,513,techno
2,TRAIN_00002,0.3983,0.1107,8,0.2459,0.0247,0.7740,0.0000,0.5105,0.4291,0.2324,151,Trap Metal
3,TRAIN_00003,0.3139,0.9702,2,0.9252,0.9590,0.0371,0.3015,0.2539,0.0577,0.5406,246,trap
4,TRAIN_00004,0.0735,0.9452,5,0.7702,0.1397,0.2583,0.0015,0.1774,0.1128,0.3176,229,Dark Trap
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25378,TRAIN_25378,0.8814,0.3398,6,0.4602,0.3825,0.4505,0.0000,0.4222,0.8753,0.4081,88,Underground Rap
25379,TRAIN_25379,0.7505,0.5657,0,0.6682,0.1038,0.1281,0.0000,0.0596,0.9583,0.4273,123,Trap Metal
25380,TRAIN_25380,0.2520,0.7617,8,0.4928,0.0410,0.0003,0.8395,0.0843,0.1372,0.7828,353,dnb
25381,TRAIN_25381,0.6202,0.3455,0,0.6499,0.7960,0.4818,0.0000,0.2007,0.0194,0.9756,120,Dark Trap


In [5]:
train.info() # Non-Null

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25383 entries, 0 to 25382
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                25383 non-null  object 
 1   danceability      25383 non-null  float64
 2   energy            25383 non-null  float64
 3   key               25383 non-null  int64  
 4   loudness          25383 non-null  float64
 5   speechiness       25383 non-null  float64
 6   acousticness      25383 non-null  float64
 7   instrumentalness  25383 non-null  float64
 8   liveness          25383 non-null  float64
 9   valence           25383 non-null  float64
 10  tempo             25383 non-null  float64
 11  duration          25383 non-null  int64  
 12  genre             25383 non-null  object 
dtypes: float64(9), int64(2), object(2)
memory usage: 2.5+ MB


In [6]:
# X는 독립변수이므로 종속변수를 제거합니다. 또한 target 이외의 문자열 데이터를 제거합니다.
X = train.drop(["ID", "genre"], axis = 1)
# y는 종속변수로 값을 설정합니다.
y = train['genre']

# train에서와 마찬가지로 문자열이 포함된 특성은 제거합니다.
test = test.drop(["ID"], axis = 1)

In [8]:
X.columns

Index(['danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration'],
      dtype='object')

In [9]:
# 평균, 표준편차, 중앙값 New Feature
X['mean'] = X.mean(axis = 1)
X['std'] = X.std(axis = 1)
X['Median'] = X.median(axis = 1)

test['mean'] = test.mean(axis = 1)
test['std'] = test.std(axis = 1)
test['Median'] = test.median(axis = 1)

X

Unnamed: 0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration,mean,std,Median
0,0.3341,0.9231,10,0.8540,0.3797,0.0080,0.0002,0.7697,0.7902,0.5391,175,17.236191,49.963983,0.7902
1,0.6344,0.5955,10,0.2826,0.1378,0.0000,0.7137,0.1597,0.4162,0.3043,513,47.840382,147.122377,0.5955
2,0.3983,0.1107,8,0.2459,0.0247,0.7740,0.0000,0.5105,0.4291,0.2324,151,14.702327,43.157836,0.4291
3,0.3139,0.9702,2,0.9252,0.9590,0.0371,0.3015,0.2539,0.0577,0.5406,246,22.941736,70.539294,0.9252
4,0.0735,0.9452,5,0.7702,0.1397,0.2583,0.0015,0.1774,0.1128,0.3176,229,21.526927,65.623048,0.3176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25378,0.8814,0.3398,6,0.4602,0.3825,0.4505,0.0000,0.4222,0.8753,0.4081,88,8.929091,25.055482,0.4602
25379,0.7505,0.5657,0,0.6682,0.1038,0.1281,0.0000,0.0596,0.9583,0.4273,123,11.514682,35.256198,0.5657
25380,0.2520,0.7617,8,0.4928,0.0410,0.0003,0.8395,0.0843,0.1372,0.7828,353,33.126509,101.176807,0.7617
25381,0.6202,0.3455,0,0.6499,0.7960,0.4818,0.0000,0.2007,0.0194,0.9756,120,11.280827,34.381492,0.6202


In [10]:
# 각 칼럼에 시간(Duration , 음악 재생 시간) 나눈 값 만들기
X['danceability / duration '] = X.danceability / X.duration
X['energy / duration '] = X.energy / X.duration
X['loudness / duration'] = X.loudness / X.duration
X['speechiness / duration'] = X.speechiness / X.duration
X['acousticness / duration'] = X.acousticness / X.duration
X['liveness / duration'] = X.liveness / X.duration
X['valence / duration'] = X.valence / X.duration
X['tempo / duration'] = X.tempo / X.duration

In [11]:
# Test에도 똑같이 해주기
test['danceability / duration '] = test.danceability / test.duration
test['energy / duration '] = test.energy / test.duration
test['loudness / duration'] = test.loudness / test.duration
test['speechiness / duration'] = test.speechiness / test.duration
test['acousticness / duration'] = test.acousticness / test.duration
test['liveness / duration'] = test.liveness / test.duration
test['valence / duration'] = test.valence / test.duration
test['tempo / duration'] = test.tempo / test.duration

In [13]:
X

Unnamed: 0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,std,Median,danceability / duration,energy / duration,loudness / duration,speechiness / duration,acousticness / duration,liveness / duration,valence / duration,tempo / duration
0,0.3341,0.9231,10,0.8540,0.3797,0.0080,0.0002,0.7697,0.7902,0.5391,...,49.963983,0.7902,0.001909,0.005275,0.004880,0.002170,4.571429e-05,0.004398,0.004515,0.003081
1,0.6344,0.5955,10,0.2826,0.1378,0.0000,0.7137,0.1597,0.4162,0.3043,...,147.122377,0.5955,0.001237,0.001161,0.000551,0.000269,0.000000e+00,0.000311,0.000811,0.000593
2,0.3983,0.1107,8,0.2459,0.0247,0.7740,0.0000,0.5105,0.4291,0.2324,...,43.157836,0.4291,0.002638,0.000733,0.001628,0.000164,5.125828e-03,0.003381,0.002842,0.001539
3,0.3139,0.9702,2,0.9252,0.9590,0.0371,0.3015,0.2539,0.0577,0.5406,...,70.539294,0.9252,0.001276,0.003944,0.003761,0.003898,1.508130e-04,0.001032,0.000235,0.002198
4,0.0735,0.9452,5,0.7702,0.1397,0.2583,0.0015,0.1774,0.1128,0.3176,...,65.623048,0.3176,0.000321,0.004128,0.003363,0.000610,1.127948e-03,0.000775,0.000493,0.001387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25378,0.8814,0.3398,6,0.4602,0.3825,0.4505,0.0000,0.4222,0.8753,0.4081,...,25.055482,0.4602,0.010016,0.003861,0.005230,0.004347,5.119318e-03,0.004798,0.009947,0.004638
25379,0.7505,0.5657,0,0.6682,0.1038,0.1281,0.0000,0.0596,0.9583,0.4273,...,35.256198,0.5657,0.006102,0.004599,0.005433,0.000844,1.041463e-03,0.000485,0.007791,0.003474
25380,0.2520,0.7617,8,0.4928,0.0410,0.0003,0.8395,0.0843,0.1372,0.7828,...,101.176807,0.7617,0.000714,0.002158,0.001396,0.000116,8.498584e-07,0.000239,0.000389,0.002218
25381,0.6202,0.3455,0,0.6499,0.7960,0.4818,0.0000,0.2007,0.0194,0.9756,...,34.381492,0.6202,0.005168,0.002879,0.005416,0.006633,4.015000e-03,0.001672,0.000162,0.008130


In [14]:
y

0                   trap
1                 techno
2             Trap Metal
3                   trap
4              Dark Trap
              ...       
25378    Underground Rap
25379         Trap Metal
25380                dnb
25381          Dark Trap
25382                Rap
Name: genre, Length: 25383, dtype: object

# Modeling

In [15]:
import os

from supervised.automl import AutoML

In [16]:
automl = AutoML(mode="Compete", eval_metric='f1')

In [17]:
automl.fit(X, y)

Linear algorithm was disabled.
AutoML directory: AutoML_7
The task is multiclass_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.404884 trained in 1.43 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree f1 0.399244 trained in 9.86 seconds
2_DecisionTree f1 0.471969 trained in 10.77 seconds
3_DecisionTree f1 0.4719

In [18]:
y_pred = automl.predict(test)

In [19]:
y_pred

array(['Hiphop', 'Underground Rap', 'Emo', ..., 'hardstyle', 'psytrance',
       'Underground Rap'], dtype=object)

# Submission

In [20]:
# 제출 파일을 불러옵니다.
submission = pd.read_csv("./sample_submission.csv")
submission["genre"] = y_pred

In [21]:
submission

Unnamed: 0,ID,genre
0,TEST_00000,Hiphop
1,TEST_00001,Underground Rap
2,TEST_00002,Emo
3,TEST_00003,techhouse
4,TEST_00004,Pop
...,...,...
16917,TEST_16917,techhouse
16918,TEST_16918,Hiphop
16919,TEST_16919,hardstyle
16920,TEST_16920,psytrance


In [22]:
# submission
submission.to_csv("./AutoML_Before_normalize_duration_meanstdmedian.csv", index = False)