## Dacon Baseline Code
---

#### 기본 아이디어 (접근법)

- Feature로 V0000~V5120 을 사용합니다 (5121개).
- 모든 문자를 0으로 변환합니다.
- 모든 결측치를 0으로 채워넣습니다.
- 학습데이터로 0초 ~ 59초 (총 60개) 데이터만 사용합니다.

In [6]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader_2 import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)

from sklearn.ensemble import RandomForestClassifier
import joblib # 모델을 저장하고 불러오는 역할


In [7]:
# Locations of the competition data, relative to the working directory.
train_folder, test_folder = 'data/train/', 'data/test/'
train_label_path = 'data/train_label.csv'


In [8]:
# os.listdir returns entries in arbitrary, platform-dependent order. Sorting
# makes the row order of the loaded data (and therefore the seeded model fit)
# reproducible across machines and runs.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))
# The first CSV column is used as the index of the label table.
train_label = pd.read_csv(train_label_path, index_col=0)


In [9]:
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):
    """Load every entry of ``files`` with ``func`` and concatenate the results.

    ``func`` is called once per entry of ``files`` with that entry as its only
    positional argument; ``folder``, ``train_label``, ``event_time`` and
    ``nrows`` are forwarded to it unchanged as keyword arguments.

    Args:
        func: Per-file loader whose return values are accepted by
            ``pd.concat``.
        files: Iterable of file identifiers, handed to ``func`` one by one.
        folder: Forwarded to ``func``.
        train_label: Forwarded to ``func``.
        event_time: Forwarded to ``func``.
        nrows: Forwarded to ``func``.

    Returns:
        The ``pd.concat`` of all per-file results, in the order of ``files``.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``files`` is empty.
    """
    func_fixed = partial(
        func,
        folder=folder,
        train_label=train_label,
        event_time=event_time,
        nrows=nrows,
    )
    if __name__ == '__main__':
        # Fan the files out over one worker process per CPU. The context
        # manager releases the workers even when a loader raises; all results
        # are already collected by the time the pool is torn down.
        with Pool(processes=multiprocessing.cpu_count()) as pool:
            df_list = list(pool.imap(func_fixed, files))
    else:
        # Outside the main module no pool is created (the original guard is
        # kept); load serially instead of leaving ``df_list`` unbound.
        df_list = [func_fixed(name) for name in files]
    return pd.concat(df_list)

In [10]:
# Load the first 60 rows of every training file and combine them into one
# frame; the label table is passed through to the per-file loader.
train = data_loader_all_v2(
    data_loader_v2,
    train_list,
    folder=train_folder,
    train_label=train_label,
    event_time=10,
    nrows=60,
)

#### Split x, y
---

In [11]:
# Separate the target column from the feature columns.
y_train = train['label']
X_train = train.drop(columns=['label'])

#### PCA & Scaling
---

In [14]:
from sklearn.preprocessing import StandardScaler

# Learn per-feature mean and variance on the training features, then apply
# the standardisation to the same matrix.
scaler = StandardScaler().fit(X_train)
train_reg_x = scaler.transform(X_train)

In [15]:
from sklearn.decomposition import PCA

# Keep the first 1000 principal components.
pca = PCA(n_components=1000)

# PCA is sensitive to feature scale, so it is fitted on the standardised
# matrix produced by the scaler rather than on the raw features.
train_x_pca = pca.fit_transform(train_reg_x)

#### Train
---

In [16]:
# The forest is trained on the raw feature matrix X_train. train_x_pca is
# deliberately not reassigned here: it is not used for fitting, and
# overwriting it with the raw features would discard the PCA projection.
y_train = train['label']
model = RandomForestClassifier(random_state=0, verbose=1, n_jobs=-1)
model.fit(X_train, y_train)
# joblib.dump does not create missing directories, so make sure the target
# folder exists before saving.
os.makedirs('./model', exist_ok=True)
joblib.dump(model, './model/model_01_23.pkl')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.0s finished


['./model/model_01_23.pkl']

In [17]:
# Load the first 60 rows of every test file; no label table is available
# for the test set, so None is passed for it.
test = data_loader_all_v2(
    data_loader_v2,
    test_list,
    folder=test_folder,
    train_label=None,
    event_time=10,
    nrows=60,
)

In [18]:
# Optional: reload a saved model instead of using the one still in memory.
# NOTE(review): the training cell saves to './model/model_01_23.pkl', not
# 'model.pkl' — adjust the path before enabling this line.
# model = joblib.load('model.pkl') 
# Predict class probabilities for every row of the test frame.
pred = model.predict_proba(test)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.2s remaining:    0.7s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished


In [19]:
# Wrap the predicted probabilities in a frame; columns get default positional
# labels (0, 1, ...).
# NOTE(review): presumably these positions line up with the class ids the
# submission format expects — confirm against model.classes_.
submission = pd.DataFrame(data=pred)
# Re-use the test index so each prediction row carries the id of its source.
# NOTE(review): the index object may be shared with `test`; naming it below
# could then rename test.index as well — confirm this is acceptable.
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()
# Average the probabilities of all rows that share the same id.
submission = submission.groupby('id').mean()
submission.to_csv('submission.csv', index=True) # write the submission file