In [1]:
# Mount Google Drive at /gdrive; the data and submission paths used in this notebook live under it.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
!pip install catboost 

import pandas as pd
import random
import os
import numpy as np

from sklearn import preprocessing
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split

# Data location: directory on the mounted Drive from which train.csv, test.csv and
# sample_submission.csv are read.
data_path = "/gdrive/My Drive/Colab Notebooks/dacon/dna_classify/data"


class CFG:
    """Notebook-wide configuration constants."""
    # Seed handed to seed_everything() and used as random_state for train_test_split.
    SEED = 42


def seed_everything(seed):
    """Seed the global Python and NumPy random number generators.

    Also stores the string form of ``seed`` in ``os.environ['PYTHONHASHSEED']``.

    Args:
        seed: Integer seed applied to both ``random`` and ``numpy.random``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both global generators receive the very same seed value.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


# Fix the random seeds before anything else runs.
seed_everything(CFG.SEED)

# Read the training and test tables from the data directory.
train = pd.read_csv(f'{data_path}/train.csv')
test = pd.read_csv(f'{data_path}/test.csv')


def get_x_y(df):
    """Split a raw frame into model inputs and, when present, the target column.

    Args:
        df: DataFrame holding an 'id' column and, optionally, a 'class' column.

    Returns:
        ``(features, labels)`` when ``df`` has a 'class' column: the frame without
        'id' and 'class', plus the 'class' Series. Otherwise only the frame
        without 'id' (a single DataFrame, not a tuple).
    """
    has_target = 'class' in df.columns
    if not has_target:
        # Unlabelled frame: only the identifier has to go.
        return df.drop(columns=['id'])

    features = df.drop(columns=['id', 'class'])
    labels = df['class']
    return features, labels


# Separate model inputs from the target (train) and drop the id column (test).
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]  # 'SNP_01' .. 'SNP_15'

# A single encoder is shared by every SNP column, so gather the values of all of them.
# Test values are included too: LabelEncoder.transform raises ValueError for a label it
# was not fitted on, so fitting on train alone would abort the run whenever test holds an
# unseen value. If test adds no new values the resulting encoding is exactly the same.
snp_data = pd.concat([train_x[snp_col], test_x[snp_col]]).to_numpy().ravel().tolist()

# Encode the target labels as integers; class_le is needed later to map predictions back.
train_y = class_le.fit_transform(train_y)
snp_le.fit(snp_data)

# Replace every SNP column by its integer code in both frames.
for col in snp_col:
    train_x[col] = snp_le.transform(train_x[col])
    test_x[col] = snp_le.transform(test_x[col])

# Columns left out of the model inputs.
dropped_cols = ['father', 'mother', 'gender', 'SNP_03']
train_x = train_x.drop(columns=dropped_cols)
test_x = test_x.drop(columns=dropped_cols)

# Hold out 10% of the training rows as a validation set (seeded split).
train_data_x, valid_data_x, train_data_y, valid_data_y = train_test_split(
    train_x,
    train_y,
    test_size=0.1,
    random_state=CFG.SEED,
)

# Only SNP_10 is declared to CatBoost as a categorical feature.
cat_peature = ['SNP_10']

dtrain = Pool(train_data_x, label=train_data_y, cat_features=cat_peature)
dvalid = Pool(valid_data_x, label=valid_data_y, cat_features=cat_peature)

# Training hyper-parameters forwarded verbatim to CatBoostClassifier.
params = dict(
    learning_rate=0.8,
    depth=10,
    boosting_type='Plain',
    iterations=5000,
    task_type='GPU',
    max_bin=20000,
)

model = CatBoostClassifier(**params)
# The validation pool drives early stopping (patience of 1000 rounds) and best-model selection.
model.fit(
    dtrain,
    eval_set=dvalid,
    early_stopping_rounds=1000,
    use_best_model=True,
)

# CatBoost can return class predictions as a column vector of shape (n_samples, 1); the
# recorded run triggered scikit-learn's column_or_1d DataConversionWarning because of it.
# Flatten to 1-D so LabelEncoder.inverse_transform receives the shape it expects.
preds = np.ravel(model.predict(test_x))
print('Done.')

# The sample submission provides the output layout; only its 'class' column is overwritten.
submit = pd.read_csv(data_path+'/sample_submission.csv')

# Map the integer class codes back to the original class labels.
submit['class'] = class_le.inverse_transform(preds)

submit.to_csv('/gdrive/My Drive/Colab Notebooks/dacon/dna_classify/sub/2nd.csv', index=False)
print('Write result.')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1
0:	learn: 0.5072320	test: 0.6262967	best: 0.6262967 (0)	total: 66.1ms	remaining: 5m 30s
1:	learn: 0.3861382	test: 0.5314318	best: 0.5314318 (1)	total: 169ms	remaining: 7m 2s
2:	learn: 0.3107563	test: 0.4932302	best: 0.4932302 (2)	total: 272ms	remaining: 7m 33s
3:	learn: 0.2577241	test: 0.4737268	best: 0.4737268 (3)	total: 370ms	remaining: 7m 41s
4:	learn: 0.2140217	test: 0.4636583	best: 0.4636583 (4)	total: 440ms	remaining: 7m 19s
5:	learn: 0.1865607	test: 0.4462459	best: 0.4462459 (5)	total: 491ms	remaining: 6m 48s
6:	learn: 0.1618366	test: 0.4331582	best: 0.4331582 (6)	total: 539ms	remaining: 6m 24s
7:

  y = column_or_1d(y, warn=True)


In [4]:
# NOTE(review): repeats the to_csv call that ends the training cell (same path, same
# arguments); it only works while `submit` from that cell is still in kernel memory.
submit.to_csv('/gdrive/My Drive/Colab Notebooks/dacon/dna_classify/sub/2nd.csv', index=False)