In [1]:
!pip install catboost
!pip install wandb



In [2]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# from sklearn.linear_model import SGDOneClassSVM
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import wandb
# Start a Weights & Biases run under the "DACON_235877" project; this executes
# as soon as the cell runs.
wandb.init(project="DACON_235877")

# Number of K-fold splits passed to the stacking helper further down.
cv=10
# Seed used by set_seeds() and as random_state for the estimators and KFold below.
seed=1011

def set_seeds(seed=seed):
    """Seed the global random number generators used in this notebook.

    Writes the seed into the PYTHONHASHSEED environment variable, then seeds
    Python's `random` module and NumPy's legacy global generator.

    Args:
        seed: Seed value; defaults to the module-level `seed` as it was bound
            when this function was defined.
    """
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

set_seeds()

# Keep the dataset directory in one place so it only has to be edited once;
# the joined paths are identical to the previously hard-coded ones.
DATA_DIR = "/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/data"

train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# Last expression of the cell: preview the first rows of the training frame.
train.head()

[34m[1mwandb[0m: Currently logged in as: [33mgnoeyheat[0m (use `wandb login --relogin` to force relogin)


Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [3]:
# Print column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1253 non-null   int64  
 1   Gender          1253 non-null   object 
 2   Lenght          1253 non-null   float64
 3   Diameter        1253 non-null   float64
 4   Height          1253 non-null   float64
 5   Whole Weight    1253 non-null   float64
 6   Shucked Weight  1253 non-null   float64
 7   Viscra Weight   1253 non-null   float64
 8   Shell Weight    1253 non-null   float64
 9   Target          1253 non-null   int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 98.0+ KB


In [4]:
# ## Valid

# X = train.drop(["id", "Target"], axis=1)
# y = train.Target
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

## Test

# Features are every column except the row id and the label.
X_train = train.drop(["id", "Target"], axis=1)
y_train = train.Target
X_test = test.drop(["id"], axis=1)

# One-hot encode each frame, then force the test frame onto the training columns.
# Encoding the frames independently can yield different columns when a category
# is present in only one of them; reindexing drops unseen test-only columns and
# fills train-only ones with 0 so the scaler and models always see the same layout.
X_train=pd.get_dummies(X_train, drop_first=True)
X_test=pd.get_dummies(X_test, drop_first=True)
X_test=X_test.reindex(columns=X_train.columns, fill_value=0)

## Scaler

# Fit the scaler on the training features only and reuse it for the test features.
scaler=StandardScaler().fit(X_train)
# scaler=MinMaxScaler().fit(X_train)

X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

X_train.shape, X_test.shape

((1253, 9), (2924, 9))

* Outlier Detection Feature Extraction

In [5]:
# Three unsupervised outlier detectors; OD_feature() below turns their outputs
# into extra feature columns.
m1=OneClassSVM()
m2=IsolationForest(n_jobs=-1, random_state=seed)
# novelty=True is required for calling decision_function/predict after fit.
# NOTE(review): scikit-learn's docs say that in novelty mode these methods are
# intended for unseen data, not the training set — confirm the train-side
# scores produced later are acceptable.
m3=LocalOutlierFactor(novelty=True, n_jobs=-1)

def OD_feature(model, X_train, y_train, X_test, type="soft", scale=None):
    """Fit an outlier detector on the training features and return its outputs as features.

    Args:
        model: Unfitted detector exposing ``fit`` plus ``decision_function``
            (for ``type="soft"``) or ``predict`` (for ``type="hard"``).
        X_train: Training features; the detector is fitted on these.
        y_train: Unused; kept so existing call sites keep working.
        X_test: Test features scored with the fitted detector.
        type: ``"soft"`` to use ``decision_function`` scores, ``"hard"`` to use
            ``predict`` labels.
        scale: ``None`` for no scaling, ``"standard"`` for ``StandardScaler`` or
            ``"minmax"`` for ``MinMaxScaler``. The scaler is fitted on the
            training feature only.

    Returns:
        ``(train_feature, test_feature)``. Both are single-column
        ``pandas.DataFrame`` objects when ``scale`` is ``None``; otherwise they
        are whatever the scaler's ``transform`` returns.

    Raises:
        ValueError: If ``type`` or ``scale`` is not one of the supported values.
    """
    # Reject bad options before the (possibly slow) fit instead of failing later
    # with an unbound local, or silently skipping the scaling step on a typo.
    if type not in ("soft", "hard"):
        raise ValueError(f"type must be 'soft' or 'hard', got {type!r}")
    if scale not in (None, "standard", "minmax"):
        raise ValueError(f"scale must be None, 'standard' or 'minmax', got {scale!r}")

    clf = model.fit(X_train)

    score = clf.decision_function if type == "soft" else clf.predict
    train_feature = pd.DataFrame(score(X_train))
    test_feature = pd.DataFrame(score(X_test))

    if scale is not None:
        scaler_cls = StandardScaler if scale == "standard" else MinMaxScaler
        scaler = scaler_cls().fit(train_feature)
        train_feature = scaler.transform(train_feature)
        test_feature = scaler.transform(test_feature)

    return train_feature, test_feature

In [6]:
# Outlier Detection Feature Extraction - soft
# Score every sample with each detector's decision_function; each score column
# is standardised with a scaler fitted on its training values.
od_pairs = [
    OD_feature(detector, X_train, y_train, X_test, type="soft", scale="standard")
    for detector in (m1, m2, m3)
]
(train_v1, test_v1), (train_v2, test_v2), (train_v3, test_v3) = od_pairs

# Feature Engineering
# Append the three score columns to the right of the existing feature matrices.
X_train=np.hstack((X_train, train_v1, train_v2, train_v3))
X_test=np.hstack((X_test, test_v1, test_v2, test_v3))
X_train.shape, X_test.shape

In [7]:
# # Outlier Detection Feature Extraction - hard
# train_v1, test_v1 = OD_feature(m1, X_train, y_train, X_test, type="hard", scale=None)
# train_v2, test_v2 = OD_feature(m2, X_train, y_train, X_test, type="hard", scale=None)
# train_v3, test_v3 = OD_feature(m3, X_train, y_train, X_test, type="hard", scale=None)

# # Feature Engineering
# X_train=np.concatenate((X_train, train_v1, train_v2, train_v3), axis=1)
# X_test=np.concatenate((X_test, test_v1, test_v2, test_v3), axis=1)
# X_train.shape, X_test.shape

In [8]:
# Base learners for the stacking ensemble. `seed` is passed wherever the
# estimator accepts a random_state; n_jobs=-1 is set where it is accepted here.

# Linear models
estimator1 = LinearRegression(n_jobs=-1)
estimator2 = Ridge(random_state=seed)
estimator3 = Lasso(random_state=seed)
estimator4 = ElasticNet(random_state=seed)
# Neighbour- and kernel-based models
estimator5 = KNeighborsRegressor(n_jobs=-1)
estimator6 = SVR()
# Tree ensembles (bagging style)
estimator7 = RandomForestRegressor(n_jobs=-1, random_state=seed)
estimator8 = ExtraTreesRegressor(n_jobs=-1, random_state=seed)
# Boosting models
estimator9 = AdaBoostRegressor(random_state=seed)
estimator10 = GradientBoostingRegressor(random_state=seed)
estimator11 = HistGradientBoostingRegressor(random_state=seed)
estimator12 = XGBRegressor(n_jobs=-1, random_state=seed)
estimator13 = LGBMRegressor(n_jobs=-1, random_state=seed)
estimator14 = CatBoostRegressor(verbose=False, random_state=seed)
# Neural network
estimator15 = MLPRegressor(random_state=seed)

In [9]:
def get_stacking_ml_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    """Build stacking features for one base model with shuffled K-fold CV.

    For every fold the model is refitted on the fold's training rows; its
    predictions on the held-out rows fill the out-of-fold training feature, and
    its predictions on the full test set are stored per fold and averaged.

    Args:
        model: Regressor exposing ``fit(X, y)`` and ``predict(X)``; it is
            refitted in place once per fold.
        X_train_n: Training features, indexable by row with an integer array.
        y_train_n: Training targets aligned with ``X_train_n``.
        X_test_n: Test features.
        n_folds: Number of K-fold splits.

    Returns:
        ``(train_fold_pred, test_pred_mean)``: arrays of shape ``(n_train, 1)``
        and ``(n_test, 1)``.
    """
    # `obj[int_array]` on a pandas object is label/column based, not positional,
    # so convert pandas inputs to NumPy before indexing with fold positions.
    X_train_n, y_train_n, X_test_n = (
        data.to_numpy() if hasattr(data, "to_numpy") else data
        for data in (X_train_n, y_train_n, X_test_n)
    )

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))

    for fold_idx, (train_index, valid_index) in enumerate(kf.split(X_train_n, y_train_n)):
        model.fit(X_train_n[train_index], y_train_n[train_index])

        # Normalise prediction shapes so both (n,) and (n, 1) outputs fit the buffers.
        valid_pred = np.asarray(model.predict(X_train_n[valid_index]))
        train_fold_pred[valid_index, :] = valid_pred.reshape(-1, 1)
        test_pred[:, fold_idx] = np.asarray(model.predict(X_test_n)).reshape(-1)

    # Average the per-fold test predictions into a single column.
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [10]:
%%time

# Base learners, in the column order of the resulting meta-feature matrices.
base_ml = [
    estimator1, estimator2, estimator3, estimator4, estimator5,
    estimator6, estimator7, estimator8, estimator9, estimator10,
    estimator11, estimator12, estimator13, estimator14, estimator15,
]

# One (out-of-fold train column, averaged test column) pair per base learner.
stacked_pairs = [
    get_stacking_ml_datasets(base_model, X_train, y_train.values, X_test, cv)
    for base_model in base_ml
]

# Place the per-model columns side by side: one column per base learner.
meta_ml_X_train = np.hstack([train_col for train_col, _ in stacked_pairs])
meta_ml_X_test = np.hstack([test_col for _, test_col in stacked_pairs])

meta_ml_X_train.shape, meta_ml_X_test.shape





CPU times: user 1min 23s, sys: 17.4 s, total: 1min 40s
Wall time: 1min 12s




In [11]:
# # Outlier Detection Feature Extraction - stacking
# train_v1, test_v1 = OD_feature(m1, meta_ml_X_train, y_train, meta_ml_X_test, type="soft", scale="minmax")
# train_v2, test_v2 = OD_feature(m2, meta_ml_X_train, y_train, meta_ml_X_test, type="soft", scale="minmax")
# train_v3, test_v3 = OD_feature(m3, meta_ml_X_train, y_train, meta_ml_X_test, type="soft", scale="minmax")

# # Feature Engineering
# meta_ml_X_train=np.concatenate((meta_ml_X_train, train_v1, train_v2, train_v3), axis=1)
# meta_ml_X_test=np.concatenate((meta_ml_X_test, test_v1, test_v2, test_v3), axis=1)
# meta_ml_X_train.shape, meta_ml_X_test.shape

In [12]:
# Meta-learner: a linear regression over the base models' stacked predictions.
meta_clf=LinearRegression()
meta_clf.fit(meta_ml_X_train, y_train)
submission=meta_clf.predict(meta_ml_X_test)

# ## Valid

# def NMAE(true, pred):
#     target_idx = np.where(true!=0)
#     true = true.values[target_idx]
#     pred = pred[target_idx].round()
#     score = np.mean(np.abs(true-pred)/true)
#     return score

# nmae=NMAE(y_test, submission)
# print(nmae)

# wandb.log({'NMAE': nmae, 'cv': cv, 'seed': seed})

## Test

sample_submission = pd.read_csv("/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/data/sample_submission.csv")
# Assign with [] rather than attribute access: attribute assignment only sets a
# column when it already exists and otherwise just attaches a Python attribute,
# which would leave the written CSV without the predictions.
sample_submission["Target"] = submission.round()
sample_submission.to_csv("/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/submission.csv",index=False)