In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score, make_scorer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA

# from sklearn.feature_selection import SelectKBest, f_regression

In [2]:
# Seed NumPy's global random state; KFold further down is seeded separately via random_state=7.
np.random.seed(7)

# metrics

In [3]:
def mae(y_true, y_pred):
    """Mean absolute error over the entries whose true value is at least 0.1.

    Both arguments are converted to NumPy arrays and flattened, so any pair
    of equally sized array-likes is accepted. Positions where the true value
    is below 0.1 are left out of the average.

    Returns the mean of ``|y_true - y_pred|`` over the kept positions.
    """
    truth = np.array(y_true).ravel()
    guess = np.array(y_pred).ravel()

    # Only positions whose target reaches the 0.1 threshold contribute.
    keep = truth >= 0.1
    abs_err = np.abs(truth[keep] - guess[keep])

    return abs_err.mean()

def fscore(y_true, y_pred):
    """F1 score of the "value >= 0.1" indicator.

    Both arguments are converted to NumPy arrays and flattened. Positions
    whose true value is negative are dropped first (presumably negative
    targets encode missing values -- confirm against the data set). The
    remaining true and predicted values are binarised at 0.1 and passed to
    ``f1_score``.
    """
    truth = np.array(y_true).ravel()
    guess = np.array(y_pred).ravel()

    # Keep only positions with a non-negative target.
    valid = truth >= 0

    truth_bin = np.where(truth[valid] >= 0.1, 1, 0)
    guess_bin = np.where(guess[valid] >= 0.1, 1, 0)

    return f1_score(truth_bin, guess_bin)

def maeOverFscore(y_true, y_pred):
    """Return ``mae(y_true, y_pred)`` divided by ``fscore(y_true, y_pred)``.

    A small constant (1e-07) is added to the F-score so the division stays
    defined when the F-score is zero.
    """
    numerator = mae(y_true, y_pred)
    denominator = fscore(y_true, y_pred) + 1e-07
    return numerator / denominator

def score(y_val, pred):
    """Compute, print and return both evaluation metrics for one prediction.

    Parameters
    ----------
    y_val : array-like
        True target values.
    pred : array-like
        Predicted values, same number of elements as ``y_val``.

    Returns
    -------
    tuple
        ``(f_value, mae_value)`` where ``f_value`` is ``fscore(y_val, pred)``
        and ``mae_value`` equals ``maeOverFscore(y_val, pred)``.
    """
    f_value = fscore(y_val, pred)

    # Same formula as maeOverFscore (mae / (fscore + 1e-07)), but reusing the
    # F-score computed above instead of running f1_score a second time over
    # the full arrays. Keep the epsilon in sync with maeOverFscore.
    mae_value = mae(y_val, pred) / (f_value + 1e-07)

    print(f"fscore : {f_value} \t maeOverFscore : {mae_value}")

    return (f_value, mae_value)

# fscore_sklearn = make_scorer(fscore)
# maeOverFscore_sklearn = make_scorer(maeOverFscore)

# data load

In [4]:
# All paths are resolved relative to the directory the kernel was started in.
base = os.getcwd()
data_path = os.path.join(base, 'data')      # input .npy files and sample_submission.csv are read from here
submit_path = os.path.join(base, 'submit')  # submission CSVs are written here

def load(name):
    """Load ``<name>.npy`` from ``data_path`` and return the stored array."""
    npy_file = os.path.join(data_path, f"{name}.npy")
    return np.load(npy_file)

def reshape(data):
    """Collapse the leading axes of ``data`` into rows of a 2-D array.

    The result has ``data.shape[0] * 40 * 40`` rows and ``data.shape[-1]``
    columns, i.e. every leading entry is expected to hold 40 * 40 items of
    ``data.shape[-1]`` values each.
    """
    n_rows = data.shape[0] * 40 * 40
    n_cols = data.shape[-1]
    return data.reshape(n_rows, n_cols)

In [11]:
# Load data/EDA.npy and show its shape as a quick sanity check.
data = load('EDA')
print(data.shape)

(121608234, 15)


In [12]:
# Drop the reference to the EDA array before the 'train' array is loaded in the next cell.
del data

In [13]:
# Load data/train.npy; the recorded output shows shape (121608234, 15).
data = load('train')
print(data.shape)

(121608234, 15)


# Feature Selection

## select K
![img](feacture_selection.PNG)

## selectK 7
- [False, False,  True,  True,  True,  True,  True,  True, False, False, False, False, False, True]

## selectK 8
- [False, False,  True,  True,  True,  True,  True,  True, False, False, False, True, False, True]

In [17]:
# Column indices of the True entries in the "selectK 7" / "selectK 8" boolean masks listed above.
selectK_7 = [2, 3, 4, 5, 6, 7, 13]
selectK_8 = [2, 3, 4, 5, 6, 7, 11, 13]

# Separate Data

In [None]:
# Feature matrix: the selectK_8 columns. Indexing with a list of column
# indices makes NumPy build a new array, so X does not reference `data`.
X = data[:, selectK_8]

# Target: the last column. A plain `data[:, -1]` is only a view that keeps
# the whole `data` buffer alive, which would make the `del data` below free
# nothing. Copying the single column detaches Y so the full array can
# actually be released.
Y = data[:, -1].copy()

del data
print(X.shape, Y.shape)

In [None]:
# Baseline: Ridge fitted directly on the selected features (no scaling, no hold-out split).
clf = Ridge(alpha=10.0).fit(X, Y)

In [None]:
# Predict on the same rows the baseline was fitted on (training-set predictions).
pred = clf.predict(X)
# NOTE(review): the "Ridge + KFold" and "Train" cells further down read X again;
# after this `del` they fail on a top-to-bottom run unless X is rebuilt first.
del X

In [None]:
# Training-set metrics of the baseline (prints fscore and maeOverFscore).
score(Y, pred)
# NOTE(review): the "Ridge + KFold" and "Train" cells further down read Y again;
# after this `del` they fail on a top-to-bottom run unless Y is rebuilt first.
del Y, pred

# Ridge + KFold

In [14]:
# 4-fold shuffled cross-validation of RobustScaler + Ridge(alpha=10.0).
# NOTE(review): this cell reads X and Y, but the baseline cells above end with
# `del X` / `del Y, pred`; on a top-to-bottom run they must be rebuilt first.
kfold = KFold(n_splits=4, random_state=7, shuffle=True)
scores = list()  # collects one (fscore, maeOverFscore) tuple per fold

for (train_idx, val_idx) in kfold.split(Y):
    
    # Fresh model and scaler for every fold.
    clf = Ridge(alpha=10.0)
    scaler= RobustScaler()
    
    # The scaler is fitted on the training rows of this fold only.
    print("scaler fit")
    x = scaler.fit_transform(X[train_idx, :])

    print("clf fit")
    # `ridge` is not referenced again in this cell; `clf` is used for prediction.
    ridge = clf.fit(x, Y[train_idx])

    # Drop the scaled training copy before the scaled validation copy is built.
    del x
    print("scaler transform")
    x = scaler.transform(X[val_idx, :])

    print("clf predict")
    pred = clf.predict(x)
    # score() prints both metrics and returns them as a tuple.
    scores.append(score(Y[val_idx], pred))

    del x

scaler fit
clf fit
scaler transform
clf predict
fscore : 0.2749365314732829 	 maeOverFscore : 6.261598197378067
scaler fit
clf fit
scaler transform
clf predict
fscore : 0.2752106860272466 	 maeOverFscore : 6.2545566924636615
scaler fit
clf fit


  overwrite_a=True).T


scaler transform
clf predict
fscore : 0.27483218889350364 	 maeOverFscore : 6.215133163802377
scaler fit
clf fit
scaler transform
clf predict
fscore : 0.2744506712189898 	 maeOverFscore : 6.258121761417609


In [15]:
# Re-print the per-fold (fscore, maeOverFscore) pairs collected by the cross-validation loop.
for f_value, mae_value in scores:
    print(f"fscore : {f_value} \t maeOverFscore : {mae_value}")

fscore : 0.2749365314732829 	 maeOverFscore : 6.261598197378067
fscore : 0.2752106860272466 	 maeOverFscore : 6.2545566924636615
fscore : 0.27483218889350364 	 maeOverFscore : 6.215133163802377
fscore : 0.2744506712189898 	 maeOverFscore : 6.258121761417609


# Ridge alpha 12.5 SelectK 8
- fscore : 0.25464065823329085 	 maeOverFscore : 6.752701127612951
- fscore : 0.25458534451407944 	 maeOverFscore : 6.7528914220204515
- fscore : 0.2545837754161434 	 maeOverFscore : 6.702930965930638
- fscore : 0.25427823328070515 	 maeOverFscore : 6.747219104960767

# Ridge alpha 10.0 SelectK 8
- fscore : 0.25464065823329085 	 maeOverFscore : 6.752701129278926
- fscore : 0.25458534451407944 	 maeOverFscore : 6.75289142477857
- fscore : 0.2545837754161434 	 maeOverFscore : 6.70293096792134
- fscore : 0.25427826739578746 	 maeOverFscore : 6.747218201741558

# Ridge alpha 10.0 SelectK 8 StandardScaler
- fscore : 0.2546402018844707 	 maeOverFscore : 6.752709268799811
- fscore : 0.2545845515075978 	 maeOverFscore : 6.752907726649149
- fscore : 0.2545843704403055 	 maeOverFscore : 6.702911044140701
- fscore : 0.25427924623684933 	 maeOverFscore : 6.7471879931893675

# Ridge alpha 10.0 SelectK 8 RobustScaler
- fscore : 0.2749365314732829 	 maeOverFscore : 6.261598197378067
- fscore : 0.2752106860272466 	 maeOverFscore : 6.2545566924636615
- fscore : 0.27483218889350364 	 maeOverFscore : 6.215133163802377
- fscore : 0.2744506712189898 	 maeOverFscore : 6.258121761417609

# Ridge alpha 10.0 SelectK 8 PCA 2 dim
- fscore : 0.24107474262012732 	 maeOverFscore : 8.224389019645066
- fscore : 0.2410970325797045 	 maeOverFscore : 8.222939301354975
- fscore : 0.24133791359125045 	 maeOverFscore : 8.159670908854318
- fscore : 0.2408981513939592 	 maeOverFscore : 8.21528441818803

# Ridge alpha 10.0 SelectK 8 PCA 1 dim
- 0.16343634206703112 12.186417405981281
- 0.16353559251460106 12.177879886692459
- 0.16345101377437446 12.102961373810988
- 0.16322487488861562 12.179870422653915

# Train

In [16]:
# Final model: refit the scaler and Ridge on all of X / Y (no hold-out).
# `clf` and `scaler` are the objects later handed to submit().
clf = Ridge(alpha=10.0)
scaler= RobustScaler()

x = scaler.fit_transform(X)
clf.fit(x, Y)

Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

# Submit

In [10]:
def submit(clf, name, features=None, scaler=None):
    """Predict on the test set and write a submission CSV.

    Parameters
    ----------
    clf : fitted estimator
        Object exposing ``predict``; called on the prepared test matrix.
    name : str
        File stem of the output; the file is written to
        ``<submit_path>/<name>.csv``.
    features : sequence of int, optional
        Column indices to keep from the flattened test array. ``None`` keeps
        all columns.
    scaler : fitted transformer, optional
        Object exposing ``transform``; applied after column selection.
        ``None`` skips scaling.

    Side effects
    ------------
    Creates ``submit_path`` if it does not exist, prints ``transform`` when a
    scaler is applied, and writes the CSV file.
    """
    # Create the output folder up front: without it ``to_csv`` would fail only
    # after the whole test set has been loaded and predicted.
    os.makedirs(submit_path, exist_ok=True)

    x_test = load('test')
    x_test = reshape(x_test)

    if features is not None:
        x_test = x_test[:, features]

    if scaler is not None:
        x_test = scaler.transform(x_test)
        print("transform")

    pred = clf.predict(x_test)

    submission = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))
    # reshape() yields 40 * 40 = 1600 rows per test entry, so the flat
    # predictions are regrouped into rows of 1600 values and written into
    # every column after the first one of the sample submission.
    submission.iloc[:, 1:] = pred.reshape(-1, 1600)

    submission.to_csv(os.path.join(submit_path, f'{name}.csv'), index = False)

In [27]:
# Pass the same column subset (selectK_8) and the fitted scaler used for training,
# so the test matrix is prepared the same way as X.
submit(clf, 'Ridge_10_SelectK_8_Robust', selectK_8, scaler)

transform


- https://dacon.io/competitions/official/235591/mysubmission/
- D:\인공지능_공모전\github\submit