# Categorical Feature Encoding Challenge

## Config

In [1]:
# Basic packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rd # generating random numbers
import datetime # manipulating date formats
from dateutil.relativedelta import relativedelta

# Viz
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns # for prettier plots
import japanize_matplotlib

# settings
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings - confirm that is intended.
warnings.filterwarnings("ignore")
# None removes the cap on how many columns pandas displays.
pd.set_option('display.max_columns',None)

# # logging
# import logging
# consoleHandler = logging.StreamHandler()
# consoleHandler.setLevel(logging.WARNING)

In [2]:
class CFG:
    """Notebook-wide configuration: data directories and the random seed."""

    # Directory the input CSV files are read from.
    data_input_path = '../data/input/'
    # Directory the submission CSV is written to.
    data_output_path = '../data/output/'
    # Seed intended for stochastic steps.
    random_state = 0
    # Original (misspelled) attribute name, kept as an alias so that any
    # existing reference to it keeps working.
    randome_state = random_state

## Read

In [3]:
# Load the three input CSV files from the configured input directory.
df_train, df_test, df_sub = (
    pd.read_csv(CFG.data_input_path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Keep untouched copies of the frames exactly as they were read.
df_train_raw, df_test_raw, df_sub_raw = (
    df_train.copy(),
    df_test.copy(),
    df_sub.copy(),
)

In [4]:
# Summarise the training frame: columns, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 25 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      300000 non-null  int64 
 1   bin_0   300000 non-null  int64 
 2   bin_1   300000 non-null  int64 
 3   bin_2   300000 non-null  int64 
 4   bin_3   300000 non-null  object
 5   bin_4   300000 non-null  object
 6   nom_0   300000 non-null  object
 7   nom_1   300000 non-null  object
 8   nom_2   300000 non-null  object
 9   nom_3   300000 non-null  object
 10  nom_4   300000 non-null  object
 11  nom_5   300000 non-null  object
 12  nom_6   300000 non-null  object
 13  nom_7   300000 non-null  object
 14  nom_8   300000 non-null  object
 15  nom_9   300000 non-null  object
 16  ord_0   300000 non-null  int64 
 17  ord_1   300000 non-null  object
 18  ord_2   300000 non-null  object
 19  ord_3   300000 non-null  object
 20  ord_4   300000 non-null  object
 21  ord_5   300000 non-null  object
 

In [5]:
# Summarise the test frame: columns, non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 24 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      200000 non-null  int64 
 1   bin_0   200000 non-null  int64 
 2   bin_1   200000 non-null  int64 
 3   bin_2   200000 non-null  int64 
 4   bin_3   200000 non-null  object
 5   bin_4   200000 non-null  object
 6   nom_0   200000 non-null  object
 7   nom_1   200000 non-null  object
 8   nom_2   200000 non-null  object
 9   nom_3   200000 non-null  object
 10  nom_4   200000 non-null  object
 11  nom_5   200000 non-null  object
 12  nom_6   200000 non-null  object
 13  nom_7   200000 non-null  object
 14  nom_8   200000 non-null  object
 15  nom_9   200000 non-null  object
 16  ord_0   200000 non-null  int64 
 17  ord_1   200000 non-null  object
 18  ord_2   200000 non-null  object
 19  ord_3   200000 non-null  object
 20  ord_4   200000 non-null  object
 21  ord_5   200000 non-null  object
 

In [6]:
# Summarise the sample-submission frame: columns, non-null counts and dtypes.
df_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      200000 non-null  int64  
 1   target  200000 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 3.1 MB


## Preprocessing

In [7]:
# Merge the training and test data so preprocessing is applied to both at once.
# ignore_index=True gives every row a unique label; without it both frames
# contribute labels starting at 0, and the duplicated labels break later
# label-based lookups or index-aligned assignments.
df_proc = pd.concat([df_train,df_test],ignore_index=True)

# Missing values per column (target is missing for every test row).
df_proc.isnull().sum()

id             0
bin_0          0
bin_1          0
bin_2          0
bin_3          0
bin_4          0
nom_0          0
nom_1          0
nom_2          0
nom_3          0
nom_4          0
nom_5          0
nom_6          0
nom_7          0
nom_8          0
nom_9          0
ord_0          0
ord_1          0
ord_2          0
ord_3          0
ord_4          0
ord_5          0
day            0
month          0
target    200000
dtype: int64

In [8]:
# Peek at the first row of the merged frame.
df_proc.head(1)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0.0


In [9]:
# Check the number of unique values in each column.
df_proc.nunique()

id        500000
bin_0          2
bin_1          2
bin_2          2
bin_3          2
bin_4          2
nom_0          3
nom_1          6
nom_2          6
nom_3          6
nom_4          4
nom_5        222
nom_6        522
nom_7       1220
nom_8       2219
nom_9      12068
ord_0          3
ord_1          5
ord_2          6
ord_3         15
ord_4         26
ord_5        192
day            7
month         12
target         2
dtype: int64

## EDA

## Feature Engineering

## Modeling

In [10]:
df_feat = df_proc.copy()

# Split the feature frame back into training rows (target present)
# and test rows (target missing).
is_test = df_feat['target'].isna()

df_feat_train = df_feat[~is_test]
train_x,train_y = df_feat_train.drop('target',axis=1),df_feat_train['target']
print(df_feat_train.shape)

# Slice the test rows from df_feat (not df_proc) so that both splits come
# from the same frame and carry the same features.
df_feat_test = df_feat[is_test]
test_x = df_feat_test.drop('target',axis=1)
print(df_feat_test.shape)

(300000, 25)
(200000, 25)


### 検証

In [11]:
# Classifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# 交差検証の関数
def closs_validation(x,y,clf,n_splits,scoring):
    """Run cross-validation and print the per-fold scores and their mean.

    Args:
        x: Features passed to ``cross_val_score``.
        y: Target passed to ``cross_val_score``.
        clf: Estimator to evaluate.
        n_splits: Forwarded as the ``cv`` argument.
        scoring: Forwarded as the ``scoring`` argument.

    Returns:
        None. The results are only printed.
    """
    fold_scores = cross_val_score(clf, x, y, cv=n_splits, scoring=scoring)

    # One line per fold, numbered from 1.
    for fold_no, fold_score in enumerate(fold_scores, start=1):
        print(f'{fold_no}回目スコア: {fold_score}')

    # Mean score across all folds.
    mean_score = np.mean(fold_scores)
    print(f'全体スコア平均: {mean_score}')

# Baseline classifier. The 'stratified' strategy draws random predictions,
# so seed it from the config to make the scores reproducible between runs.
clf = DummyClassifier(strategy='stratified',random_state=CFG.randome_state)

closs_validation(train_x,train_y,clf,5,'roc_auc')

1回目スコア: 0.503343911324235
2回目スコア: 0.4987303914274759
3回目スコア: 0.4992276986787019
4回目スコア: 0.500907112404862
5回目スコア: 0.5016223578631931
全体スコア平均: 0.5007662943396936


### 予測

In [12]:
# Fit on the full training split.
model = clf.fit(train_x, train_y)

# Second column of predict_proba.
# NOTE(review): presumably the probability of target == 1 - confirm against model.classes_.
pred = model.predict_proba(test_x)[:, 1]

# Copy of the test features with the predictions attached as 'target'.
df_pred = test_x.assign(target=pred)

## Submission

In [13]:
# Keep only the sample submission's columns, in its column order, and write the CSV.
submission_path = CFG.data_output_path+'submission.csv'
df_pred_sub = df_pred[df_sub_raw.columns]
df_pred_sub.to_csv(submission_path,index=False)
print(submission_path)

../data/output/submission.csv


In [14]:
# ! kaggle competitions submit -c cat-in-the-dat -f ../data/output/submission.csv -m "Message"