In [1]:
import numpy as np
import pandas as pd
import random
import os
import csv
import joblib

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold, train_test_split

from catboost import CatBoostClassifier
from sklearn.linear_model import SGDClassifier, LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score

In [2]:
TRAIN = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2022/train.csv', index_col='id')
TEST = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2022/test.csv', index_col='id')

# 列出 train.csv 中的訊息，可以清楚看到在 26570 筆資料中有一些資料為空。
print("train.csv")
TRAIN.info()

train.csv
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26570 entries, 0 to 26569
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_code    26570 non-null  object 
 1   loading         26320 non-null  float64
 2   attribute_0     26570 non-null  object 
 3   attribute_1     26570 non-null  object 
 4   attribute_2     26570 non-null  int64  
 5   attribute_3     26570 non-null  int64  
 6   measurement_0   26570 non-null  int64  
 7   measurement_1   26570 non-null  int64  
 8   measurement_2   26570 non-null  int64  
 9   measurement_3   26189 non-null  float64
 10  measurement_4   26032 non-null  float64
 11  measurement_5   25894 non-null  float64
 12  measurement_6   25774 non-null  float64
 13  measurement_7   25633 non-null  float64
 14  measurement_8   25522 non-null  float64
 15  measurement_9   25343 non-null  float64
 16  measurement_10  25270 non-null  float64
 17  measurement_11  25102

# 填上缺失的Data

In [3]:
#將 train.csv 中的 failure 欄(column)刪除，並將 axis = 1 ，表示該 failure 為 column。
x = TRAIN.drop('failure', axis=1)

#將 train.csv 中的 failure 欄(column)賦予給 y 。
y = TRAIN.failure

#將 test.csv 複製給 test_copy 。
test_copy = TEST.copy()

In [4]:
# 將 x 中 data type 為 object 的欄紀錄起來放進 cat_cols。
cat_cols = [cname for cname in x.columns if x[cname].dtype == 'object']

# 將 x 中 data type 為 float64 或 int64 的欄紀錄起來放進 num_cols。
num_cols = [cname for cname in x.columns if x[cname].dtype in ['float64', 'int64']]

# 將 x 中各欄(columns)有空(is null = 1)的欄紀錄起來放進 na_cols。
na_cols = [cname for cname in x.columns if x[cname].isnull().sum() > 0]

# 將 x 中各欄(columns)非空(isnull = 0)的欄紀錄起來放進 none_na_cols。
none_na_cols = [cname for cname in x.columns if cname not in na_cols]

In [5]:
# 定義 random_state
SEED = random.randint(30000, 35000)
print(SEED)

31639


In [6]:
# 利用 Scikit-learn 中的 IterativeImputer 來補足缺失的資料
# max_iter：指定填充缺失值的迭代次數。
# skip_complete：指定是否跳過沒有缺失值的樣本。設為 True，表示會跳過沒有缺失值的樣本。。
# n_nearest_features：指定在填充缺失值時使用多少個相似的特徵。
# random_state：指定隨機數生成器的種類。
imputer = IterativeImputer(max_iter=50, skip_complete=True, n_nearest_features=20, random_state=SEED)

# 利用 pd.concat 將 x, test_copy 的行(row)連起來放入 data。
data = pd.concat([x, test_copy], axis=0)

# 賦值
# data.loc 函數選擇了所有與當前匹配的行
# 並將這些行的所有數值型列傳遞給了 IterativeImputer 的 fit_transform 方法
# 我們使用賦值運算符將估計值替換回原來的數據
for code in data['product_code'].unique():
    data.loc[data['product_code']==code, num_cols] = imputer.fit_transform(data.loc[data['product_code']==code, num_cols])

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47345 entries, 0 to 47344
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_code    47345 non-null  object 
 1   loading         47345 non-null  float64
 2   attribute_0     47345 non-null  object 
 3   attribute_1     47345 non-null  object 
 4   attribute_2     47345 non-null  float64
 5   attribute_3     47345 non-null  float64
 6   measurement_0   47345 non-null  float64
 7   measurement_1   47345 non-null  float64
 8   measurement_2   47345 non-null  float64
 9   measurement_3   47345 non-null  float64
 10  measurement_4   47345 non-null  float64
 11  measurement_5   47345 non-null  float64
 12  measurement_6   47345 non-null  float64
 13  measurement_7   47345 non-null  float64
 14  measurement_8   47345 non-null  float64
 15  measurement_9   47345 non-null  float64
 16  measurement_10  47345 non-null  float64
 17  measurement_11  47345 non-null 

# 資料預處理

In [8]:
for col in ['attribute_0', 'attribute_1']:
    tempdf = pd.get_dummies(data[col], prefix=col)
    data = pd.merge(left=data, right=tempdf, left_index=True, right_index=True)
data = data.drop(['attribute_0', 'attribute_1'], axis=1)

e = data.drop('attribute_0_material_5', axis=1, inplace=True)

In [9]:
# na_cols = x 中各欄(columns)有空(is null = 1)的欄紀錄起來放進 na_cols。
for cname in na_cols:
    x[f'na_{cname}'] = np.where(x[cname].isna()==True, 1, 0)
    test_copy[f'na_{cname}'] = np.where(test_copy[cname].isna()==True, 1, 0)
na_vars = [cname for cname in x.columns if 'na' in cname]

# 紀錄'measurement_3'及'measurement_5'中有哪些缺值
# 加入新的 column 'xxx' 到 data 
data['measurement_3_na'] = data['measurement_3'].isna().astype(int)
data['measurement_5_na'] = data['measurement_5'].isna().astype(int)

# Combination
data['attribute_2*3'] = data['attribute_2'] * data['attribute_3']

# aggregation
meas_gr1_cols = [f"measurement_{i:d}" for i in list(range(3, 5)) + list(range(9, 17))]
data['meas_gr1_avg'] = np.mean(data[meas_gr1_cols], axis=1)
data['meas_gr1_std'] = np.std(data[meas_gr1_cols], axis=1)
meas_gr2_cols = [f"measurement_{i:d}" for i in list(range(5, 9))]
data['meas_gr2_avg'] = np.mean(data[meas_gr2_cols], axis=1)

# Ratio
data['meas17/meas_gr2_avg'] = data['measurement_17'] / data['meas_gr2_avg']

x = data.iloc[:x.shape[0], :].copy()
test_copy = data.iloc[x.shape[0]:, :].copy()

In [10]:
scale_feats = [col for col in x.columns if x[col].dtypes == 'float64']
# StandardScaler 是 scikit-learn 中的一個類，它可以對輸入數據進行標準化處理，使其分佈滿足均值為 0，方差為 1 的正態分佈。
scaler = StandardScaler()

x[scale_feats] = scaler.fit_transform(x[scale_feats])
test_copy[scale_feats] = scaler.transform(test_copy[scale_feats])

# Feature Selection

In [11]:
def FisherScore(x, y, predictors):
    
    target_var_val = y.unique()
    
    # 對每一個 predictor 算出對應的 FisherScore
    predictor_FisherScore = []
    for cname in predictors:
        fs = np.abs(np.mean(x.loc[y == target_var_val[0], cname]) - np.mean(x.loc[y == target_var_val[1], cname])) / \
            np.sqrt(np.var(x.loc[y == target_var_val[0], cname]) + np.var(x.loc[y == target_var_val[1], cname]))
        predictor_FisherScore.append(fs)
    return predictor_FisherScore

In [12]:
# 對每一個 variables  算出對應的 FisherScore
fs = FisherScore(x, y, x.drop('product_code', axis=1).columns)
fs_df = pd.DataFrame({"predictor": x.drop('product_code', axis=1).columns, "fisherscore": fs})

  if __name__ == "__main__":


In [13]:
# sort 完後得出 fisherscore 高的特徵進行選取
fs_df = fs_df.sort_values('fisherscore', ascending=False).reset_index(drop=True)
fs_df.head(10)

Unnamed: 0,predictor,fisherscore
0,loading,0.217027
1,measurement_17,0.056489
2,meas_gr2_avg,0.056425
3,meas17/meas_gr2_avg,0.053893
4,attribute_3,0.033091
5,measurement_8,0.029819
6,measurement_7,0.029784
7,measurement_5,0.028152
8,measurement_2,0.027061
9,attribute_2*3,0.02681


In [14]:
best_feats = fs_df['predictor'][:19]
x = x[best_feats]
test_copy = test_copy[best_feats]

# RUN

In [15]:
auc_list = []
test_pred_list = []
# importance_list = []
kfold = GroupKFold(n_splits=len(TRAIN.product_code.unique()))
model = LogisticRegression(max_iter = 500, C=0.05, penalty='l1', solver='liblinear', random_state=SEED)

In [16]:
# model.save('/kaggle/working/my_model.h5') 
# model = load_model('/kaggle/working/my_model.h5')
filename = '/kaggle/working/my_model.pkl'
joblib.dump(model, filename)

filename = '/kaggle/working/my_model.pkl'
model = joblib.load(filename)

In [17]:
for fold, (idx_tr, idx_va) in enumerate(kfold.split(x, y, TRAIN.product_code)):
    X_tr = x.iloc[idx_tr]
    X_va = x.iloc[idx_va]
    y_tr = y.loc[idx_tr]
    y_va = y.loc[idx_va]
    
    model.fit(X_tr, y_tr)
    # importance_list.append(model.coef_.ravel())
    
    va_preds = model.predict_proba(X_va)[:,1]
    score = roc_auc_score(y_va, va_preds)
    print(f"Fold {fold} accuracy = {score:.5f}")
    auc_list.append(score)
    
    test_pred_list.append(model.predict_proba(test_copy)[:,1])
    
print(f'Average accuracy = {sum(auc_list) / len(auc_list):.5f}')

Fold 0 accuracy = 0.58521
Fold 1 accuracy = 0.58518
Fold 2 accuracy = 0.59206
Fold 3 accuracy = 0.59820
Fold 4 accuracy = 0.59253
Average accuracy = 0.59063


In [18]:
output = pd.DataFrame(columns=['id', 'failure'])
output['id'] = test_copy.index
# pread = output['failure']
output['failure'] = sum(test_pred_list)/len(test_pred_list)
output.to_csv('submission.csv', index=False)