In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
from sklearn.feature_selection import RFE
import shap
import catboost as cb
from tqdm import tqdm
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, BorderlineSMOTE, SVMSMOTE, ADASYN, KMeansSMOTE



# Notebook-wide configuration.
ROOT_DIR = "data"      # directory the input CSV files are read from
RANDOM_STATE = 110     # seed handed to every sampler / splitter that accepts one

# Load the training table that every later cell transforms.
train_csv_path = os.path.join(ROOT_DIR, "train_mod.csv")
train_data = pd.read_csv(train_csv_path)

In [2]:
def cat2num(X):
    """Return a new frame in which every object-dtype column is label-encoded.

    Each column picked up by ``select_dtypes(include=['object'])`` is passed
    through its own freshly fitted ``LabelEncoder``. The original object
    columns are dropped and their encoded versions are appended after the
    remaining columns, so the column order can change.

    A new encoder is fitted on every call; codes produced by two separate
    calls are therefore not guaranteed to line up unless the columns contain
    the same set of values.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``X``: the non-object columns first,
        followed by the encoded columns.
    """
    object_cols = X.select_dtypes(include=['object']).columns

    # One independent encoder per column.
    codes_by_column = {
        name: LabelEncoder().fit_transform(X[name]) for name in object_cols
    }
    codes_frame = pd.DataFrame(codes_by_column, index=X.index)

    return pd.concat([X.drop(columns=object_cols), codes_frame], axis=1)

def featuregen(train_data):
    """Add head-coordinate spread features for every axis / process pair.

    For each axis in X, Y, Z and each process in Dam, Fill1, Fill2 the three
    ``HEAD NORMAL COORDINATE ... (Stage1|2|3)`` columns are combined into:

    * ``Head_DIFF_{axis}_Stage1&3_{process}`` - absolute difference between
      the Stage1 and Stage3 values;
    * ``Head_MinMax_{axis}_{process}`` - row-wise max minus row-wise min over
      the three stages.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame holding the 27 source coordinate columns. It is modified in
        place (18 columns are added).

    Returns
    -------
    pandas.DataFrame
        The same ``train_data`` object, returned for convenience.
    """
    axis_process_pairs = [
        (ax, proc)
        for ax in ('X', 'Y', 'Z')
        for proc in ('Dam', 'Fill1', 'Fill2')
    ]

    for ax, proc in axis_process_pairs:
        first, second, third = [
            f'HEAD NORMAL COORDINATE {ax} AXIS(Stage{n}) Collect Result_{proc}'
            for n in (1, 2, 3)
        ]
        # Slice the three stage columns once and reuse the slice for both features.
        stage_values = train_data[[first, second, third]]

        train_data[f'Head_DIFF_{ax}_Stage1&3_{proc}'] = (
            stage_values[first] - stage_values[third]
        ).abs()
        train_data[f'Head_MinMax_{ax}_{proc}'] = (
            stage_values.max(axis=1) - stage_values.min(axis=1)
        )

    return train_data
                                                                                                            
def generate_stage_averages(df):
    """Add ``Stage1_Average``, ``Stage2_Average`` and ``Stage3_Average`` columns.

    For each stage label, every column whose name contains that label and
    also contains ``'Circle'`` or ``'Line'`` is collected, and the row-wise
    mean of those columns is stored as ``{stage}_Average``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three average columns added.
    """
    shape_keywords = ('Circle', 'Line')

    for stage in ('Stage1', 'Stage2', 'Stage3'):
        matching = [
            name
            for name in df.columns
            if stage in name and any(word in name for word in shape_keywords)
        ]
        df[f'{stage}_Average'] = df[matching].mean(axis=1)

    return df
                                                                                                            
def generating_features(df):
    """Add Dam thickness spread features and a chamber-temperature product.

    Columns added, in this order:

    * ``Thickness_Diff_1_2`` - THICKNESS 1 minus THICKNESS 2;
    * ``Thickness_Diff_2_3`` - THICKNESS 2 minus THICKNESS 3;
    * ``Thickness_Std`` - row-wise standard deviation of the three
      thickness columns (pandas default ``ddof``);
    * ``Thickness_Max_Min_Diff`` - row-wise max minus row-wise min of the
      three thickness columns;
    * ``Temperature_Change_Rate`` - chamber temperature multiplied by its
      unit-time column (a product, despite the name).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    t1, t2, t3 = [f'THICKNESS {i} Collect Result_Dam' for i in (1, 2, 3)]
    thickness = df[[t1, t2, t3]]

    # Thickness difference
    df['Thickness_Diff_1_2'] = thickness[t1] - thickness[t2]
    df['Thickness_Diff_2_3'] = thickness[t2] - thickness[t3]
    df['Thickness_Std'] = thickness.std(axis=1)
    df['Thickness_Max_Min_Diff'] = thickness.max(axis=1) - thickness.min(axis=1)

    chamber_temp = df['Chamber Temp. Collect Result_AutoClave']
    chamber_unit_time = df['Chamber Temp. Unit Time_AutoClave']
    df['Temperature_Change_Rate'] = chamber_temp * chamber_unit_time

    return df

def generate_volume_to_speed_ratio(df):
    """Add Fill1 dispense-volume / resin-speed ratios and the total Fill1 volume.

    For each of the three stages, ``Volume_to_Speed_Ratio_Stage{n}`` is the
    Fill1 dispense volume of that stage divided by the single Fill1
    ``DISCHARGED SPEED OF RESIN`` column. ``Volume_Sum_Fill1`` is the sum of
    the three stage volumes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    resin_speed = df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    stage_volumes = [
        df[f'Dispense Volume(Stage{n}) Collect Result_Fill1'] for n in (1, 2, 3)
    ]

    for n, volume in enumerate(stage_volumes, start=1):
        df[f'Volume_to_Speed_Ratio_Stage{n}'] = volume / resin_speed

    # Plain left-to-right addition, so a missing value in any stage propagates.
    df['Volume_Sum_Fill1'] = stage_volumes[0] + stage_volumes[1] + stage_volumes[2]

    return df

def generate_pressure_change_rate(df):
    """Add the product of each autoclave pressure and its unit-time column.

    ``Pressure_Change_Rate_{1st|2nd|3rd}`` is the matching
    ``... Pressure Collect Result_AutoClave`` column multiplied by its
    unit-time column (a product, despite the name).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # (ordinal, unit-time column). The 1st-pressure unit-time column repeats
    # "1st Pressure" in its name, so the names cannot be built from one pattern.
    unit_time_columns = (
        ('1st', '1st Pressure 1st Pressure Unit Time_AutoClave'),
        ('2nd', '2nd Pressure Unit Time_AutoClave'),
        ('3rd', '3rd Pressure Unit Time_AutoClave'),
    )

    for ordinal, unit_time_col in unit_time_columns:
        pressure = df[f'{ordinal} Pressure Collect Result_AutoClave']
        df[f'Pressure_Change_Rate_{ordinal}'] = pressure * df[unit_time_col]

    return df

def generate_volume_to_time_ratio(df):
    """Add Dam dispense-volume / discharge-time ratios for the three stages.

    ``Volume_to_Time_Ratio_Stage{n}`` is the Dam dispense volume of stage
    ``n`` divided by the Dam ``DISCHARGED TIME OF RESIN`` of the same stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for n in (1, 2, 3):
        dispensed = df[f'Dispense Volume(Stage{n}) Collect Result_Dam']
        duration = df[f'DISCHARGED TIME OF RESIN(Stage{n}) Collect Result_Dam']
        df[f'Volume_to_Time_Ratio_Stage{n}'] = dispensed / duration

    return df


# Run the feature-engineering steps in order; each step receives the frame
# returned by the previous one.
train_data = (
    train_data
    .pipe(cat2num)
    .pipe(featuregen)
    .pipe(generating_features)
    .pipe(generate_volume_to_speed_ratio)
    .pipe(generate_pressure_change_rate)
    .pipe(generate_volume_to_time_ratio)
    .pipe(generate_stage_averages)
)

# Columns kept for modelling: a subset of the engineered features, one raw
# column, and the label.
selected_columns = [
    # head-coordinate spread (X: |Stage1 - Stage3|, Y: max - min)
    'Head_DIFF_X_Stage1&3_Dam',
    'Head_DIFF_X_Stage1&3_Fill1',
    'Head_DIFF_X_Stage1&3_Fill2',
    'Head_MinMax_Y_Dam',
    'Head_MinMax_Y_Fill1',
    'Head_MinMax_Y_Fill2',
    # per-stage averages of the Circle / Line columns
    'Stage1_Average',
    'Stage2_Average',
    'Stage3_Average',
    # thickness and autoclave features
    'Thickness_Max_Min_Diff',
    'Temperature_Change_Rate',
    'Pressure_Change_Rate_1st',
    'Pressure_Change_Rate_2nd',
    'Pressure_Change_Rate_3rd',
    # Dam volume / time ratios
    'Volume_to_Time_Ratio_Stage1',
    'Volume_to_Time_Ratio_Stage2',
    'Volume_to_Time_Ratio_Stage3',
    # Fill1 volume and resin speed
    'Volume_Sum_Fill1',
    'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
    # label
    'target',
]
train_data = train_data[selected_columns]

In [3]:
# Display the column index of train_data to confirm which columns remain
# after the selection in the previous cell.
train_data.columns

Index(['Head_DIFF_X_Stage1&3_Dam', 'Head_DIFF_X_Stage1&3_Fill1',
       'Head_DIFF_X_Stage1&3_Fill2', 'Head_MinMax_Y_Dam',
       'Head_MinMax_Y_Fill1', 'Head_MinMax_Y_Fill2', 'Stage1_Average',
       'Stage2_Average', 'Stage3_Average', 'Thickness_Max_Min_Diff',
       'Temperature_Change_Rate', 'Pressure_Change_Rate_1st',
       'Pressure_Change_Rate_2nd', 'Pressure_Change_Rate_3rd',
       'Volume_to_Time_Ratio_Stage1', 'Volume_to_Time_Ratio_Stage2',
       'Volume_to_Time_Ratio_Stage3', 'Volume_Sum_Fill1',
       'DISCHARGED SPEED OF RESIN Collect Result_Fill1', 'target'],
      dtype='object')

In [4]:
# Borderline SMOTE for oversampling the minority class.
# BorderlineSMOTE is already imported in the notebook's first cell, so it is
# not imported a second time here.
#
# NOTE(review): the resampling below is fitted on the whole of train_data,
# and the train/validation split only happens in a later cell on the
# resampled frame. The validation fold therefore contains synthetic rows
# interpolated from rows that may sit in the training fold, so validation
# scores are optimistic. A leak-free setup would split first and resample
# the training fold only; that needs the split cell to change as well.
N_ABNORMAL_AFTER_SMOTE = 10000  # requested number of class-0 rows after oversampling

X = train_data.drop(columns=['target'])
y = train_data['target']

# An over-sampler rejects a target count below the current class size, so
# never ask for fewer class-0 rows than are already present.
n_abnormal_target = max(N_ABNORMAL_AFTER_SMOTE, int((y == 0).sum()))

borderline_smote = BorderlineSMOTE(
    sampling_strategy={0: n_abnormal_target},
    random_state=RANDOM_STATE,
)
X_resampled, y_resampled = borderline_smote.fit_resample(X, y)

# Rebuild a single frame (features + 'target') for the following cells.
train_data = pd.concat(
    [
        pd.DataFrame(X_resampled, columns=X.columns),
        pd.Series(y_resampled, name='target'),
    ],
    axis=1,
)

In [5]:
# Scale the data
# NOTE(review): the scaler is fitted on every row before the train/validation
# split further down, so the validation rows contribute to the fitted
# min/max. The same fitted scaler is reused later for the test file.
scaler = MinMaxScaler()
# scaler = StandardScaler()
columns_to_scale = [col for col in train_data.columns if col != 'target']
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

# Undersample the data
normal_ratio = 1.0  # 1:1 ratio
df_normal = train_data[train_data["target"] == 1]
df_abnormal = train_data[train_data["target"] == 0]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# Sampling without replacement cannot return more rows than exist, so cap the
# request at the number of normal rows available.
num_normal_kept = min(int(num_abnormal * normal_ratio), num_normal)
df_normal = df_normal.sample(n=num_normal_kept, replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
# A bare expression in the middle of a cell is not displayed; print the class
# balance explicitly so it actually shows up in the output.
print(df_concat.value_counts("target"))

# Split the data into features and target
train_x = df_concat.drop(columns=['target'])
train_y = df_concat['target']

# Hold out 22% of the balanced frame for validation.
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.22,
    random_state=RANDOM_STATE,
)


Total: Normal: 38156, AbNormal: 10000


In [6]:
# Display the feature matrix (df_concat without the 'target' column).
train_x

Unnamed: 0,Head_DIFF_X_Stage1&3_Dam,Head_DIFF_X_Stage1&3_Fill1,Head_DIFF_X_Stage1&3_Fill2,Head_MinMax_Y_Dam,Head_MinMax_Y_Fill1,Head_MinMax_Y_Fill2,Stage1_Average,Stage2_Average,Stage3_Average,Thickness_Max_Min_Diff,Temperature_Change_Rate,Pressure_Change_Rate_1st,Pressure_Change_Rate_2nd,Pressure_Change_Rate_3rd,Volume_to_Time_Ratio_Stage1,Volume_to_Time_Ratio_Stage2,Volume_to_Time_Ratio_Stage3,Volume_Sum_Fill1,DISCHARGED SPEED OF RESIN Collect Result_Fill1
0,0.000000,0.997654,1.000000,0.004444,0.042857,1.0,1.0,1.00000,1.0,0.0,0.749783,0.968652,0.143260,0.331399,0.017613,0.019532,0.017613,0.946115,1.000000
1,0.307692,0.998094,0.003431,0.005333,0.057143,0.0,0.2,0.62500,0.2,0.0,0.754499,0.780669,0.291713,0.332826,0.022712,0.023319,0.012692,0.910401,0.972477
2,0.000000,0.997654,1.000000,0.004444,0.042857,1.0,1.0,0.62500,1.0,0.0,0.765181,0.777429,0.461770,0.331399,0.017613,0.023319,0.000000,0.948935,1.000000
3,0.615385,0.998680,0.003431,0.996444,0.142857,0.0,0.5,0.18750,0.5,0.0,0.554230,0.763041,0.002341,0.334175,0.015295,0.035181,0.015295,0.871554,0.972477
4,0.769231,0.999120,1.000000,0.000000,0.085714,1.0,0.2,0.62500,0.2,0.0,0.754499,0.758004,0.281428,0.335524,0.022712,0.023319,0.012692,0.910401,0.972477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.793744,0.998793,0.003431,0.006144,0.103924,0.0,0.0,0.00000,0.0,0.0,0.800982,0.780669,0.289843,0.334175,0.021563,0.028811,0.021563,0.912535,0.972477
19996,0.307692,0.998094,0.003431,0.005333,0.100000,0.0,0.2,0.62500,0.2,0.0,0.754499,0.778784,0.290542,0.335019,0.022712,0.023319,0.020190,0.907581,0.972477
19997,0.615385,0.998680,0.003431,0.996444,0.142857,0.0,0.5,0.18750,0.5,0.0,0.647387,0.776606,0.003928,0.348229,0.015295,0.035181,0.015295,0.874687,0.972477
19998,0.461538,0.999120,1.000000,0.004444,0.085714,1.0,0.5,0.28125,0.5,0.0,0.624098,0.751298,0.003802,0.333950,0.015295,0.022006,0.015295,0.872810,0.972477


In [7]:
# CatBoost classifier with hand-set hyperparameters.
# (No recursive feature elimination is performed in this cell; the model is
# trained on every column of X_train.)
catboost_params = dict(
    depth=4,
    iterations=400,
    l2_leaf_reg=5,
    learning_rate=0.1,
    colsample_bylevel=0.9,
    verbose=0,  # No output during training
)
model = cb.CatBoostClassifier(**catboost_params)

# Fit on the training fold
model.fit(X_train, y_train)

# Predict and evaluate on the held-out fold
y_pred = model.predict(X_test)

# NOTE(review): f1 / recall / precision use scikit-learn's default positive
# label (1), which the submission cell maps to 'Normal' - confirm this is the
# class the scores are meant to describe.
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

for metric_name, metric_value in (
    ("F1 Score", f1),
    ("Recall", recall),
    ("Accuracy", accuracy),
    ("Precision", precision),
):
    print(f"{metric_name}: {metric_value:.4f}")

# SHAP
# SHAP value calculation on the held-out fold
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Feature importance: mean absolute SHAP value per feature, largest first
df_shap = pd.DataFrame(shap_values, columns=X_test.columns)
mean_abs_shap = df_shap.abs().mean()
shap_importance = mean_abs_shap.sort_values(ascending=False)
print(shap_importance)

F1 Score: 0.8764
Recall: 0.9680
Accuracy: 0.8625
Precision: 0.8007
Pressure_Change_Rate_3rd                          0.805258
Volume_Sum_Fill1                                  0.460271
Pressure_Change_Rate_2nd                          0.243452
Volume_to_Time_Ratio_Stage1                       0.232663
Pressure_Change_Rate_1st                          0.207590
Volume_to_Time_Ratio_Stage2                       0.176794
Volume_to_Time_Ratio_Stage3                       0.168003
Head_MinMax_Y_Fill1                               0.143190
Head_DIFF_X_Stage1&3_Dam                          0.116904
Stage2_Average                                    0.101246
Temperature_Change_Rate                           0.092889
Head_MinMax_Y_Dam                                 0.088887
Stage3_Average                                    0.074329
Stage1_Average                                    0.071938
Head_DIFF_X_Stage1&3_Fill2                        0.071230
Head_MinMax_Y_Fill2                             

In [8]:
# Process test data with the same feature-engineering steps as the training data
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test_mod.csv"))
test_data = cat2num(test_data)
test_data = featuregen(test_data)
test_data = generating_features(test_data)
test_data = generate_volume_to_speed_ratio(test_data)
test_data = generate_pressure_change_rate(test_data)
test_data = generate_volume_to_time_ratio(test_data)
test_data = generate_stage_averages(test_data)


# Scale the test data with the scaler fitted on the training data
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

# Select the same features, in the same order, as the training data
test_x_rfe = test_data[X_train.columns]

# Predict on test data
y_pred = model.predict(test_x_rfe)

# Prepare submission: read the existing file, overwrite its 'target' column
# with the predictions, and translate the numeric classes back to labels.
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred
df_sub['target'] = df_sub['target'].map({1: 'Normal', 0: 'AbNormal'})

# Calculate the ratio of abnormal cases.
# .get(..., 0) keeps this from raising KeyError when the model predicts only
# one class; counts.sum() equals AbNormal + Normal because those are the only
# labels the mapping above can produce.
counts = df_sub['target'].value_counts()
ratio = counts.get('AbNormal', 0) / counts.sum()
print("The ratio of abnormal is:", ratio)

# Save the submission file (overwrites the file that was read above)
df_sub.to_csv("submission.csv", index=False)

The ratio of abnormal is: 0.038707447727665455
