# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
#from tqdm import tqdm

### 데이터 읽어오기


In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


In [3]:
import pandas as pd

ROOT_DIR = "data"
df = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
df.dropna(axis=1, how='all', inplace=True)

"""
Data process part 
"""

output_path = 'data/train_null.csv'
df.to_csv(output_path, index=False)
print(f"Processed data saved to {output_path}")

Processed data saved to data/train_null.csv


### 언더 샘플링


데이타 불균형을 해결하기 위해 언더 샘플링을 진행합니다.


In [3]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
AbNormal    2350
Normal      2350
Name: count, dtype: int64

### 데이터 전처리


In [2]:
#PART 1. Exclude columns where all values are the same and where every value is unique for each row 
def preprocess_unique_value(df):
    #Identify columns where all values are the same
    same_rows_columns = [column for column in df.columns if df[column].nunique() == 0]
    
    #Identify columns where every row's value is unique 
    row_count = len(df) 
##unique_rows_exclude = ['']
    matching_row_columns = [column for column in df.columns if df[column].nunique() == row_count]

    #Exclude specific columns from the unique value check 
    matching_row_columns = [column for column in matching_row_columns if column not in unique_rows_exclude]

    #Drop columns identified in the above steps 
    df.drop(columns=same_rows_columns + matching_row_columns, inplace=True)

    return df

In [3]:
#PART 2. Remove the features with high correlation
#It looks for columns with identical distribution of unique values and checks for identical mappings between these values 
def reduce_dataframe(df):
#2-1. Get the ratio of the unique values in the given series as a sorted array
    def get_value_counts_ratio(series):
        value_counts = series.value_counts(normalize=True) 
        return value_counts.sort_values().values 

#2-2. Check if there's identical mapping between the unique values of two columns 
# Returns 'true' if one unique value in 'col1' maps to one unique value in 'col2'
    def check_value_mapping(df, col1, col2):
        unique_column_1 = df[col1].unique()
        unique_column_2 = df[col2].unique()

        if len(unique_column_1) != len(unique_column_2) : return False

        value_mapping = {}
        #Check if there's consistent mapping between values in col1 and col2 
        #maybe needs some margin??
        for val1 in unique_column_1:
            consistent_values = df[df[col1] == val1][col2].unique()
            if len(consistent_values) != 1 : 
                return False
            value_mapping [val1] = consistent_values[0]
        
        #Check if there's matching proportion of occurences of values in col1 and col2
        for val1 in unique_column_1: 
            ratio_1 = (df[col1] == val1).mean()
            ratio_2 = (df[col2] == value_mapping[val1]).mean()
            if ratio_1 != ratio_2:
                return False

        return True

#2-3. Idenify columns to remove 
    def compare_all_features(df):
        #Calculate Ratios 
        ratios = {column: get_value_counts_ratio(df[column]) for column in df.columns}
        similar_columns_dict = {column: [] for column in df.columns}

        #Update 'similar_columns_dict' with columns having identical ratios
        columns = list(ratios.keys())
        for i in range(len(columns)):
            for j in range(i + 1, len(columns)):
                if np.array_equal(ratios[columns[i]], ratios[columns[j]]):
                    similar_columns_dict[columns[i]].append(columns[j])

        #Update 'comparisions' fro identical value mappings among columns with similar ratios
        comparisons = []
        for key, values in similar_columns_dict.items():
            for value in values:
                if check_value_mapping(df, key, value):
                    comparisons.append((key, value))
                    

        if comparisons:
            print("다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:")
            for base, compare in comparisons:
                print(f"'{base}'와(과) '{compare}'")
        else:
            print("모든 피쳐의 고유값 비율과 양상이 동일하지 않습니다.")

        return comparisons

    comparisons = compare_all_features(df)
    columns_to_remove = set()
    for _, col_to_remove in comparisons:
        columns_to_remove.add(col_to_remove)

    df = df.drop(columns=columns_to_remove)
    global df_columns
    df_columns = df.columns
    return df

In [4]:
def preprocess_train_dataframe(df):
    df = preprocess_unique_value(df)
    df = reduce_dataframe(df)
    return df

In [5]:
# Add part
df_p = pd.read_csv('data/train_null.csv')
df_pre = preprocess_train_dataframe(df_p)
df_pre.head(2)
pre_path = 'data/train_preprocess.csv'
df_pre.to_csv(pre_path, index=False)

다음 피쳐 쌍은 고유값 비율과 양상이 동일합니다:
'Wip Line_Dam'와(과) 'Process Desc._Dam'
'Wip Line_Dam'와(과) 'Insp. Seq No._Dam'
'Wip Line_Dam'와(과) 'Insp Judge Code_Dam'
'Wip Line_Dam'와(과) 'CURE STANDBY POSITION X Collect Result_Dam'
'Wip Line_Dam'와(과) 'CURE STANDBY POSITION Z Collect Result_Dam'
'Wip Line_Dam'와(과) 'CURE STANDBY POSITION Θ Collect Result_Dam'
'Wip Line_Dam'와(과) 'CURE START POSITION Z Collect Result_Dam'
'Wip Line_Dam'와(과) 'Wip Line_AutoClave'
'Wip Line_Dam'와(과) 'Process Desc._AutoClave'
'Wip Line_Dam'와(과) 'Equipment_AutoClave'
'Wip Line_Dam'와(과) 'Insp. Seq No._AutoClave'
'Wip Line_Dam'와(과) 'Insp Judge Code_AutoClave'
'Wip Line_Dam'와(과) '1st Pressure Judge Value_AutoClave'
'Wip Line_Dam'와(과) '2nd Pressure Judge Value_AutoClave'
'Wip Line_Dam'와(과) '3rd Pressure Judge Value_AutoClave'
'Wip Line_Dam'와(과) 'Wip Line_Fill1'
'Wip Line_Dam'와(과) 'Process Desc._Fill1'
'Wip Line_Dam'와(과) 'Insp. Seq No._Fill1'
'Wip Line_Dam'와(과) 'Insp Judge Code_Fill1'
'Wip Line_Dam'와(과) 'Wip Line_Fill2'
'Wip Line_Dam'와(

Unnamed: 0,Wip Line_Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE SPEED Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,...,HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2,HEAD Standby Position X Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,IVI-OB6,Dam dispenser #1,AJX75334505,4F1XA938-1,100,16,14.9,8.4,14.7,1.04,...,835.5,458.0,428.0,243.7,114.612,19.9,7,127,1,Normal
1,IVI-OB6,Dam dispenser #1,AJX75334505,3KPM0016-2,70,10,21.3,4.9,21.3,1.49,...,458.0,156.0,427.9,270.0,19.6,7.0,185,1,0,Normal


In [6]:
pre_path = 'data/train_preprocess.csv'
df_pre.to_csv(pre_path, index=False)

In [13]:
def is_column_to_drop(col):
    unique_values = col.dropna().unique()
    return len(unique_values) == 0 or (len(unique_values) == 1 and unique_values[0] == 'OK') or set(unique_values) == {'OK'}

# Apply the function to determine which columns to drop
columns_to_drop = [col for col in df_pre.columns if is_column_to_drop(df_pre[col])]
df_pre.drop(columns=columns_to_drop, inplace=True)


output_path = 'data/train_preprocess2.csv'
df_pre.to_csv(output_path, index=False)

In [15]:

len(df_pre['Workorder_Dam'].unique())


663

# Under Sampling data preprocess

In [None]:
df_preprocessed = preprocess_train_dataframe(df_concat)

In [15]:
df_preprocessed.shape

(4700, 102)

### 데이터 분할


In [None]:
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)


def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "AbNormal"])

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {num_abnormal/num_normal}")


# Print statistics
print(f"  \tAbnormal\tNormal")
print_stats(df_train)
print_stats(df_val)

## 3. 모델 학습


### 모델 정의


In [None]:
model = RandomForestClassifier(random_state=RANDOM_STATE)

### 모델 학습


In [None]:
features = []

for col in df_train.columns:
    try:
        df_train[col] = df_train[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train[features]
train_y = df_train["target"]

model.fit(train_x, train_y)

## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [None]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [None]:
df_test_x = test_data[features]

for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
    except:
        continue

In [None]:
test_pred = model.predict(df_test_x)
test_pred

### 제출 파일 작성


In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
