In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [9]:
# 범주형 변수를 인코딩하는 함수
def encode_categorical_features(data):
    label_encoders = {}
    for column in data.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column].astype(str))  # 문자열로 변환 후 인코딩
        label_encoders[column] = le
    return data, label_encoders


# 정규화 함수
def normalize_data(data):
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data.select_dtypes(include=[np.number]))  # 수치형 데이터만 정규화
    scaled_df = pd.DataFrame(scaled_data, columns=data.select_dtypes(include=[np.number]).columns, index=data.index)
    return scaled_df

In [2]:
df = pd.read_csv('./all_data.csv')

  df = pd.read_csv('./all_data.csv')


In [5]:
#target이 "AbNormal"인 데이터만 추출
df_abnormal = df[df['target'] == 'AbNormal']

#df_abnormal의 개수
print('Abnormal data count:', len(df_abnormal))

Abnormal data count: 2350


In [6]:
#target이 "Normal"인 데이터만 추출
df_normal = df[df['target'] == 'Normal']

#df_normal의 개수
print('Normal data count:', len(df_normal))

Normal data count: 38156


In [7]:
#2350개만 df_normal에서 추출
df_normal = df_normal.sample(n=2350, random_state=1)

#df_normal과 df_abnormal을 합침
df = pd.concat([df_normal, df_abnormal])

df

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam,Head Clean Position Z Collect Result_Dam,Head Purge Position Z Collect Result_Dam,Head Zero Position Y Collect Result_Dam,...,CURE DpS Z Collect Result_Fill2,CURE DpS Θ Collect Result_Fill2,Average Stage1 CL Distance Speed Collect Result_Dam,Average Stage2 CL Distance Speed Collect Result_Dam,Average Stage3 CL Distance Speed Collect Result_Dam,HEAD NORMAL COORDINATE Y AXIS(Stage) Collect Result_Dam,HEAD NORMAL COORDINATE Z AXIS(Stage) Collect Result_Dam,HEAD NORMAL COORDINATE Y AXIS(Stage) Collect Result_Fill,HEAD NORMAL COORDINATE Z AXIS(Stage) Collect Result_Fill,HEAD NORMAL COORDINATE X AXIS(Stage) Collect Result_Fill
28691,0,0,291,10,551.8,465.3,162.6,130.85,130.85,300.0,...,0.00,5.4,4000.0,4000.0,4000.0,377.433333,282.15,429.383333,244.0500,1451.60
18254,1,1,476,16,161.2,464.2,550.5,130.85,130.85,300.0,...,0.00,5.4,6500.0,5500.0,6500.0,1309.166667,281.43,1323.616667,244.0025,1475.90
27091,0,0,56,10,549.0,463.0,160.5,133.50,133.50,303.5,...,0.00,5.4,9000.0,12000.0,9000.0,377.333333,284.80,429.183333,244.0000,1452.45
34227,0,0,47,10,549.0,463.0,160.5,133.50,133.50,303.5,...,0.00,5.4,9000.0,12000.0,9000.0,377.333333,284.80,429.183333,244.0000,1452.45
14089,0,0,293,10,551.8,465.3,162.6,130.85,130.85,300.0,...,0.00,5.4,4000.0,4000.0,4000.0,377.433333,282.15,429.383333,244.0500,1451.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40437,0,0,518,16,550.6,463.9,161.5,124.00,130.85,300.0,...,0.00,5.4,6000.0,5500.0,6000.0,377.566667,274.40,429.383333,244.1590,1451.65
40463,0,0,78,10,549.0,463.0,160.0,133.50,133.50,300.0,...,0.02,5.4,9000.0,9000.0,9000.0,377.333333,284.80,429.183333,244.0500,1452.20
40481,1,1,258,10,162.4,465.5,552.0,130.85,130.85,300.0,...,0.00,5.4,4000.0,5000.0,4000.0,1271.433333,282.15,1323.516667,243.9000,1476.15
40496,0,0,50,10,549.0,463.0,160.5,133.50,133.50,303.5,...,0.00,5.4,9000.0,9000.0,9000.0,377.333333,284.80,429.183333,244.0500,1452.45


In [10]:
df_encoded, label_encoders = encode_categorical_features(df)
df_normalized = normalize_data(df_encoded)

In [11]:
df_final = df_normalized

## 모델 학습

In [12]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


# 데이터프레임에서 특성과 레이블 나누기
X = df_final.drop('target', axis=1)
y = df_final['target']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## RF

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# 랜덤 포레스트 모델 초기화
rf_model = RandomForestClassifier(random_state=42)

# 모델 학습
rf_model.fit(X_train, y_train)

# 테스트 세트에서 예측
y_pred = rf_model.predict(X_test)

# F1-score 계산
f1 = f1_score(y_test, y_pred, average='macro')  # 평균 방식은 'macro', 'micro', 'weighted' 중 선택 가능

# 모델 성능 평가
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 성능 결과 출력
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

F1 Score: 0.6094
Confusion Matrix:
[[296 167]
 [200 277]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.60      0.64      0.62       463
         1.0       0.62      0.58      0.60       477

    accuracy                           0.61       940
   macro avg       0.61      0.61      0.61       940
weighted avg       0.61      0.61      0.61       940



In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# 랜덤 포레스트 모델 초기화
rf_model = RandomForestClassifier(random_state=42)

# 튜닝할 하이퍼파라미터 그리드 정의
param_grid = {
    'n_estimators': [80, 100, 110],
    'max_depth': [None, 3, 6, 10],
    'min_samples_split': [7, 10, 15],
    'min_samples_leaf': [1]
}

# GridSearchCV 초기화
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           scoring='f1_weighted', cv=5, verbose=1, n_jobs=-1)

# 하이퍼파라미터 튜닝 수행
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터 및 모델
best_rf_model = grid_search.best_estimator_
print(f"Best Hyperparameters: {grid_search.best_params_}")

# 테스트 세트에서 예측
y_pred = best_rf_model.predict(X_test)

# F1-score 계산
f1 = f1_score(y_test, y_pred, average='weighted')

# 모델 성능 평가
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 성능 결과 출력
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
F1 Score: 0.6200
Confusion Matrix:
[[300 163]
 [194 283]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.61      0.65      0.63       463
         1.0       0.63      0.59      0.61       477

    accuracy                           0.62       940
   macro avg       0.62      0.62      0.62       940
weighted avg       0.62      0.62      0.62       940

