In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler




In [None]:
# Label-encode the categorical (object-dtype) columns
def encode_categorical_features(data):
    """Label-encode every object-dtype column of ``data``.

    The input frame is left untouched: encoding is applied to a copy, so
    passing a slice of another DataFrame neither mutates the parent nor
    triggers pandas' SettingWithCopyWarning.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns should be converted to integer codes.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        The encoded copy of ``data`` and a mapping from column name to the
        ``LabelEncoder`` fitted on that column.
    """
    encoded = data.copy()
    label_encoders = {}
    for column in encoded.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        # Cast to str first so missing values become the literal category
        # 'nan' and mixed-type columns can be sorted by the encoder.
        encoded[column] = le.fit_transform(encoded[column].astype(str))
        label_encoders[column] = le
    return encoded, label_encoders


# Min-max normalization of the numeric columns
def normalize_data(data):
    """Return the numeric columns of ``data`` rescaled with ``MinMaxScaler``.

    Non-numeric columns are dropped from the result. The returned frame
    keeps the original row index and the numeric column names.
    """
    numeric_part = data.select_dtypes(include=[np.number])
    scaled_values = MinMaxScaler().fit_transform(numeric_part)
    return pd.DataFrame(
        scaled_values,
        index=data.index,
        columns=numeric_part.columns,
    )

In [None]:
# Load the full dataset. It holds both labelled rows and rows whose 'target'
# is NaN; the NaN-target rows are split off below as the prediction set.
df = pd.read_csv('./all_data.csv')

In [48]:
# target 변수가 nan인 행을 추출하여 새로운 데이터프레임 df_test 만들기
df_test = df[df['target'].isnull()]
df_test = df_test.drop('target', axis=1)

#범주형 데이터 인코딩, 정규화
df_test_encoded, label_encoders = encode_categorical_features(df_test)
df_test_final = normalize_data(df_test_encoded)

In [None]:
#target이 "AbNormal"인 데이터만 추출
df_abnormal = df[df['target'] == 'AbNormal']

#target이 "Normal"인 데이터만 추출
df_normal = df[df['target'] == 'Normal']

#2350개만 df_normal에서 추출
num_of_abnormal = len(df_abnormal)
df_normal = df_normal.sample(n=num_of_abnormal, random_state=1)

#df_normal과 df_abnormal을 합침
df = pd.concat([df_normal, df_abnormal])

# 데이터프레임에서 특성과 레이블 나누기
X = df.drop('target', axis=1)
y = df['target']

#범주형 데이터 인코딩, 정규화
df_encoded, label_encoders = encode_categorical_features(X)
df_normalized = normalize_data(df_encoded)

X = df_normalized

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Alias for the normalized training features; none of the visible cells
# below reads `df_final`.
df_final = df_normalized

## 모델 학습

## RF

In [None]:
# Baseline: random forest with default hyperparameters.
# RandomForestClassifier and the metric helpers are imported in the import
# cell at the top of the notebook.
rf_model = RandomForestClassifier(random_state=42)

# Fit on the training split
rf_model.fit(X_train, y_train)

# Predict the held-out split
y_pred = rf_model.predict(X_test)

# Macro-averaged F1 (other averaging options: 'micro', 'weighted')
f1 = f1_score(y_test, y_pred, average='macro')

# Additional evaluation artefacts
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report the results
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

In [None]:
# Hyperparameter search for the random forest.
# GridSearchCV, RandomForestClassifier and the metric helpers are imported in
# the import cell at the top of the notebook, so this cell no longer depends
# on the baseline cell having been run first.
rf_model = RandomForestClassifier(random_state=42)

# Grid of hyperparameters to try (only n_estimators actually varies)
param_grid = {
    'n_estimators': [110, 130, 200],
    'max_depth': [None],
    'min_samples_split': [10],
    'min_samples_leaf': [1]
}

# 5-fold cross-validated search scored by weighted F1, using all CPU cores
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           scoring='f1_weighted', cv=5, verbose=1, n_jobs=-1)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Best hyperparameters and the model refit with them
best_rf_model = grid_search.best_estimator_
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Predict the held-out split
y_pred = best_rf_model.predict(X_test)

# Weighted F1, matching the metric used for the search
f1 = f1_score(y_test, y_pred, average='weighted')

# Additional evaluation artefacts
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report the results
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

In [51]:
# Predict the unlabelled rows with the tuned model; with the default
# refit=True, GridSearchCV.predict delegates to best_estimator_.
y_pred_final = grid_search.predict(df_test_final)

In [52]:
# Show the predictions for a quick visual check
y_pred_final

array([0., 1., 0., ..., 0., 0., 1.])

In [53]:
# Load the submission template whose 'target' column is filled in below
submission = pd.read_csv('./data/submission.csv')

In [54]:
# Fill the template with the predictions (pandas raises if the lengths differ)
submission['target'] = y_pred_final

# Create the output directory first; to_csv does not create missing folders
submission_path = Path('./data_submission/submission2.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)