In [1]:
# 교통사고 심각도 예측 모델링 (시간 컬럼 제거 포함)
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# 데이터 불러오기
df = pd.read_csv('traffic_accidents.csv')

# 타겟 레이블 인코딩
le = LabelEncoder()
df['most_severe_injury_encoded'] = le.fit_transform(df['most_severe_injury'])

# 날짜/시간 관련 컬럼 제거
datetime_cols = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()]
df = df.drop(columns=datetime_cols)

# 고유값이 너무 많은 범주형 컬럼 제거
high_card_cols = [col for col in df.columns if df[col].dtype == 'object' and df[col].nunique() > 100]
categorical_cols = [col for col in df.select_dtypes(include='object').columns if col not in high_card_cols and col != 'most_severe_injury']

# 피처 구성 및 인코딩
features = df.drop(columns=['most_severe_injury', 'most_severe_injury_encoded'])
X = pd.get_dummies(features, columns=categorical_cols)
y = df['most_severe_injury_encoded']

# 학습/테스트 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 정의 및 하이퍼파라미터
models = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'KNeighbors': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

params = {
    'RandomForest': {'n_estimators': [100], 'max_depth': [10]},
    'LogisticRegression': {'C': [10]},
    'KNeighbors': {'n_neighbors': [3]},
    'XGBoost': {'max_depth': [3], 'n_estimators': [100]}
}

# 모델 학습 및 평가
best_models = {}

for name, model in models.items():
    print(f"--- {name} ---")
    grid = GridSearchCV(model, params[name], cv=3, scoring='f1_macro')
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_
    print("Best Params:", grid.best_params_)
    y_pred = grid.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=le.classes_))


--- RandomForest ---
Best Params: {'max_depth': 10, 'n_estimators': 100}
                          precision    recall  f1-score   support

                   FATAL       1.00      0.64      0.78        56
   INCAPACITATING INJURY       0.99      1.00      1.00      1338
 NO INDICATION OF INJURY       1.00      1.00      1.00     30831
NONINCAPACITATING INJURY       1.00      1.00      1.00      6384
   REPORTED, NOT EVIDENT       1.00      1.00      1.00      3253

                accuracy                           1.00     41862
               macro avg       1.00      0.93      0.96     41862
            weighted avg       1.00      1.00      1.00     41862

--- LogisticRegression ---
Best Params: {'C': 10}
                          precision    recall  f1-score   support

                   FATAL       1.00      0.96      0.98        56
   INCAPACITATING INJURY       1.00      1.00      1.00      1338
 NO INDICATION OF INJURY       1.00      1.00      1.00     30831
NONINCAPACITATI

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Params: {'max_depth': 3, 'n_estimators': 100}
                          precision    recall  f1-score   support

                   FATAL       1.00      1.00      1.00        56
   INCAPACITATING INJURY       1.00      1.00      1.00      1338
 NO INDICATION OF INJURY       1.00      1.00      1.00     30831
NONINCAPACITATING INJURY       1.00      1.00      1.00      6384
   REPORTED, NOT EVIDENT       1.00      1.00      1.00      3253

                accuracy                           1.00     41862
               macro avg       1.00      1.00      1.00     41862
            weighted avg       1.00      1.00      1.00     41862



In [2]:
for name, model in best_models.items():
    print(f"\n💡 {name} 성능 평가")
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))



💡 RandomForest 성능 평가
              precision    recall  f1-score   support

           0       1.00      0.64      0.78        56
           1       0.99      1.00      1.00      1338
           2       1.00      1.00      1.00     30831
           3       1.00      1.00      1.00      6384
           4       1.00      1.00      1.00      3253

    accuracy                           1.00     41862
   macro avg       1.00      0.93      0.96     41862
weighted avg       1.00      1.00      1.00     41862


💡 LogisticRegression 성능 평가
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        56
           1       1.00      1.00      1.00      1338
           2       1.00      1.00      1.00     30831
           3       1.00      1.00      1.00      6384
           4       1.00      1.00      1.00      3253

    accuracy                           1.00     41862
   macro avg       1.00      0.99      1.00     41862
weighted avg       1.00    