In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score

# 분류 모델들
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# 7. 모델 저장 및 로드
import joblib


In [5]:
# 데이터 로드
train_df = pd.read_csv('dataset/train.csv',  index_col='ID')
test_df = pd.read_csv('dataset/test.csv', index_col='ID')

# 모든 열을 레이블 인코딩
label_encoders = {}
for column in train_df.columns:
    mlb = MultiLabelBinarizer()
    train_df[column]=train_df[column].str.split()
    train_df[column] = mlb.fit_transform(train_df[column])
    label_encoders[column] = mlb

# 특징과 레이블 분리
features = [col for col in train_df.columns if col != 'SUBCLASS']
X = train_df[features]
y = train_df['SUBCLASS']

# 계층적 샘플링을 사용한 데이터 분할 (학습과 검증 세트로 분할)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 모델 리스트
models = {
    "RandomForest": RandomForestClassifier(random_state=42)
    # "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    # "SupportVectorMachine": SVC(random_state=42),
    # "KNeighbors": KNeighborsClassifier(),
    # "DecisionTree": DecisionTreeClassifier(random_state=42)
}

# 모델 학습 및 검증 평가
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

    # 모델 저장
    joblib.dump(model, './trained_model/'+model_name+'.pkl')

    # 검증 데이터 평가
    y_val_pred = model.predict(X_val)
    print(f"\n{model_name} Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
    print(f"\n{model_name} Validation Classification Report:")
    print(classification_report(y_val, y_val_pred, target_names=label_encoders['SUBCLASS'].classes_))
    print("-" * 50)

Training RandomForest...

RandomForest Validation Accuracy: 0.2884770346494762

RandomForest Validation Classification Report:
              precision    recall  f1-score   support

         ACC       0.79      0.79      0.79        14
        BLCA       0.00      0.00      0.00        21
        BRCA       0.34      0.50      0.40       157
        CESC       0.50      0.03      0.06        31
        COAD       0.54      0.56      0.55        45
        DLBC       0.00      0.00      0.00         7
      GBMLGG       0.20      0.22      0.21        92
        HNSC       0.19      0.16      0.17        45
       KIPAN       0.14      0.16      0.15       103
        KIRC       0.06      0.06      0.06        67
        LAML       0.59      0.62      0.61        32
         LGG       0.18      0.22      0.20        46
        LIHC       0.60      0.10      0.17        31
        LUAD       0.00      0.00      0.00        37
        LUSC       0.33      0.03      0.05        36
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



LogisticRegression Validation Accuracy: 0.25141015310233683

LogisticRegression Validation Classification Report:
              precision    recall  f1-score   support

         ACC       0.88      0.50      0.64        14
        BLCA       0.00      0.00      0.00        21
        BRCA       0.39      0.47      0.43       157
        CESC       0.18      0.16      0.17        31
        COAD       0.47      0.33      0.39        45
        DLBC       0.00      0.00      0.00         7
      GBMLGG       0.15      0.18      0.17        92
        HNSC       0.15      0.11      0.13        45
       KIPAN       0.12      0.13      0.12       103
        KIRC       0.06      0.06      0.06        67
        LAML       0.64      0.28      0.39        32
         LGG       0.11      0.11      0.11        46
        LIHC       0.11      0.06      0.08        31
        LUAD       0.08      0.05      0.06        37
        LUSC       0.31      0.25      0.28        36
          OV       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)



KNeighbors Validation Accuracy: 0.17647058823529413

KNeighbors Validation Classification Report:
              precision    recall  f1-score   support

         ACC       0.17      0.07      0.10        14
        BLCA       0.22      0.10      0.13        21
        BRCA       0.22      0.54      0.31       157
        CESC       0.00      0.00      0.00        31
        COAD       0.25      0.18      0.21        45
        DLBC       0.00      0.00      0.00         7
      GBMLGG       0.13      0.21      0.16        92
        HNSC       0.11      0.02      0.04        45
       KIPAN       0.12      0.10      0.11       103
        KIRC       0.05      0.04      0.05        67
        LAML       0.22      0.47      0.30        32
         LGG       0.23      0.15      0.18        46
        LIHC       0.00      0.00      0.00        31
        LUAD       0.00      0.00      0.00        37
        LUSC       0.00      0.00      0.00        36
          OV       0.15      0.16   

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, StackingClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score

# 데이터 로드
train_df = pd.read_csv('dataset/train.csv',  index_col='ID')
test_df = pd.read_csv('dataset/test.csv', index_col='ID')

# 모든 열을 레이블 인코딩
label_encoders = {}
for column in train_df.columns:
    le = LabelEncoder()
    train_df[column] = le.fit_transform(train_df[column])
    label_encoders[column] = le

# 특징과 레이블 분리
features = [col for col in train_df.columns if col != 'SUBCLASS']
X = train_df[features]
y = train_df['SUBCLASS']

# 계층적 샘플링을 사용한 데이터 분할 (학습과 검증 세트로 분할)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 기본 학습 모델들 정의
base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('et', ExtraTreesClassifier(random_state=42))
]

# 메타 모델 정의 (LightGBM)
meta_model = LGBMClassifier(random_state=42)

# 스태킹 모델 정의
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# 스태킹 모델 학습
stacking_clf.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = stacking_clf.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Validation Classification Report:")
print(classification_report(y_val, y_val_pred, target_names=label_encoders['SUBCLASS'].classes_))
print("-" * 50)

In [None]:
# 모델 저장
joblib.dump(stacking_clf, './trained_model/stacking_rf_gb_et.pkl')

In [4]:
stacking_clf = joblib.load('trained_model/stacking_rf_gb_et.pkl')

In [8]:
from sklearn.preprocessing import LabelEncoder

# Load the datasets
train_df = pd.read_csv('dataset/train.csv',  index_col='ID')
submission_df = pd.read_csv('dataset/sample_submission.csv')
test_df = pd.read_csv('dataset/test.csv')

# Separate features and labels
features = [col for col in train_df.columns if col != 'SUBCLASS']

# Label encoding for features
label_encoders = {}
for column in features:
    le = LabelEncoder()
    train_df[column] = le.fit_transform(train_df[column])
    label_encoders[column] = le

# Label encoding for the target 'SUBCLASS'
subclass_encoder = LabelEncoder()
train_df['SUBCLASS'] = subclass_encoder.fit_transform(train_df['SUBCLASS'])

# Encode test data using the same encoders
for column in features:
    test_df[column] = label_encoders[column].transform(test_df[column])

# Drop the 'ID' column from test_df
test_df_id = test_df['ID']
submission_df['ID'] = test_df_id
test_df = test_df.drop('ID', axis=1)

# Assuming stacking_clf is already trained, predict on the test set
test_predictions = stacking_clf.predict(test_df[features])

# Decode the predicted labels to their original form
test_predictions_decoded = subclass_encoder.inverse_transform(test_predictions)

# Format the submission file
submission_df['SUBCLASS'] = test_predictions_decoded
submission_df.to_csv('./dataset/my_submission.csv', index=False)

print("Sample submission file has been created and saved as 'my_submission.csv'.")

ValueError: y contains previously unseen labels: 'D625H D675H D775H'