# Read All Dataset CSV

In [62]:
import os
import csv
import pandas as pd
import numpy as np

In [64]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [65]:
# Load every dataset folder under ./Competition_data and build parallel lists:
#   dataset_names[k] - folder name of the k-th dataset
#   X_trains[k]      - encoded/scaled training features (DataFrame)
#   y_trains[k]      - training labels exactly as read from y_train.csv
#   X_tests[k]       - encoded/scaled test features (DataFrame)
dataset_names = []
X_trains = []
y_trains = []
X_tests = []

data_root = "./Competition_data"

# os.listdir returns entries in arbitrary order and also lists plain files
# (e.g. .DS_Store); keep only sub-directories and sort them so the dataset
# order is reproducible between runs.
folder_names = sorted(
    entry for entry in os.listdir(data_root)
    if os.path.isdir(os.path.join(data_root, entry))
)

for folder_name in folder_names:
    x_train = pd.read_csv(f"{data_root}/{folder_name}/X_train.csv", header=0)
    y_train = pd.read_csv(f"{data_root}/{folder_name}/y_train.csv", header=0)
    x_test = pd.read_csv(f"{data_root}/{folder_name}/X_test.csv", header=0)

    # Separate the columns by dtype: float64 columns are treated as numerical,
    # every other dtype (integers, strings, ...) is treated as categorical.
    numerical_columns = []
    categorical_columns = []
    for col in x_train.columns:
        if x_train[col].dtype == 'float64':
            numerical_columns.append(col)
        else:
            categorical_columns.append(col)

    # Start from the frames without their categorical columns (drop returns a
    # new frame, so the raw data is not modified) and collect the one-hot
    # blocks; a single concat at the end avoids re-copying the growing frame
    # once per categorical column.
    train_parts = [x_train.drop(columns=categorical_columns)]
    test_parts = [x_test.drop(columns=categorical_columns)]

    for col in categorical_columns:
        # Fit on train + test together so categories that only occur in the
        # test set do not raise an "unknown category" error on transform.
        combined_data = pd.concat([x_train[col], x_test[col]], axis=0)
        one_hot_encoder = OneHotEncoder()
        one_hot_encoder.fit(combined_data.values.reshape(-1, 1))

        train_encoded = one_hot_encoder.transform(
            x_train[col].values.reshape(-1, 1)).toarray()
        test_encoded = one_hot_encoder.transform(
            x_test[col].values.reshape(-1, 1)).toarray()

        # New columns are named "<original column>_<category position>".
        encoded_names = [f"{col}_{k}" for k in range(train_encoded.shape[1])]
        train_parts.append(pd.DataFrame(train_encoded, columns=encoded_names))
        test_parts.append(pd.DataFrame(test_encoded, columns=encoded_names))

    X_train_encoded = pd.concat(train_parts, axis=1)
    X_test_encoded = pd.concat(test_parts, axis=1)

    # Min-max scale each numerical column (the downstream models are
    # XGBoost / random-forest based).
    # NOTE(review): the scaler is fitted on train + test combined, so the
    # test-set value range influences the training features - confirm this
    # is acceptable for the competition setup.
    for col in numerical_columns:
        combined_data = pd.concat([x_train[col], x_test[col]], axis=0)
        minmax_scaler = MinMaxScaler()
        minmax_scaler.fit(combined_data.values.reshape(-1, 1))

        # transform returns an (n, 1) array; flatten it before assigning it
        # to a single column.
        X_train_encoded[col] = minmax_scaler.transform(
            x_train[col].values.reshape(-1, 1)).ravel()
        X_test_encoded[col] = minmax_scaler.transform(
            x_test[col].values.reshape(-1, 1)).ravel()

    dataset_names.append(folder_name)
    X_trains.append(X_train_encoded)
    y_trains.append(y_train)
    X_tests.append(X_test_encoded)

        

In [None]:
# Display the encoded training features of the dataset at index 2 as a quick
# visual check of the loading/encoding cell above.
X_trains[2]

## Data Preprocessing & Feature Engineering

In [39]:
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import  VarianceThreshold
from sklearn.ensemble import RandomForestClassifier

In [52]:
# Reduce the feature set of every dataset in place (X_trains[i] / X_tests[i]
# are replaced) in two stages:
#   1. filter method   - drop zero-variance features (VarianceThreshold default)
#   2. embedded method - keep features whose random-forest importance is above
#                        the mean importance
# Index-based iteration is intentional: three parallel lists are read and two
# of them are overwritten at the same position.
for i in range(len(dataset_names)):
    X_train_frame = X_trains[i]
    X_test_frame = X_tests[i]
    y_train = y_trains[i]

    ## First: use Filter method to remove features with low variance.
    # Named variance_filter so the builtin `filter` is not shadowed for the
    # rest of the notebook. The filter is fitted on the training data only.
    variance_filter = VarianceThreshold()
    X_train_filtered = variance_filter.fit_transform(X_train_frame)
    X_test_filtered = variance_filter.transform(X_test_frame)
    kept_columns = X_train_frame.columns[variance_filter.get_support()]

    ## Second: use embedded method to select features.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    # y_train is a single-column DataFrame; ravel() gives the 1-D label
    # vector that fit expects.
    model.fit(X_train_filtered, y_train.values.ravel())
    importances = model.feature_importances_

    # Keep features with above-average importance.
    selected_features = importances > np.mean(importances)
    if not selected_features.any():
        # All importances are equal (e.g. a single surviving feature), so
        # nothing is strictly above the mean; keep every feature instead of
        # producing an empty frame that would break model fitting later.
        selected_features = np.ones_like(selected_features, dtype=bool)

    # Convert the numpy arrays back to DataFrames with the selected names.
    selected_feature_names = kept_columns[selected_features]
    X_trains[i] = pd.DataFrame(
        X_train_filtered[:, selected_features], columns=selected_feature_names)
    X_tests[i] = pd.DataFrame(
        X_test_filtered[:, selected_features], columns=selected_feature_names)

In [None]:
# Display the training features of the dataset at index 2 again, to inspect
# the result of the feature-selection cell above.
X_trains[2]

## Train/Test Split & Model Building
You can select an appropriate model and perform corresponding hyperparameter tuning.

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


Use a VotingClassifier (wrapped in bagging and stacking)

In [None]:
from sklearn.ensemble import VotingClassifier

# For every dataset: hold out 20% of the training data for validation, fit a
# stacked ensemble, report validation accuracy/AUC, draw the validation ROC
# curve, and store the predicted positive-class probabilities for the test
# set in y_predicts_voting (one single-column DataFrame per dataset).
y_predicts_voting = []
plt.figure(figsize=(10, 8))

for i, dataset_name in enumerate(dataset_names):
    # The labels were read from CSV as a single-column DataFrame; flatten
    # them to 1-D so the estimators do not emit a DataConversionWarning
    # (same convention as the feature-selection cell). The split indices are
    # unaffected because they depend only on the sample count and seed.
    y_all = y_trains[i].values.ravel()
    X_train_part, X_valid, y_train_part, y_valid = train_test_split(
        X_trains[i], y_all, test_size=0.2, random_state=42)

    # 1. Base learners: two gradient-boosting models and one random forest.
    gradient_boosting_clf = GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    xgboost_clf = xgb.XGBClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
    random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)

    # 2. Soft voting averages the predicted probabilities of the base learners.
    voting_clf = VotingClassifier(
        estimators=[
            ('gb1', gradient_boosting_clf),
            ('gb2', xgboost_clf),
            ('gb3', random_forest_clf),
        ],
        voting='soft',
        n_jobs=-1,
    )

    # 3. Bagging: 10 bootstrap replicas of the voting ensemble.
    bagging_model = BaggingClassifier(
        estimator=voting_clf, n_estimators=10, random_state=42, n_jobs=-1)

    # 4. Stacking: logistic regression meta-model on top of the bagged ensemble.
    stacking_model = StackingClassifier(
        estimators=[
            ('bagging', bagging_model),  # bagged ensemble is the only base estimator
        ],
        final_estimator=LogisticRegression(),  # meta-model
        passthrough=True,  # meta-model also sees the original features
    )

    # Train the stacking model.
    stacking_model.fit(X_train_part, y_train_part)

    # Predict on the validation set. Column 1 of predict_proba is used as the
    # positive-class score, which assumes a binary target.
    y_pred = stacking_model.predict(X_valid)
    y_pred_proba = stacking_model.predict_proba(X_valid)[:, 1]

    # Evaluate the model. The AUC is computed once and reused for the legend.
    accuracy = accuracy_score(y_valid, y_pred)
    roc_auc = roc_auc_score(y_valid, y_pred_proba)

    print(f"{i} times finish")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"AUC: {roc_auc:.2f}")

    # Use the trained stacking model to score X_test and keep the result.
    y_test_pred = stacking_model.predict_proba(X_tests[i])[:, 1]
    df = pd.DataFrame(y_test_pred, columns=['y_predict_proba'])
    y_predicts_voting.append(df)

    # Add this dataset's validation ROC curve to the shared figure.
    fpr, tpr, thresholds = roc_curve(y_valid, y_pred_proba)
    plt.plot(fpr, tpr, lw=2, label=f'{dataset_name} (AUC = {roc_auc:.2f})')

# Diagonal reference line of a random classifier, then finish the figure.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()


In [71]:
# Write each dataset's predicted probabilities to y_predict.csv inside that
# dataset's own folder (header row kept, no index column).
for position, dataset_name in enumerate(dataset_names):
    output_path = f'./Competition_data/{dataset_name}/y_predict.csv'
    y_predicts_voting[position].to_csv(output_path, index=False, header=True)