# Read All Dataset CSV

In [None]:
import csv
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split

In [73]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

We attempted to use `LabelEncoder` and `OneHotEncoder` to transform categorical data, and `MinMaxScaler` and `StandardScaler` to transform numerical data. After several trials, we decided to use `OneHotEncoder` for categorical data and `MinMaxScaler` for numerical data. The reasons are as follows:

1. Categorical data does not have an inherent order. Using `LabelEncoder` might inadvertently create ordinal features.
2. Numerical data exhibits significant variance across columns. Using `StandardScaler` might reduce the 'distance' features.

Therefore, we opted for `OneHotEncoder` and `MinMaxScaler`.

In [None]:
DATA_ROOT = "./Competition_data"


def _encode_features(x_train, x_test):
    """Min-max scale the float64 columns and one-hot encode every other column.

    Both transformers are fitted on train and test stacked together, so that a
    category (or value range) that only occurs in the test split is still known
    at transform time. This is a deliberate trade-off: test-set statistics
    influence the scaling.

    Returns a ``(train_encoded, test_encoded)`` pair of DataFrames whose columns
    are the scaled numerical columns (original order) followed by the one-hot
    columns, named ``<column>_<category position>``.
    """
    # 'float64' marks numerical columns; every other dtype is treated as categorical.
    numerical_columns = [c for c in x_train.columns if x_train[c].dtype == 'float64']
    categorical_columns = [c for c in x_train.columns if x_train[c].dtype != 'float64']

    train_parts = []
    test_parts = []

    if numerical_columns:
        # MinMaxScaler works column by column, so a single fit over all numerical
        # columns gives the same result as fitting one scaler per column.
        scaler = MinMaxScaler()
        scaler.fit(pd.concat([x_train[numerical_columns], x_test[numerical_columns]], axis=0))
        train_parts.append(pd.DataFrame(
            scaler.transform(x_train[numerical_columns]),
            columns=numerical_columns, index=x_train.index))
        test_parts.append(pd.DataFrame(
            scaler.transform(x_test[numerical_columns]),
            columns=numerical_columns, index=x_test.index))

    if categorical_columns:
        encoder = OneHotEncoder()
        encoder.fit(pd.concat([x_train[categorical_columns], x_test[categorical_columns]], axis=0))
        # One output column per (input column, category); the suffix is the
        # position of the category inside encoder.categories_.
        onehot_names = [
            f"{col}_{position}"
            for col, categories in zip(categorical_columns, encoder.categories_)
            for position in range(len(categories))
        ]
        # The encoder returns a sparse matrix, so convert it to a dense array.
        train_parts.append(pd.DataFrame(
            encoder.transform(x_train[categorical_columns]).toarray(),
            columns=onehot_names, index=x_train.index))
        test_parts.append(pd.DataFrame(
            encoder.transform(x_test[categorical_columns]).toarray(),
            columns=onehot_names, index=x_test.index))

    if not train_parts:
        # No columns at all: hand back copies so the inputs stay untouched.
        return x_train.copy(), x_test.copy()

    # A single concat per split avoids growing the frame column by column.
    return pd.concat(train_parts, axis=1), pd.concat(test_parts, axis=1)


dataset_names = []
X_trains = []
y_trains = []
X_tests = []

# os.listdir returns entries in arbitrary order and may include stray files
# (e.g. .DS_Store); sort for a reproducible dataset order and keep directories only.
dataset_folders = sorted(
    entry for entry in os.listdir(DATA_ROOT)
    if os.path.isdir(os.path.join(DATA_ROOT, entry))
)

for folder_name in dataset_folders:

    # read the data
    x_train = pd.read_csv(f"{DATA_ROOT}/{folder_name}/X_train.csv", header=0)
    y_train = pd.read_csv(f"{DATA_ROOT}/{folder_name}/y_train.csv", header=0)
    x_test = pd.read_csv(f"{DATA_ROOT}/{folder_name}/X_test.csv", header=0)

    # encode into new frames; the raw frames are left unchanged
    X_train_encoded, X_test_encoded = _encode_features(x_train, x_test)

    dataset_names.append(folder_name)
    X_trains.append(X_train_encoded)
    y_trains.append(y_train)
    X_tests.append(X_test_encoded)

        

Check that the encoded data was stored correctly in `X_trains`.

In [None]:
# Display the encoded training features stored at index 2 as a sanity check.
# Raises IndexError if fewer than three datasets were loaded.
X_trains[2]

## Data Preprocessing & Feature Engineering

In [76]:
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import  VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

1. Crude approach: variance threshold followed by random-forest feature importance

In [52]:
# Two-stage feature selection per dataset: drop zero-variance columns, then keep
# the columns whose random-forest importance is above the mean importance.
# NOTE: this overwrites X_trains[i] / X_tests[i], so re-running the cell selects
# again from the already reduced feature set.
for i in range(len(dataset_names)):
    X_train_encoded = X_trains[i]
    X_test_encoded = X_tests[i]
    y_train = y_trains[i]

    ## First: filter method - remove features with zero variance.
    # (Named variance_filter so the builtin `filter` is not shadowed kernel-wide.)
    variance_filter = VarianceThreshold()
    variance_filter.fit(X_train_encoded)
    kept_columns = X_train_encoded.columns[variance_filter.get_support()]

    # transform() returns plain arrays; the names are restored further down.
    X_train_filtered = variance_filter.transform(X_train_encoded)
    X_test_filtered = variance_filter.transform(X_test_encoded)

    ## Second: embedded method - rank the remaining features with a random forest.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_filtered, y_train.values.ravel())
    importances = model.feature_importances_

    # Keep features that are more important than average.
    selected_features = importances > np.mean(importances)
    if not selected_features.any():
        # All importances are equal (e.g. a single feature): nothing is strictly
        # above the mean, so keep everything rather than produce an empty frame.
        selected_features = np.ones_like(selected_features, dtype=bool)

    # Convert the arrays back to DataFrames with the selected feature names,
    # keeping the original row index so rows still line up with y_trains[i].
    selected_feature_names = kept_columns[selected_features]
    X_trains[i] = pd.DataFrame(
        X_train_filtered[:, selected_features],
        columns=selected_feature_names, index=X_train_encoded.index)
    X_tests[i] = pd.DataFrame(
        X_test_filtered[:, selected_features],
        columns=selected_feature_names, index=X_test_encoded.index)

2. Better approach, but too slow: recursive feature elimination with cross-validation (RFECV)

In [None]:
# Recursive feature elimination with cross-validation, one dataset at a time.
# The surviving columns replace the entries of X_trains / X_tests.
for i, _ in enumerate(dataset_names):
    features, target = X_trains[i], y_trains[i]

    # RFECV removes one feature per round (step=1) and scores each candidate
    # subset by ROC AUC over 5 stratified folds, using a random forest as ranker.
    selector = RFECV(
        estimator=RandomForestClassifier(n_estimators=100, random_state=42),
        step=1,
        cv=StratifiedKFold(5),
        scoring='roc_auc',
    )
    selector.fit(features, target.values.ravel())

    # support_ is a boolean mask over the input columns.
    kept = features.columns[selector.support_]

    # Apply the same column subset to both splits.
    X_trains[i] = X_trains[i][kept]
    X_tests[i] = X_tests[i][kept]

3. PCA, followed by mutual-information and LightGBM-based feature selection

In [None]:
# PCA -> mutual information -> LightGBM importance, applied to a single dataset.
# NOTE: this cell overwrites X_trains[dataset_idx] / X_tests[dataset_idx]; re-running
# it would apply PCA to data that has already been projected.
dataset_idx = 0

print(f"Before PCA, X_train shape: {X_trains[dataset_idx].shape}")
print(f"Before PCA, y_train shape: {y_trains[dataset_idx].shape}")

# Step 1: hold out a validation part so the selectors are fitted on training rows only.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_trains[dataset_idx], y_trains[dataset_idx], test_size=0.2, random_state=42)

# Sanity check: features and labels must have the same number of rows.
print(f"After train_test_split, X_train_part shape: {X_train_part.shape}")
print(f"After train_test_split, y_train_part shape: {y_train_part.shape}")

# Step 2: PCA, keeping enough components to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_part)
X_valid_pca = pca.transform(X_valid)  # transform only - never refit on validation data

# Sanity check: the row counts must survive the projection.
print(f"After PCA, X_train_pca shape: {X_train_pca.shape}")
print(f"After PCA, X_valid_pca shape: {X_valid_pca.shape}")

# Step 3: mutual-information filter; keep components scoring above the mean.
# random_state makes the (randomised) estimator reproducible.
mi_scores = mutual_info_classif(X_train_pca, y_train_part.values.ravel(), random_state=42)
mi_threshold = np.mean(mi_scores)
selected_mi_features = mi_scores > mi_threshold
if not selected_mi_features.any():
    # Every score equals the mean: keep all components instead of none.
    selected_mi_features = np.ones_like(selected_mi_features, dtype=bool)

X_train_mi = X_train_pca[:, selected_mi_features]

print(f"互信息篩選後的特徵數量: {X_train_mi.shape[1]}")

# Step 4: embedded selection with LightGBM feature importances.
lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
lgb_model.fit(X_train_mi, y_train_part.values.ravel())

feature_importances = lgb_model.feature_importances_
importance_threshold = np.mean(feature_importances)
selected_lgb_features = feature_importances > importance_threshold
if not selected_lgb_features.any():
    # Same guard as above: never end up with zero features.
    selected_lgb_features = np.ones_like(selected_lgb_features, dtype=bool)

X_train_final = X_train_mi[:, selected_lgb_features]

print(f"LightGBM 篩選後的特徵數量: {X_train_final.shape[1]}")

# Step 5: apply the fitted pipeline to the full training frame and to the test frame.
# The training frame is transformed as a whole, in its original row order, so its
# rows stay aligned with y_trains[dataset_idx]. Concatenating the shuffled
# train/validation parts instead would silently mismatch features and labels.
final_feature_names = [f"Feature_{k+1}" for k in range(X_train_final.shape[1])]


def _apply_selection(frame):
    """Project `frame` with the fitted PCA and keep the selected components."""
    reduced = pca.transform(frame)[:, selected_mi_features][:, selected_lgb_features]
    return pd.DataFrame(reduced, columns=final_feature_names, index=frame.index)


# Update both X_trains and X_tests so they expose the same features.
X_trains[dataset_idx] = _apply_selection(X_trains[dataset_idx])
X_tests[dataset_idx] = _apply_selection(X_tests[dataset_idx])

# Show the final feature shapes.
print(f"Final features in X_train: {X_trains[dataset_idx].shape}")
print(f"Final features in X_test: {X_tests[dataset_idx].shape}")


## train test split & build Model
You can select an appropriate model and perform corresponding hyperparameter tuning.

In [94]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

使用VotingClassifier

In [None]:
from sklearn.ensemble import VotingClassifier

def build_stacking_model(seed=42):
    """Build the nested ensemble: soft voting -> bagging -> stacking.

    A soft-voting committee of eight heterogeneous classifiers is wrapped in a
    BaggingClassifier (10 bootstrap replicas). That bagged committee is the only
    base estimator of a StackingClassifier whose logistic-regression meta-model
    also sees the original features (passthrough=True).
    """
    # 1. Base learners for the soft-voting committee.
    base_learners = [
        ('gb1', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=seed)),
        ('gb2', xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=seed)),
        ('gb3', LGBMClassifier(n_estimators=100, random_state=seed)),
        ('gb4', RandomForestClassifier(n_estimators=100, random_state=seed)),
        ('gb5', LogisticRegression(max_iter=1000, random_state=seed)),
        ('gb6', SVC(probability=True, random_state=seed)),  # probability=True is needed for soft voting
        ('gb8', GaussianNB()),
        ('gb10', AdaBoostClassifier(n_estimators=100, random_state=seed)),
    ]

    # 2. Average the predicted probabilities of all base learners.
    voting_clf = VotingClassifier(estimators=base_learners, voting='soft', n_jobs=-1)

    # 3. Bag the whole committee.
    bagging_model = BaggingClassifier(estimator=voting_clf, n_estimators=10, random_state=seed, n_jobs=-1)

    # 4. Stack: the bagged committee feeds a logistic-regression meta-model.
    return StackingClassifier(
        estimators=[
            ('bagging', bagging_model)  # the bagged committee is the single base estimator
        ],
        final_estimator=LogisticRegression(),  # meta-model
        passthrough=True  # meta-model sees the original features plus base predictions
    )


y_predicts_voting = []
fig, ax = plt.subplots(figsize=(10, 8))

for i in range(len(dataset_names)):
    # Flatten y to 1-D (as the feature-selection cells do) so the estimators do
    # not receive an (n, 1) column vector.
    X_train_part, X_valid, y_train_part, y_valid = train_test_split(
        X_trains[i], y_trains[i].values.ravel(), test_size=0.2, random_state=42)

    # train the stacking model
    stacking_model = build_stacking_model()
    stacking_model.fit(X_train_part, y_train_part)

    # Score the positive-class probability on the validation split.
    y_pred_proba = stacking_model.predict_proba(X_valid)[:, 1]
    roc_auc = roc_auc_score(y_valid, y_pred_proba)

    print(f"{i} times finish")
    print(f"AUC: {roc_auc:.2f}")

    # Predict the test split with the trained model and keep the result for export.
    y_test_pred = stacking_model.predict_proba(X_tests[i])[:, 1]
    df = pd.DataFrame(y_test_pred, columns=['y_predict_proba'])
    y_predicts_voting.append(df)

    # Add this dataset's ROC curve to the shared figure.
    fpr, tpr, thresholds = roc_curve(y_valid, y_pred_proba)
    ax.plot(fpr, tpr, lw=2, label=f'{dataset_names[i]} (AUC = {roc_auc:.2f})')

# Chance-level diagonal for reference.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()


In [98]:
# Write each dataset's predicted probabilities next to its input files.
for position in range(len(dataset_names)):
    output_path = f'./Competition_data/{dataset_names[position]}/y_predict.csv'
    predictions = y_predicts_voting[position]
    predictions.to_csv(output_path, index=False, header=True)