# Read All Dataset CSV

In [1]:
import os
import csv
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

We attempted to use `LabelEncoder` and `OneHotEncoder` to transform categorical data, and `MinMaxScaler` and `StandardScaler` to transform numerical data. After several trials, we decided to use `OneHotEncoder` for categorical data and `MinMaxScaler` for numerical data. The reasons are as follows:

1. Categorical data does not have an inherent order. Using `LabelEncoder` might inadvertently create ordinal features.
2. Numerical data exhibits significant variance across columns. Using `StandardScaler` might reduce the 'distance' features; in addition, `MinMaxScaler` worked better for XGBoost and Random Forest (RF).

Therefore, we opted for `OneHotEncoder` and `MinMaxScaler`.

In [3]:
# Parallel containers: position k in every list refers to the same dataset folder
dataset_names=[]
X_trains=[]
y_trains=[]
X_tests=[]

# os.listdir returns entries in arbitrary order; sorting keeps the dataset
# indices (e.g. X_trains[2]) stable between runs and machines
for folder_name in sorted(os.listdir("./Competition_data")):

    # skip stray files (e.g. .DS_Store, README) that are not dataset folders,
    # otherwise the read_csv calls below fail on them
    if not os.path.isdir(os.path.join("./Competition_data", folder_name)):
        continue

    # read the data
    x_train = pd.read_csv(f"./Competition_data/{folder_name}/X_train.csv",header=0)
    y_train = pd.read_csv(f"./Competition_data/{folder_name}/y_train.csv",header=0)
    x_test = pd.read_csv(f"./Competition_data/{folder_name}/X_test.csv",header=0)

    # Initialize only the transformers that are actually used below
    OneHotEncoder_encoder = OneHotEncoder()
    minmax_encoder = MinMaxScaler()

    # separate the columns: 'float64' columns are treated as numerical,
    # every other dtype is treated as categorical
    numerical_columns = [c for c in x_train.columns if x_train[c].dtype == 'float64']
    categorical_columns = [c for c in x_train.columns if x_train[c].dtype != 'float64']

    # copy to avoid changing the original data
    X_train_encoded = x_train.copy()
    X_test_encoded = x_test.copy()

    ## == apply one hot encoding to categorical columns == ##
    ## =================================================== ##
    for col in categorical_columns:
        # Fit the one hot encoder on the combined train and test values to avoid unseen labels
        combined_data = pd.concat([x_train[col], x_test[col]], axis=0)
        OneHotEncoder_encoder.fit(combined_data.values.reshape(-1, 1))

        # Transform the train and test data
        ## reshape(-1, 1) converts the 1D column to the 2D shape the encoder expects
        ## the encoder returns a sparse matrix, so it is converted to a dense array
        train_encoded = OneHotEncoder_encoder.transform(x_train[col].values.reshape(-1, 1)).toarray()
        test_encoded = OneHotEncoder_encoder.transform(x_test[col].values.reshape(-1, 1)).toarray()

        # Name the new columns "<original column>_<category position>"
        train_encoded_df = pd.DataFrame(train_encoded, columns=[f"{col}_{int(i)}" for i in range(train_encoded.shape[1])])
        test_encoded_df = pd.DataFrame(test_encoded, columns=[f"{col}_{int(i)}" for i in range(test_encoded.shape[1])])

        # Append the one hot columns and drop the original categorical column
        X_train_encoded = pd.concat([X_train_encoded, train_encoded_df], axis=1).drop(columns=[col])
        X_test_encoded = pd.concat([X_test_encoded, test_encoded_df], axis=1).drop(columns=[col])

    ## == apply minmax scaler to numerical columns cuz it's better for  XGBOOST and RF== ##
    ## ================================================================================= ##
    for col in numerical_columns:
        # the scaler is refit per column on the combined train and test values
        combined_data = pd.concat([x_train[col], x_test[col]], axis=0)
        minmax_encoder.fit(combined_data.values.reshape(-1, 1))

        X_train_encoded[col] = minmax_encoder.transform(x_train[col].values.reshape(-1, 1))
        X_test_encoded[col] = minmax_encoder.transform(x_test[col].values.reshape(-1, 1))


    # store the results for this dataset
    dataset_names.append(folder_name)
    X_trains.append(X_train_encoded)
    y_trains.append(y_train)
    X_tests.append(X_test_encoded)

        

Check whether the encoded data was stored correctly in `X_trains`.

In [None]:
# Spot check: display the encoded training frame stored at index 2 of X_trains
X_trains[2]


# Data Preprocessing & Feature Engineering

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import  VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif
import lightgbm as lgb
from sklearn.model_selection import train_test_split

## 1. VarianceThreshold and embedded method
Use `VarianceThreshold` to remove features with low variance, then use a Random Forest (RF) to pick the important features.

In [6]:
# For every dataset: drop low-variance features, then keep only the features whose
# random-forest importance is above the mean importance.
# NOTE: this overwrites X_trains[i] / X_tests[i], so re-running the cell selects again
# from the already-reduced frames.
for i in range(len(dataset_names)):

    X_train_encoded = X_trains[i]
    X_test_encoded = X_tests[i]
    y_train = y_trains[i]

    ## First: use Filter method to remove features with low variance
    # (named variance_filter so the builtin `filter` is not shadowed for later cells)
    variance_filter = VarianceThreshold()

    # Fit the filter on the train data only
    variance_filter.fit(X_train_encoded)

    # Transform the train and test data with the same fitted filter
    X_train_encoded = variance_filter.transform(X_train_encoded)
    X_test_encoded = variance_filter.transform(X_test_encoded)

    ## Second: use embedded method to select features
    # Initialize the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Fit the model (labels flattened to 1D)
    model.fit(X_train_encoded, y_train.values.ravel())

    # Get feature importances
    importances = model.feature_importances_

    # Boolean mask: keep features whose importance is above the mean importance
    selected_features = importances > np.mean(importances)

    # Recover the surviving column names: first apply the variance-filter mask,
    # then the importance mask, and rebuild DataFrames from the selected array columns
    selected_feature_names = X_trains[i].columns[variance_filter.get_support()][selected_features]
    X_trains[i] = pd.DataFrame(X_train_encoded[:, selected_features], columns=selected_feature_names)
    X_tests[i] = pd.DataFrame(X_test_encoded[:, selected_features], columns=selected_feature_names)

### Conclusion:
1. It ignores the interaction effects between features
2. It selects features improperly

=> rejected (not used)

## 2. PCA + Mutual Information + LightGBM
A good way to reduce dimensionality while letting mutual information keep the interaction effects.
Finally, use LightGBM to pick the important features.

In [None]:
# split the data into training and validation sets
X_train_part, X_valid, y_train_part, y_valid = train_test_split(X_trains[0], y_trains[0], test_size=0.2, random_state=42)


# 1. use PCA to reduce the dimensionality of the data  #
pca = PCA(n_components=0.95)  
X_train_pca = pca.fit_transform(X_train_part)
X_valid_pca = pca.transform(X_valid) 


# 2. use Mutual Information to select high-quality features  #
## use mean score as threshold
mi_scores = mutual_info_classif(X_train_pca, y_train_part.values.ravel())  
mi_threshold = np.mean(mi_scores)  
selected_mi_features = mi_scores > mi_threshold

## keep the selected features
X_train_mi = X_train_pca[:, selected_mi_features]
X_valid_mi = X_valid_pca[:, selected_mi_features]  


# 3. use LightGBM to pick the best features  #
lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
lgb_model.fit(X_train_mi, y_train_part.values.ravel()) 

## get the feature importances and select the features with importance higher than the mean importance
feature_importances = lgb_model.feature_importances_
importance_threshold = np.mean(feature_importances)  
selected_lgb_features = feature_importances > importance_threshold

## keep the selected features
X_train_final = X_train_mi[:, selected_lgb_features]
X_valid_final = X_valid_mi[:, selected_lgb_features]  


# 4. input the selected features back to DataFrame
X_train_final_df = pd.DataFrame(X_train_final, columns=[f"Feature_{i+1}" for i in range(X_train_final.shape[1])])
X_valid_final_df = pd.DataFrame(X_valid_final, columns=[f"Feature_{i+1}" for i in range(X_valid_final.shape[1])])

# 更新 X_trains 和 X_tests
X_trains[0] = pd.concat([X_train_final_df,X_valid_final_df],axis=0) 
X_tests[0] = pd.concat([X_train_final_df,X_valid_final_df],axis=0)  #有問題需要修改


### Conclusion:
We don't know why the sample size of `X_train` decreases after the PCA, and we could not find out where the problem is.

=> rejected (not used)

## 3. RFECV
Recursively eliminate features and, in combination with cross-validation, select the optimal feature subset.

In [None]:
# Recursive feature elimination with cross-validation, run once per dataset.
# The surviving columns replace the corresponding entries of X_trains / X_tests.
for i in range(len(dataset_names)):
    x_train, y_train = X_trains[i], y_trains[i]

    # Random forest ranks the features; one feature is removed per step and each
    # candidate subset is scored by ROC AUC over 5 stratified folds
    rfecv = RFECV(
        estimator=RandomForestClassifier(n_estimators=100, random_state=42),
        step=1,
        cv=StratifiedKFold(5),
        scoring='roc_auc',
    )
    rfecv.fit(x_train, y_train.values.ravel())

    # Column labels kept by the selector (support_ is a boolean mask over the columns)
    selected_features = x_train.columns[rfecv.support_]

    # Restrict both frames to the same selected columns
    X_trains[i] = x_train[selected_features]
    X_tests[i] = X_tests[i][selected_features]

### Conclusion:
It takes too much time

=> rejected (not used)

#### Therefore, we gave up on using feature engineering

# train test split & build Model
Select an appropriate model and perform the corresponding hyperparameter tuning.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

### Strategy: Boosting first, Bagging second, Stacking third
1. Boosting first: use multiple classifiers as base classifiers
2. Bagging second: enhance stability and reduce variance through Bagging with random sampling
3. Stacking third: use the output of Bagging as features to train a final meta-model for the ultimate prediction. We chose `LogisticRegression` as the final meta-model

Therefore, we tried different combinations of boosting classifiers to get a better AUC score.

For example:

In [None]:
# 1.Boosting first: Initialize the boosting models
boosting_clf1 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
boosting_clf2 = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
boosting_clf3 = LGBMClassifier(n_estimators=100, random_state=42)
boosting_clf4 = RandomForestClassifier(n_estimators=100, random_state=42)
boosting_clf5 = CatBoostClassifier(iterations=100, random_state=42)



# 2.Bagging second: use VotingClassifier to combine the boosting models
votin_clf= VotingClassifier(estimators=[
            ('gb1', boosting_clf1),
            ('gb2', boosting_clf2),
            ('gb3', boosting_clf3),
            ('gb4', boosting_clf4),
            ('gb5', boosting_clf5),],voting='soft', n_jobs=-1)

and this combination:

In [None]:
# 1.Boosting first: Initialize the boosting models
boosting_clf1 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
boosting_clf2 = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
boosting_clf3 = LGBMClassifier(n_estimators=100, random_state=42)
boosting_clf4 = RandomForestClassifier(n_estimators=100, random_state=42)
boosting_clf5 = LogisticRegression(max_iter=1000, random_state=42)
boosting_clf6 = SVC(probability=True, random_state=42)


# 2.Bagging second: use VotingClassifier to combine the boosting models
votin_clf= VotingClassifier(estimators=[
            ('gb1', boosting_clf1),
            ('gb2', boosting_clf2),
            ('gb3', boosting_clf3),
            ('gb4', boosting_clf4),
            ('gb5', boosting_clf5),
            ('gb6', boosting_clf6),],voting='soft', n_jobs=-1)

and this combination:

In [None]:
# 1.Boosting first: Initialize the boosting models
boosting_clf1 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
boosting_clf2 = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
boosting_clf3 = LGBMClassifier(n_estimators=100, random_state=42)
boosting_clf4 = RandomForestClassifier(n_estimators=100, random_state=42)
boosting_clf5 = LogisticRegression(max_iter=1000, random_state=42)
boosting_clf6 = SVC(probability=True, random_state=42)
boosting_clf8 = GaussianNB()
boosting_clf10 = AdaBoostClassifier(n_estimators=100, random_state=42)



# 2.Bagging second: use VotingClassifier to combine the boosting models
votin_clf= VotingClassifier(estimators=[
        ('gb1', boosting_clf1),
        ('gb2', boosting_clf2),
        ('gb3', boosting_clf3),
        ('gb4', boosting_clf4),
        ('gb5', boosting_clf5),
        ('gb6', boosting_clf6),
        ('gb8', boosting_clf8),
        ('gb10', boosting_clf10)],voting='soft', n_jobs=-1)

Finally, we found that the following combination performed best, reaching an AUC score of 0.863.

In [None]:
# Train the voting -> bagging -> stacking model on every dataset, report validation
# accuracy / AUC, collect test-set probabilities and draw one ROC curve per dataset.
y_predicts_voting = []
plt.figure(figsize=(10, 8)) # plot AUC curve to check the performance of the model

for i in range(len(dataset_names)):
    X_train_part, X_valid, y_train_part, y_valid = train_test_split(X_trains[i], y_trains[i], test_size=0.2, random_state=42)

    # Flatten the single-column label frames to 1D arrays, as the other cells do;
    # passing a column vector to fit() triggers sklearn's DataConversionWarning
    y_train_part = y_train_part.values.ravel()
    y_valid = y_valid.values.ravel()

    ##  Boosting first, Bagging second, Stacking third

    # 1.Boosting first: Initialize the base models
    boosting_clf1 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    boosting_clf2 = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
    boosting_clf3 = RandomForestClassifier(n_estimators=100, random_state=42)

    # 2.Bagging second: combine the base models with soft voting, then bag 10 copies of that voter
    votin_clf= VotingClassifier(estimators=[
                ('gb1', boosting_clf1),
                ('gb2', boosting_clf2),
                ('gb3', boosting_clf3),],voting='soft', n_jobs=-1)

    bagging_model = BaggingClassifier(estimator=votin_clf,n_estimators=10,random_state=42,n_jobs=-1)

    # 3.Stacking third: Initialize the stacking model
    stacking_model = StackingClassifier(
        estimators=[ ('bagging', bagging_model)],# use the bagging model as the base model
        final_estimator=LogisticRegression(),  # Meta-model
        passthrough=True  # Retain the original features and the output of the base classifiers.
    )

    # train the stacking model on the training part only
    stacking_model.fit(X_train_part, y_train_part)

    # predict the validation set (column 1 = probability of the second class in classes_)
    y_pred = stacking_model.predict(X_valid)
    y_pred_proba = stacking_model.predict_proba(X_valid)[:, 1]

    # Calculate the accuracy and AUC on the validation set
    accuracy = accuracy_score(y_valid, y_pred)
    roc_auc1 = roc_auc_score(y_valid, y_pred_proba)

    print(f"{i} times finish")
    print(f"Accuracy: {accuracy:.2f}")  # previously computed but never reported
    print(f"AUC: {roc_auc1:.2f}")

    # store the prediction results for the test set
    y_test_pred = stacking_model.predict_proba(X_tests[i])[:, 1]
    df = pd.DataFrame(y_test_pred, columns=['y_predict_proba'])
    y_predicts_voting.append(df)

    # Plot the ROC curve; the legend reuses the AUC computed above instead of
    # recomputing the same area from the curve
    fpr, tpr, _ = roc_curve(y_valid, y_pred_proba)
    plt.plot(fpr, tpr, lw=2, label=f'{dataset_names[i]} (AUC = {roc_auc1:.2f})')

## add the diagonal line and set the title and labels
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()


# Save result

In [None]:
# Write each dataset's predicted probabilities next to its input files.
# Predictions are matched to folders by position in the two lists.
for position, name in enumerate(dataset_names):
    output_path = f'./Competition_data/{name}/y_predict.csv'
    y_predicts_voting[position].to_csv(output_path, index=False, header=True)