In [6]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import category_encoders as ce

from transformers.DropHighNaNColumnsTransformer import DropHighNaNColumnsTransformer


In [7]:
data = pd.read_csv('./db/final_proj_data.csv')
#valid = pd.read_csv('../final_proj_data.csv')

data = data.dropna(subset='y')

X_train, X_test, y_train, y_test = (
    train_test_split(
        data.drop('y', axis=1),
        data['y'],
        test_size=0.2,
        random_state=42))

In [8]:
cat_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', ce.TargetEncoder().set_output(transform='pandas')),
    ])

num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
    ])


col_processor = (ColumnTransformer(
    transformers=[
        ('cat',
         cat_transformer,
         make_column_selector(dtype_include=object)),
        ('num',
         num_transformer,
         make_column_selector(dtype_exclude=object))],
    n_jobs=-1,
    verbose_feature_names_out=False)
    .set_output(transform='pandas'))


clf_estimator = GradientBoostingClassifier(random_state=42)


clf_pipe_model = (ImbPipeline(
    steps=[
        ('drop_nan_cols', DropHighNaNColumnsTransformer()),
        ('col_processor', col_processor),
        ('scaler', StandardScaler().set_output(transform='pandas')),
        ("smote", SMOTE(random_state=42)),
        ("pca", PCA(n_components=0.95).set_output(transform="pandas")),
        ('clf_estimator', clf_estimator)
    ]))


clf_pipe_model.fit(X_train, y_train)


In [9]:
X_test_transform = clf_pipe_model[:-1].transform(X_test)
print(X_test_transform)

          pca0      pca1      pca2      pca3      pca4      pca5      pca6  \
6252  0.763913 -1.192009  1.996731 -1.047676 -1.032413 -0.116784 -3.304672   
4684  9.401701 -1.705367  3.682542  1.434952  0.632595 -0.434353  0.639783   
1731  3.462479 -0.603903  1.049190 -1.261440 -0.743048  0.378723 -3.436263   
4742  3.534335 -1.537835 -2.409530  1.159485 -0.209959 -0.066964  2.455098   
4521  0.399360 -0.018500  0.081943 -1.972544 -0.658241  0.440032 -0.270284   
...        ...       ...       ...       ...       ...       ...       ...   
6412 -0.799912  0.318100 -0.601220 -0.880984  0.166449  0.219184  1.824835   
8285 -1.929844 -0.702132 -1.493151 -0.629734 -0.856116  0.018656  0.153544   
7853 -0.519256 -0.878140  2.884158 -1.058209 -0.780813  0.131372  0.117648   
1095  6.729153 -1.408774 -1.761556  0.271287 -0.039100  0.107634 -1.093140   
6929 -1.015890 -0.232734  0.712433  0.582484 -0.014475 -0.083554  1.187830   

          pca7      pca8      pca9  ...     pca42     pca43    

In [10]:
pred_pipe = clf_pipe_model.predict(X_test)
pred_pipe

array([0, 0, 0, ..., 0, 0, 0])