In [3]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

df = pd.read_csv("training_set_features.csv")


# Define ordinal encoding mappings
age_group_mapping = {'18 - 34 Years': 0, '35 - 44 Years': 1, '45 - 54 Years': 2, '55 - 64 Years': 3, '65+ Years': 4}
education_mapping = {'< 12 Years': 0, '12 Years': 1, 'Some College': 2, 'College Graduate': 3}
income_poverty_mapping = {'Below Poverty': 0, '<= $75,000, Above Poverty': 1, '> $75,000': 2}

# Apply ordinal encoding
df['age_group'] = df['age_group'].map(age_group_mapping)
df['education'] = df['education'].map(education_mapping)
df['income_poverty'] = df['income_poverty'].map(income_poverty_mapping)

# Impute missing values using KNNImputer
columns_to_impute = ['age_group', 'education', 'income_poverty']

# Perform MICE imputation
imputer = IterativeImputer()
df_imputed = pd.DataFrame(imputer.fit_transform(df[columns_to_impute]), columns=columns_to_impute)
df[['age_group','education','income_poverty']].head()

Unnamed: 0,age_group,education,income_poverty
0,3,0.0,0.0
1,1,1.0,0.0
2,0,3.0,1.0
3,4,1.0,0.0
4,2,2.0,1.0


In [3]:


import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


# Assuming df is your dataframe
categorical_columns = df.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
encoder = OneHotEncoder(drop='if_binary', sparse=False)
X_encoded = encoder.fit_transform(df[categorical_columns])

# Construct dataframe with encoded columns
encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Combine encoded dataframe with non-categorical columns
df_encoded = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)

# Perform MICE imputation
imputer = IterativeImputer()
df_imputed = pd.DataFrame(imputer.fit_transform(df_encoded), columns=df_encoded.columns)

# Now df_imputed contains the modified original columns with imputed values




In [5]:
df_imputed.isna().sum()

respondent_id                     0
h1n1_concern                      0
h1n1_knowledge                    0
behavioral_antiviral_meds         0
behavioral_avoidance              0
                                 ..
employment_occupation_xgwztkwe    0
employment_occupation_xqwwgdyp    0
employment_occupation_xtkaffoo    0
employment_occupation_xzmlyyjv    0
employment_occupation_nan         0
Length: 101, dtype: int64

In [5]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
df = pd.read_csv("training_set_features.csv")
for column in df[['household_adults', 'household_children']]:
    unique_values = df[column].dropna().unique()  # Drop missing values before finding unique values
    missing_values = df[column].isnull().sum()
    print(f"Column: {column}")
    print(f"Unique values: {unique_values}")
    print(f"Number of missing values: {missing_values}")

Column: household_adults
Unique values: [0. 2. 1. 3.]
Number of missing values: 249
Column: household_children
Unique values: [0. 3. 2. 1.]
Number of missing values: 249


In [5]:
import pandas as pd


y = pd.read_csv("training_set_labels.csv")
for col in y:
    unique_values = y[col].unique()
    print(f"Column: {col}")
    print(f"Unique values: {unique_values}")
    print("no.of unique: ", y[col].unique().sum())
    print("-" * 20)
print(y['seasonal_vaccine'].value_counts())
print(y['h1n1_vaccine'].value_counts())

Column: respondent_id
Unique values: [    0     1     2 ... 26704 26705 26706]
no.of unique:  356618571
--------------------
Column: h1n1_vaccine
Unique values: [0 1]
no.of unique:  1
--------------------
Column: seasonal_vaccine
Unique values: [0 1]
no.of unique:  1
--------------------
seasonal_vaccine
0    14272
1    12435
Name: count, dtype: int64
h1n1_vaccine
0    21033
1     5674
Name: count, dtype: int64


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from tqdm import tqdm

# Load the datasets
X = pd.read_csv("training_set_features.csv")
X = X.drop(['employment_industry', 'employment_occupation', 'respondent_id'], axis=1)
y = pd.read_csv("training_set_labels.csv").drop(['respondent_id', 'seasonal_vaccine'], axis=1)
z = pd.read_csv("training_set_labels.csv").drop(['respondent_id', 'h1n1_vaccine'], axis=1)

# Define numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Define preprocessing steps
numerical_transformer = SimpleImputer(strategy='median')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define parameter search space
param_space = {
    'model__base_estimator__n_estimators': [225, 200, 250, 275, 212],
    'model__base_estimator__max_depth': [3, 8, 9, 10, 11],
    'model__base_estimator__learning_rate': [0.155, 0.18, 0.16, 0.15, 0.17]
}

# Define stratified k-fold cross-validator
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to perform hyperparameter tuning, k-fold cross-validation, and evaluation
def evaluate_model(X_data, y_data):
    # Compute scale_pos_weight based on the class distribution in the training data
    scale_pos_weight = (len(y_data) - y_data.sum()) / y_data.sum()
    scale_pos_weight = scale_pos_weight.iloc[0]

    # Define the XGBoost model with scale_pos_weight
    model_xgb = XGBClassifier(scale_pos_weight=scale_pos_weight)

    # Define the BaggingClassifier with XGBClassifier as the base estimator
    model_bagging = BaggingClassifier(base_estimator=model_xgb, n_estimators=10)

    # Create pipeline with preprocessing and modeling steps
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', model_bagging)])

    # Perform hyperparameter tuning with RandomizedSearchCV and stratified k-fold cross-validation
    random_search = RandomizedSearchCV(
        clf,
        param_distributions=param_space,
        n_iter=10,
        cv=skf,
        scoring='accuracy',
        verbose=2,
        random_state=42,
        n_jobs=-1  # Use all available CPU cores for parallel computation
    )

    # Fit the model using RandomizedSearchCV
    random_search.fit(X_data, y_data)

    # Print best hyperparameters
    print("Best Hyperparameters:", random_search.best_params_)
    print()

    # Get the best model
    best_model = random_search.best_estimator_

    # Perform stratified k-fold testing with the best model
    accuracy_scores = []
    classification_reports = []
    roc_auc_scores = []

    # Initialize tqdm progress bar
    with tqdm(total=skf.get_n_splits(X_data, y_data)) as pbar:
        for train_index, test_index in skf.split(X_data, y_data):
            X_train_skf, X_test_skf = X_data.iloc[train_index], X_data.iloc[test_index]
            y_train_skf, y_test_skf = y_data.iloc[train_index], y_data.iloc[test_index]

            # Fit the best model
            best_model.fit(X_train_skf, y_train_skf)

            # Predict on the test set
            y_pred_skf = best_model.predict(X_test_skf)

            # Evaluate the model
            accuracy = accuracy_score(y_test_skf, y_pred_skf)
            accuracy_scores.append(accuracy)

            classification_rep = classification_report(y_test_skf, y_pred_skf)
            classification_reports.append(classification_rep)

            y_pred_proba = best_model.predict_proba(X_test_skf)[:, 1]  # Probability of positive class
            roc_auc = roc_auc_score(y_test_skf, y_pred_proba)
            roc_auc_scores.append(roc_auc)

            # Update tqdm progress bar
            pbar.update(1)

    # Calculate mean scores across all fold
    mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
    mean_roc_auc = sum(roc_auc_scores) / len(roc_auc_scores)

    print("Mean Accuracy:", mean_accuracy)
    print("Mean ROC AUC Score:", mean_roc_auc)

    # Print classification reports for each fold
    for i, report in enumerate(classification_reports):
        print(f"\nClassification Report for Fold {i+1}:\n{report}")

# Evaluate model for dataset y
print("Evaluation for Dataset y:")
evaluate_model(X, y)

# Evalu ate model for dataset z
print("\nEvaluation for Dataset z:")
evaluate_model(X, z)


In [7]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from pytorch_tabnet.tab_model import TabNetClassifier

# Load the datasets
X = pd.read_csv("training_set_features.csv")
X = X.drop(['employment_industry', 'employment_occupation','respondent_id'], axis=1)
y = pd.read_csv("training_set_labels.csv").drop(['respondent_id', 'seasonal_vaccine'], axis=1)

# Preprocessing
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess data using the pipeline
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Define TabNet model
tabnet_model = TabNetClassifier()

# Train the model
tabnet_model.fit(X_train, y_train.values.ravel(), eval_set=[(X_test, y_test.values.ravel())], patience=10)

# Evaluate the model
y_pred = tabnet_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
roc_score = roc_auc_score(y_test, y_pred)
classification_report_text = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("ROC AUC Score:", roc_score)
print("Classification Report:\n", classification_report_text)




epoch 0  | loss: 0.54049 | val_0_auc: 0.68342 |  0:00:02s
epoch 1  | loss: 0.47489 | val_0_auc: 0.73437 |  0:00:04s
epoch 2  | loss: 0.45555 | val_0_auc: 0.76749 |  0:00:07s
epoch 3  | loss: 0.44236 | val_0_auc: 0.77983 |  0:00:09s
epoch 4  | loss: 0.43623 | val_0_auc: 0.78652 |  0:00:12s
epoch 5  | loss: 0.42295 | val_0_auc: 0.79896 |  0:00:14s
epoch 6  | loss: 0.41681 | val_0_auc: 0.80147 |  0:00:16s
epoch 7  | loss: 0.41475 | val_0_auc: 0.80394 |  0:00:18s
epoch 8  | loss: 0.40443 | val_0_auc: 0.81185 |  0:00:20s
epoch 9  | loss: 0.40108 | val_0_auc: 0.81801 |  0:00:22s
epoch 10 | loss: 0.39979 | val_0_auc: 0.81705 |  0:00:25s
epoch 11 | loss: 0.39696 | val_0_auc: 0.82108 |  0:00:28s
epoch 12 | loss: 0.39578 | val_0_auc: 0.82096 |  0:00:30s
epoch 13 | loss: 0.3943  | val_0_auc: 0.82103 |  0:00:32s
epoch 14 | loss: 0.39261 | val_0_auc: 0.82081 |  0:00:34s
epoch 15 | loss: 0.39109 | val_0_auc: 0.82115 |  0:00:37s
epoch 16 | loss: 0.38851 | val_0_auc: 0.81761 |  0:00:39s
epoch 17 | los



Accuracy: 0.8382628229127668
ROC AUC Score: 0.6944324685475127
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.94      0.90      4212
           1       0.68      0.45      0.54      1130

    accuracy                           0.84      5342
   macro avg       0.77      0.69      0.72      5342
weighted avg       0.82      0.84      0.82      5342



In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from tqdm import tqdm

# Load the datasets
X = pd.read_csv("training_set_features.csv")
X = X.drop(['employment_industry', 'employment_occupation', 'respondent_id', 'health_insurance'], axis=1)
y = pd.read_csv("training_set_labels.csv").drop(['respondent_id', 'seasonal_vaccine'], axis=1)
z = pd.read_csv("training_set_labels.csv").drop(['respondent_id', 'h1n1_vaccine'], axis=1)

# Define numerical and categorical features
numerical_features = X.select_dtypes(include=['float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Pipeline for preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=42, max_iter=50))
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', XGBClassifier())])

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from tabnet import TabNetClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from fancyimpute import IterativeImputer
from sklearn.impute import SimpleImputer

# Load data
X = pd.read_csv("training_set_features.csv")
X.drop(['employment_industry', 'employment_occupation', 'respondent_id', 'health_insurance'], axis=1, inplace=True)
y = pd.read_csv("training_set_labels.csv").drop(['respondent_id'], axis=1)  # Assuming both 'h1n1_vaccine' and 'seasonal_vaccine' are target variables

# Separate target variables
y_h1n1 = y['h1n1_vaccine']
y_seasonal = y['seasonal_vaccine']

# Separate numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Oversampling for class imbalance
smote = SMOTE(random_state=42)

# GridSearchCV for hyperparameter tuning with regularization
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
param_grid = {
    'feature_dim': [32, 64, 128],  # Adjust based on feature importance and computational resources
    'output_dim': 2,  # Number of target variables
    'n_decision_trees': [50, 100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.001, 0.01],
    'optimizer': ['adam', 'adamw'],
    'scheduler': ['cosineannealinglr', 'reducelronplateau'],
    'lambda_sparse': [0.001, 0.01, 0.1],
    'seed': [42, 100]
}
model = TabNetClassifier(
    cat_cols=categorical_features,  # Specify categorical features for regularization
    cat_embed_dim=16  # Adjust embedding dimension for categorical features
)
grid_search = GridSearchCV(model, param_grid, scoring='roc_auc_macro', cv=kfold)

# Train model with hyperparameter tuning, regularization, and oversampling
grid_search.fit(X_imputed, [y_h1n1, y_seasonal])
best_model = grid_search.best_estimator_

# Access best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Store metrics and reports
mean_roc_h1n1, mean_roc_seasonal = 0, 0
mean_accuracy_h1n1, mean_accuracy_seasonal = 0, 0
classification_reports_h1n1, classification_reports_seasonal = [], []

for fold, (train_index, test_index) in enumerate(kfold.split(X_imputed, [y_h1n1, y_seasonal])):
    X_train, X_test = X_imputed[train_index], X_imputed[test_index]
    y_train_h1n1, y_test_h1n1 = y_h1n1.iloc[train_index], y_test_h1n1
    y_train_seasonal, y_test_seasonal = y_seasonal.iloc[train_index], y_test_seasonal
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Oversample training data for h1n1 and seasonal vaccines
    X_train_resampled_h1n1, y_train_h1n1_resampled = smote.fit_resample(X_train_preprocessed, y_train_h1n1)
    X_train_resampled_seasonal, y_train_seasonal_resampled = smote.fit_resample(X_train_preprocessed, y_train_seasonal)

    # Train model on oversampled data (replace with undersampling if preferable)
    best_model.fit(X_train_resampled_h1n1, y_train_h1n1_resampled, X_train_resampled_seasonal, y_train_seasonal_resampled)

    # Prediction and evaluation
    y_pred_h1n1, y_pred_prob_h1n1 = best_model.predict(X_test_preprocessed)
    y_pred_seasonal, y_pred_prob_seasonal = best_model.predict(X_test_preprocessed)

    # Calculate ROC AUC score
    roc_auc_h1n1 = roc_auc_score(y_test_h1n1, y_pred_prob_h1n1[:, 0])
    roc_auc_seasonal = roc_auc_score(y_test_seasonal, y_pred_prob_seasonal[:, 1])
    mean_roc_h1n1 += roc_auc_h1n1
    mean_roc_seasonal += roc_auc_seasonal

    # Calculate accuracy
    accuracy_h1n1 = accuracy_score(y_test_h1n1, y_pred_h1n1)
    accuracy_seasonal = accuracy_score(y_test_seasonal, y_pred_seasonal)
    mean_accuracy_h1n1 += accuracy_h1n1
    mean_accuracy_seasonal += accuracy_seasonal

    # Generate classification reports
    classification_reports_h1n1.append(classification_report(y_test_h1n1, y_pred_h1n1))
    classification_reports_seasonal.append(classification_report(y_test_seasonal, y_pred_seasonal))

# Print aggregated metrics
print("Mean ROC AUC (h1n1):", mean_roc_h1n1 / kfold.n_splits)
print("Mean ROC AUC (seasonal):", mean_roc_seasonal / kfold.n_splits)
print("Mean accuracy (h1n1):", mean_accuracy_h1n1 / kfold.n_splits)
print("Mean accuracy (seasonal):", mean_accuracy_seasonal / kfold.n_splits)

# Print or save detailed classification reports
print("Classification reports:")
for report in classification_reports_h1n1:
    print("h1n1 vaccine:")
    print(report)
for report in classification_reports_seasonal:
    print("seasonal vaccine:")
    print(report)

NameError: name 'Pipeline' is not defined

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from tabnet import TabNetClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from fancyimpute import IterativeImputer  # Importing IterativeImputer from fancyimpute
from sklearn.impute import SimpleImputer

# Load data
X = pd.read_csv("training_set_features.csv")
X.drop(['employment_industry', 'employment_occupation', 'respondent_id', 'health_insurance'], axis=1, inplace=True)
y = pd.read_csv("training_set_labels.csv").drop(['respondent_id'], axis=1)  # Assuming both 'h1n1_vaccine' and 'seasonal_vaccine' are target variables

# Separate target variables
y_h1n1 = y['h1n1_vaccine']
y_seasonal = y['seasonal_vaccine']

numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Oversampling for class imbalance
smote = SMOTE(random_state=42)

# GridSearchCV for hyperparameter tuning with regularization
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
param_grid = {
    'feature_dim': [32, 64, 128],  # Adjust based on feature importance and computational resources
    'output_dim': 2,  # Number of target variables
    'n_decision_trees': [50, 100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.001, 0.01],
    'optimizer': ['adam', 'adamw'],
    'scheduler': ['cosineannealinglr', 'reducelronplateau'],
    'lambda_sparse': [0.001, 0.01, 0.1],
    'seed': [42, 100]
}
model = TabNetClassifier(
    cat_cols=categorical_features,  # Specify categorical features for regularization
    cat_embed_dim=16  # Adjust embedding dimension for categorical features
)
grid_search = GridSearchCV(model, param_grid, scoring='roc_auc_macro', cv=kfold)

# Train model with hyperparameter tuning, regularization, and oversampling
grid_search.fit(X_imputed, [y_h1n1, y_seasonal])
best_model = grid_search.best_estimator_

# Access best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Store metrics and reports
mean_roc_h1n1, mean_roc_seasonal = 0, 0
mean_accuracy_h1n1, mean_accuracy_seasonal = 0, 0
classification_reports_h1n1, classification_reports_seasonal = [], []

for fold, (train_index, test_index) in enumerate(kfold.split(X_imputed, [y_h1n1, y_seasonal])):
    X_train, X_test = X_imputed[train_index], X_imputed[test_index]
    y_train_h1n1, y_test_h1n1 = y_h1n1.iloc[train_index], y_h1n1.iloc[test_index]  # Fix variable names here
    y_train_seasonal, y_test_seasonal = y_seasonal.iloc[train_index], y_seasonal.iloc[test_index]  # Fix variable names here
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Oversample training data for h1n1 and seasonal vaccines
    X_train_resampled_h1n1, y_train_h1n1_resampled = smote.fit_resample(X_train_preprocessed, y_train_h1n1)
    X_train_resampled_seasonal, y_train_seasonal_resampled = smote.fit_resample(X_train_preprocessed, y_train_seasonal)

    # Train model on oversampled data (replace with undersampling if preferable)
    best_model.fit(X_train_resampled_h1n1, y_train_h1n1_resampled, X_train_resampled_seasonal, y_train_seasonal_resampled)

    # Prediction and evaluation
    y_pred_h1n1, y_pred_prob_h1n1 = best_model.predict(X_test_preprocessed)
    y_pred_seasonal, y_pred_prob_seasonal = best_model.predict(X_test_preprocessed)

    # Calculate ROC AUC score
    roc_auc_h1n1 = roc_auc_score(y_test_h1n1, y_pred_prob_h1n1[:, 0])
    roc_auc_seasonal = roc_auc_score(y_test_seasonal, y_pred_prob_seasonal[:, 1])
    mean_roc_h1n1 += roc_auc_h1n1
    mean_roc_seasonal += roc_auc_seasonal

    # Calculate accuracy
    accuracy_h1n1 = accuracy_score(y_test_h1n1, y_pred_h1n1)
    accuracy_seasonal = accuracy_score(y_test_seasonal, y_pred_seasonal)
    mean_accuracy_h1n1 += accuracy_h1n1
    mean_accuracy_seasonal += accuracy_seasonal

    # Generate classification reports
    classification_reports_h1n1.append(classification_report(y_test_h1n1, y_pred_h1n1))
    classification_reports_seasonal.append(classification_report(y_test_seasonal, y_pred_seasonal))

# Print aggregated metrics
print("Mean ROC AUC (h1n1):", mean_roc_h1n1 / kfold.n_splits)
print("Mean ROC AUC (seasonal):", mean_roc_seasonal / kfold.n_splits)
print("Mean accuracy (h1n1):", mean_accuracy_h1n1 / kfold.n_splits)
print("Mean accuracy (seasonal):", mean_accuracy_seasonal / kfold.n_splits)

# Print or save detailed classification reports
print("Classification reports:")
for report in classification_reports_h1n1:
    print("h1n1 vaccine:")
    print(report)
for report in classification_reports_seasonal:
    print("seasonal vaccine:")
    print(report)


TypeError: TabNetClassifier.__init__() missing 2 required positional arguments: 'feature_columns' and 'num_classes'

In [5]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from pytorch_tabnet.tab_model import TabNetClassifier
from skorch.callbacks import EarlyStopping
from sklearn.compose import ColumnTransformer
# Load the datasets
X = pd.read_csv("training_set_features.csv")
X = X.drop(['employment_industry', 'employment_occupation', 'respondent_id', 'health_insurance'], axis=1)
y = pd.read_csv("training_set_labels.csv").drop(['respondent_id', 'seasonal_vaccine'], axis=1)
z = pd.read_csv("training_set_labels.csv").drop(['respondent_id', 'h1n1_vaccine'], axis=1)

# Define numerical and categorical features
numerical_features = X.select_dtypes(include=['float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Pipeline for preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=42, max_iter=50)),
    ('scaler', StandardScaler())  # Standardization
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define TabNetClassifier model
cat_idxs = [X.columns.get_loc(col) for col in categorical_features]  # Define cat_idxs
cat_dims = [len(X[col].unique()) for col in categorical_features]  # Define cat_dims

model = TabNetClassifier(optimizer_params=dict(lr=2e-2, weight_decay=1e-5),  # Adjust weight_decay for class imbalance handling
                         scheduler_params={"step_size":50, "gamma":0.9},
                         scheduler_fn="StepLR",
                         verbose=0,
                         cat_idxs=cat_idxs,  # Pass cat_idxs
                         cat_dims=cat_dims,  # Pass cat_dims
                         cat_emb_dim=1,
                         mask_type="entmax",
                         device_name="cuda" if torch.cuda.is_available() else "cpu",
                         )

# Combine preprocessing with TabNetClassifier model in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])

# Hyperparameters for tuning
params = {
    "model__n_d": [8, 16],
    "model__n_a": [8, 16],
    "model__n_steps": [3, 5],
    "model__gamma": [1.3, 1.8],
    "model__lambda_sparse": [0.0001, 0.001],
    "model__batch_size": [64, 128],
    "model__virtual_batch_size": [32, 64],
}

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store evaluation metrics for each fold
roc_auc_scores = []
accuracy_scores = []
classification_reports = []

# Perform K-Fold Cross Validation with hyperparameter tuning
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    print(f"Fold: {fold+1}")

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Train the TabNetClassifier model with early stopping
    clf.fit(X_train, y_train, model__callbacks=[EarlyStopping(patience=10)], model__n_d=8, model__n_a=8, model__n_steps=5, model__gamma=1.3, model__lambda_sparse=0.001, model__batch_size=64, model__virtual_batch_size=32)

    # Predict on the test set
    y_pred_proba = clf.predict_proba(X_test)[:, 1]

    # Calculate evaluation metrics
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, (y_pred_proba > 0.5).astype(int))
    classification_rep = classification_report(y_test, (y_pred_proba > 0.5).astype(int))

    # Append evaluation metrics to lists
    roc_auc_scores.append(roc_auc)
    accuracy_scores.append(accuracy)
    classification_reports.append(classification_rep)

    print(f"ROC AUC: {roc_auc}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_rep)
    print("-" * 50)

# Print mean evaluation metrics across all folds
print("Mean ROC AUC:", np.mean(roc_auc_scores))
print("Mean Accuracy:", np.mean(accuracy_scores))


Fold: 1


TypeError: TabModel.fit() got an unexpected keyword argument 'n_d'

In [6]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from pytorch_tabnet.tab_model import TabNetClassifier
from skorch.callbacks import EarlyStopping
from sklearn.compose import ColumnTransformer

# Load the datasets
X = pd.read_csv("training_set_features.csv")
X = X.drop(['employment_industry', 'employment_occupation', 'respondent_id', 'health_insurance'], axis=1)
y = pd.read_csv("training_set_labels.csv").drop(['respondent_id', 'seasonal_vaccine'], axis=1)
z = pd.read_csv("training_set_labels.csv").drop(['respondent_id', 'h1n1_vaccine'], axis=1)

# Define numerical and categorical features
numerical_features = X.select_dtypes(include=['float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Pipeline for preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=42, max_iter=50)),
    ('scaler', StandardScaler())  # Standardization
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define TabNetClassifier model
model = TabNetClassifier(optimizer_params=dict(lr=2e-2, weight_decay=1e-5),  # Adjust weight_decay for class imbalance handling
                         scheduler_params={"step_size":50, "gamma":0.9},
                         scheduler_fn="StepLR",
                         verbose=0,
                         device_name="cuda" if torch.cuda.is_available() else "cpu",
                         )

# Combine preprocessing with TabNetClassifier model in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])

# Hyperparameters for tuning
param_dist = {
    "model__n_d": [8, 16],
    "model__n_a": [8, 16],
    "model__n_steps": [3, 5],
    "model__gamma": [1.3, 1.8],
    "model__lambda_sparse": [0.0001, 0.001],
    "model__batch_size": [64, 128],
    "model__virtual_batch_size": [32, 64],
}

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized Search Cross Validation
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=10, scoring='roc_auc', n_jobs=-1, cv=skf, verbose=1)

# Perform Randomized Search Cross Validation
random_search.fit(X, y)

# Best parameters and their corresponding score
print("Best Parameters:", random_search.best_params_)
print("Best ROC AUC Score:", random_search.best_score_)

# Instantiate TabNetClassifier with best parameters
best_model = TabNetClassifier(optimizer_params=dict(lr=2e-2, weight_decay=1e-5),  # Adjust weight_decay for class imbalance handling
                              scheduler_params={"step_size":50, "gamma":0.9},
                              scheduler_fn="StepLR",
                              verbose=0,
                              device_name="cuda" if torch.cuda.is_available() else "cpu",
                              **random_search.best_params_
                             )

# Lists to store evaluation metrics for each fold
roc_auc_scores = []
accuracy_scores = []
classification_reports = []

# Perform K-Fold Cross Validation with best parameters
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    print(f"Fold: {fold+1}")

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Train the TabNetClassifier model with early stopping
    best_model.fit(X_train, y_train, callbacks=[EarlyStopping(patience=10)])

    # Predict on the test set
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    # Calculate evaluation metrics
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, (y_pred_proba > 0.5).astype(int))
    classification_rep = classification_report(y_test, (y_pred_proba > 0.5).astype(int))

    # Append evaluation metrics to lists
    roc_auc_scores.append(roc_auc)
    accuracy_scores.append(accuracy)
    classification_reports.append(classification_rep)

    print(f"ROC AUC: {roc_auc}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_rep)
    print("-" * 50)

# Print mean evaluation metrics across all folds
print("Mean ROC AUC:", np.mean(roc_auc_scores))
print("Mean Accuracy:", np.mean(accuracy_scores))


Fitting 5 folds for each of 10 candidates, totalling 50 fits


ValueError: Invalid parameter 'virtual_batch_size' for estimator TabNetClassifier(n_d=8, n_a=8, n_steps=3, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=[], n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0.001, seed=0, clip_value=1, verbose=0, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02, 'weight_decay': 1e-05}, scheduler_fn='StepLR', scheduler_params={'step_size': 50, 'gamma': 0.9}, mask_type='sparsemax', input_dim=None, output_dim=None, device_name='cpu', n_shared_decoder=1, n_indep_decoder=1, grouped_features=[]). Valid parameters are: ['cat_dims', 'cat_emb_dim', 'cat_idxs', 'clip_value', 'device_name', 'epsilon', 'gamma', 'grouped_features', 'input_dim', 'lambda_sparse', 'mask_type', 'momentum', 'n_a', 'n_d', 'n_indep_decoder', 'n_independent', 'n_shared', 'n_shared_decoder', 'n_steps', 'optimizer_fn', 'optimizer_params', 'output_dim', 'scheduler_fn', 'scheduler_params', 'seed', 'verbose'].