In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score

# Run the companion notebook first. The code below uses train_df, test_df and
# final_f1, none of which are defined in this notebook — presumably datainfo.ipynb
# creates them; TODO confirm.
%run datainfo.ipynb

def train_and_evaluate_model(base_model, supervised_f1=None):
    """Wrap ``base_model`` in a SelfTrainingClassifier and print its validation Macro-F1.

    Reads ``train_df`` and ``test_df`` from the notebook namespace. ``train_df`` is
    split 60/40 (``random_state=42``); the 60% part keeps its labels, every row of
    ``test_df`` is appended with the label -1 (scikit-learn's marker for an
    unlabeled sample), and the self-training model is fitted on the combination.
    The 40% hold-out is used only for scoring. Neither ``train_df`` nor ``test_df``
    is modified.

    Parameters
    ----------
    base_model : estimator
        Classifier handed to ``SelfTrainingClassifier`` as its base estimator.
    supervised_f1 : float, optional
        Macro-F1 of the supervised baseline to compare against. When omitted,
        the notebook-level ``final_f1`` is used.

    Returns
    -------
    None
        The self-training score, the baseline score and the relative uplift
        (in percent) are printed.

    Notes
    -----
    NOTE(review): ``final_f1`` is not defined in this cell and does not depend on
    ``base_model``, so by default every base model is compared against the same
    number. Pass ``supervised_f1`` to compare against a baseline obtained with the
    same estimator and split.
    """
    target = 'Class/ASD'

    # 1. Create a hold-out validation set from train_df only
    X_supervised_train, X_val, y_supervised_train, y_val = train_test_split(
        train_df.drop(target, axis=1),
        train_df[target],
        test_size=0.4,
        random_state=42
    )

    # 2. Prepare self-training data
    # Label every test row -1 on a *copy*: assigning into test_df itself would
    # silently change a notebook-level frame each time this function is called.
    unlabeled = test_df.assign(**{target: -1})

    # Combine the reduced train and the unlabeled rows for self-training
    train_for_self_training = pd.concat([
        X_supervised_train.assign(**{target: y_supervised_train}),
        unlabeled
    ], ignore_index=True)

    X_combined = train_for_self_training.drop(target, axis=1)
    y_combined = train_for_self_training[target]

    # 3. Define and train the self-training model
    # k_best is only consulted when criterion='k_best', so it is not passed here.
    self_training_model = SelfTrainingClassifier(base_model, criterion='threshold', threshold=.84)
    self_training_model.fit(X_combined, y_combined)

    # 4. Evaluate on the real validation set (never seen during self-training)
    y_pred_val_self_training = self_training_model.predict(X_val)

    f1_self_training = f1_score(y_val, y_pred_val_self_training, average='macro')
    f1_supervised = final_f1 if supervised_f1 is None else supervised_f1

    # Relative uplift in percent; undefined (nan) when the baseline score is 0
    if f1_supervised:
        uplift = ((f1_self_training - f1_supervised) / f1_supervised) * 100
    else:
        uplift = float('nan')

    print("Self-training model Macro-F1 on validation set:", f1_self_training)
    print("Supervised model Macro-F1 on validation set:", f1_supervised)
    print(f"Macro-F1 Uplift/iporivment: {uplift:.2f}%")


In [150]:
# Random forest base model. Seeded like the other base models in this notebook
# so that re-running the cell reproduces the same scores.
rf_model = RandomForestClassifier(
    n_estimators=600,
    max_depth=10,
    min_samples_split=10,
    random_state=42,
)
# Self-train on the labeled 60% split plus the unlabeled test rows, then report
# Macro-F1 on the 40% hold-out.
train_and_evaluate_model(rf_model)

Self-training model Macro-F1 on validation set: 0.8256410256410256
Supervised model Macro-F1 on validation set: 0.7164179104477612
Macro-F1 Uplift/iporivment: 15.25%


In [151]:
# XGBoost base model (binary logistic objective, log-loss evaluation metric)
xgb_model = xgb.XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    objective='binary:logistic',
    # 1 weights positive and negative samples equally, i.e. no imbalance correction
    scale_pos_weight=1,
)
# Run the self-training evaluation with XGBoost as the wrapped estimator
train_and_evaluate_model(xgb_model)

Self-training model Macro-F1 on validation set: 0.8032364012657658
Supervised model Macro-F1 on validation set: 0.7164179104477612
Macro-F1 Uplift/iporivment: 12.12%


In [152]:
# CatBoost base model: fixed seed, verbose=0 to keep the training log out of the output
cb_model = CatBoostClassifier(verbose=0, random_state=42)
# Run the self-training evaluation with CatBoost as the wrapped estimator
train_and_evaluate_model(cb_model)

Self-training model Macro-F1 on validation set: 0.8009678544880838
Supervised model Macro-F1 on validation set: 0.7164179104477612
Macro-F1 Uplift/iporivment: 11.80%


In [153]:
# Logistic regression base model with class-balanced sample weights and a fixed seed
lr_model = LogisticRegression(random_state=42, class_weight='balanced')
# Run the self-training evaluation with logistic regression as the wrapped estimator
train_and_evaluate_model(lr_model)

Self-training model Macro-F1 on validation set: 0.7916666666666667
Supervised model Macro-F1 on validation set: 0.7164179104477612
Macro-F1 Uplift/iporivment: 10.50%
