In [None]:
!pip install scikit-learn pandas numpy matplotlib plotly ipykernel "nbformat>=4.2.0"

In [27]:
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neural_network import MLPClassifier
import warnings
from sklearn.decomposition import PCA


warnings.filterwarnings('ignore')

# generate random data
SAMPLES = 1_000
LABELED_RATIO = 0.1
x, y = make_classification(n_samples=SAMPLES, n_features=20, n_informative=15, random_state=42, n_classes=2)

# select 20% of the data for later testing
test_split_x = x[:int(SAMPLES * 0.2)]
test_split_y = y[:int(SAMPLES * 0.2)]
train_split_x = x[int(SAMPLES * 0.2):]
train_split_y = y[int(SAMPLES * 0.2):]

# train a model to select labeled data
labeled_data_selection_model = MLPClassifier(hidden_layer_sizes=(10), max_iter=50, random_state=42)
labeled_data_selection_model.fit(x, y)
scores_train = labeled_data_selection_model.predict_proba(train_split_x)
scores_train = scores_train[:, 1]
scores_test = labeled_data_selection_model.predict_proba(test_split_x)
scores_test = scores_test[:, 1]

# Select labeled data based on the model's scores
threshold_above = np.percentile(scores_train, 100 - (100 * LABELED_RATIO))
threshold_below = np.percentile(scores_train, 100 * LABELED_RATIO)
x_above_train = train_split_x[scores_train >= threshold_above]
y_above_train = train_split_y[scores_train >= threshold_above]
x_below_train = train_split_x[scores_train <= threshold_below]
y_below_train = train_split_y[scores_train <= threshold_below]
labeled_data_x_train = np.concatenate((x_above_train, x_below_train), axis=0)
labeled_data_y_train = np.concatenate((y_above_train, y_below_train), axis=0)
not_labeled_data_x_train = train_split_x[~np.isin(np.arange(len(train_split_x)), np.where(scores_train >= threshold_above)[0]) & ~np.isin(np.arange(len(train_split_x)), np.where(scores_train <= threshold_below)[0])]
not_labeled_data_y_train = train_split_y[~np.isin(np.arange(len(train_split_x)), np.where(scores_train >= threshold_above)[0]) & ~np.isin(np.arange(len(train_split_x)), np.where(scores_train <= threshold_below)[0])]

# do same for test data
test_data_labeled_x = test_split_x[scores_test >= threshold_above]
test_data_labeled_y = test_split_y[scores_test >= threshold_above]
test_data_not_labeled_x = test_split_x[~np.isin(np.arange(len(test_split_x)), np.where(scores_test >= threshold_above)[0])]
test_data_not_labeled_y = test_split_y[~np.isin(np.arange(len(test_split_x)), np.where(scores_test >= threshold_above)[0])]

In [28]:
# visualize the samples via pca:

pca = PCA(n_components=2)
pca_x = pca.fit_transform(x)
pca_test_split_x = pca.transform(test_split_x)
pca_train_split_x = pca.transform(train_split_x)
fig = px.scatter(x=pca_x[:, 0], y=pca_x[:, 1], color=y, title="PCA of the data")
#fig.add_trace(go.Scatter(x=pca_test_split_x[:, 0], y=pca_test_split_x[:, 1], mode='markers', name='Test Data', marker=dict(color='red', size=5, symbol='cross')))
#fig.add_trace(go.Scatter(x=pca_train_split_x[:, 0], y=pca_train_split_x[:, 1], mode='markers', name='Train Data', marker=dict(color='blue', size=5, symbol='circle')))
fig.show()

In [None]:
# barplot of the score distribution
fig = px.histogram(pd.Series(scores_train), nbins=50, title='Score Distribution', labels={'value': 'Score'})
fig.update_layout(xaxis_title='Score', yaxis_title='Count')
fig.show()

In [31]:
# Train a propensity model:
propensity_model = MLPClassifier(hidden_layer_sizes=(10), max_iter=1000, random_state=42, early_stopping=True, validation_fraction=0.2, n_iter_no_change=10)
propensity_train_x = np.concatenate((labeled_data_x_train, not_labeled_data_x_train), axis=0)
propensity_train_y = np.concatenate((np.ones(len(labeled_data_x_train)), np.zeros(len(not_labeled_data_x_train))), axis=0)  # 1 for labeled, 0 for not labeled
propensity_model.fit(train_split_x, train_split_y)

# Predict propensity scores
propensity_scores = propensity_model.predict_proba(train_split_x)[:, 1]  # get propensities

In [32]:
# visualize the embedding of the labeled data vs the unlabeled data
pca_labeled_data_x = pca.transform(labeled_data_x_train)
pca_not_labeled_data_x = pca.transform(not_labeled_data_x_train)
fig = px.scatter(x=pca_labeled_data_x[:, 0], y=pca_labeled_data_x[:, 1], color=labeled_data_y_train, title="PCA of the labeled data")
fig.add_trace(go.Scatter(x=pca_not_labeled_data_x[:, 0], y=pca_not_labeled_data_x[:, 1], mode='markers', name='Unlabeled Data', marker=dict(color='grey', size=5, symbol='x')))
fig.update_layout(xaxis_title='PCA Component 1', yaxis_title='PCA Component 2')
fig.show()

In [33]:
# split the labeled data into training and testing sets
train_data_labeled_x, test_data_labeled_x, train_data_labeled_y, test_data_labeled_y = train_test_split(labeled_data_x, labeled_data_y, test_size=0.4, random_state=42)

# train a model on the labeled data
outcome_model = MLPClassifier(hidden_layer_sizes=(10), max_iter=50, random_state=42, early_stopping=True, validation_fraction=0.2, n_iter_no_change=10)
outcome_model.fit(train_data_labeled_x, train_data_labeled_y)
scores_outcome_model = outcome_model.predict_proba(test_data_labeled_x)
print("ROC AUC Score (Labeled Data):", roc_auc_score(test_data_labeled_y, scores_outcome_model[:, 1]))
print("ROC AUC Score (all Data):", roc_auc_score(test_split_y, outcome_model.predict_proba(test_split_x)[:, 1]))

ROC AUC Score (Labeled Data): 0.718475073313783
ROC AUC Score (all Data): 0.5839342948717948


In [36]:
all_train_x = np.concatenate((train_data_labeled_x, not_labeled_data_x_train), axis=0)
all_train_y = np.concatenate((train_data_labeled_y, not_labeled_data_y_train), axis=0)
all_test_x = np.concatenate((test_data_labeled_x, test_data_not_labeled_x), axis=0)
all_test_y = np.concatenate((test_data_labeled_y, test_data_not_labeled_y), axis=0)

e_X = propensity_model.predict_proba(all_train_x)[:, 1]
e_X = np.clip(e_X, 1e-3, 1 - 1e-3)
T = np.concatenate((np.ones(len(train_data_labeled_x)), np.zeros(len(not_labeled_data_x_train))), axis=0)  # 1 for labeled, 0 for not labeled

# Schritt 2: Gewicht berechnen
weights = T / e_X + (1 - T) / (1 - e_X)

# Schritt 3: Outcome-Model trainieren mit gewichteter Regression
new_outcome_model = MLPClassifier(hidden_layer_sizes=(10), max_iter=1_000, random_state=42, early_stopping=True, validation_fraction=0.2, n_iter_no_change=10)
new_outcome_model.fit(np.hstack([all_train_x, T.reshape(-1, 1)]), all_train_y, sample_weight=weights)

# Predict the outcome using the new outcome model
scores_outcome_model = new_outcome_model.predict_proba(np.hstack([test_data_labeled_x, propensity_model.predict_proba(test_data_labeled_x)[:, 1].reshape(-1, 1)]))
print("ROC AUC Score (Labeled Data):", roc_auc_score(test_data_labeled_y, scores_outcome_model[:, 1]))
print("ROC AUC Score (all Data):", roc_auc_score(test_split_y, new_outcome_model.predict_proba(np.hstack([test_split_x, propensity_model.predict_proba(test_split_x)[:, 1].reshape(-1, 1)]))[:, 1]))

ROC AUC Score (Labeled Data): 0.9296187683284458
ROC AUC Score (all Data): 0.7720352564102564
