In [None]:
!pip install scikit-learn pandas numpy matplotlib plotly "nbformat>=4.2.0"

In [None]:
!pip install ipykernel

In [None]:
!pip install --upgrade nbformat

In [None]:
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neural_network import MLPClassifier
import warnings
# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Synthetic binary classification problem: 20 features, 15 of them informative.
SAMPLES = 1000
x, y = make_classification(
    n_samples=SAMPLES,
    n_features=20,
    n_informative=15,
    random_state=42,
    n_classes=2,
)

# Hold out the first 20% of the rows for later testing; the rest is training data.
n_test = int(SAMPLES * 0.2)
test_split_x, train_split_x = x[:n_test], x[n_test:]
test_split_y, train_split_y = y[:n_test], y[n_test:]

In [None]:
# Selection model: a small MLP whose positive-class probability is used by the
# following cells to decide which training rows end up in the labeled subset.
# It is fitted on the training split only: the first 20% of the rows is
# reserved for testing, and fitting on the full (x, y) would leak those rows
# into the selection step.
# hidden_layer_sizes is written as the tuple (10,); a bare (10) is just the int 10.
labeled_data_selection_model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=50, random_state=42)
labeled_data_selection_model.fit(train_split_x, train_split_y)

# predict_proba yields one column per class; keep the positive-class column.
scores = labeled_data_selection_model.predict_proba(train_split_x)[:, 1]

In [None]:
# Histogram (50 bins) of the selection scores computed for the training split.
score_series = pd.Series(scores)
fig = px.histogram(
    score_series,
    nbins=50,
    title='Score Distribution',
    labels={'value': 'Score'},
).update_layout(xaxis_title='Score', yaxis_title='Count')
fig.show()

In [None]:
# Score cut-offs: the 95th percentile (upper tail) and the 5th percentile
# (lower tail) of the selection scores.
threshold_above = np.percentile(scores, 95)
threshold_below = np.percentile(scores, 5)

# Boolean row masks for the two tails, computed once and reused for x and y.
is_above = scores >= threshold_above
is_below = scores <= threshold_below

x_above, y_above = train_split_x[is_above], train_split_y[is_above]
x_below, y_below = train_split_x[is_below], train_split_y[is_below]

# The labeled subset is both tails stacked row-wise, high-score rows first.
labeled_data_x = np.concatenate((x_above, x_below), axis=0)
labeled_data_y = np.concatenate((y_above, y_below), axis=0)

In [None]:
# Split the labeled subset 60/40 into train and test parts (fixed seed for
# reproducibility). train_test_split returns (x_train, x_test, y_train, y_test).
labeled_split = train_test_split(
    labeled_data_x,
    labeled_data_y,
    test_size=0.4,
    random_state=42,
)
train_data_labeled_x, test_data_labeled_x, train_data_labeled_y, test_data_labeled_y = labeled_split

In [None]:
# Train a second MLP on the labeled subset only, with early stopping on an
# internal 20% validation fraction.
# NOTE(review): this rebinds `labeled_data_selection_model`; after this cell the
# name refers to the model trained on the labeled subset, not to the selection
# model fitted earlier in the notebook.
labeled_data_selection_model = MLPClassifier(
    hidden_layer_sizes=(10),
    max_iter=50,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=10,
).fit(train_data_labeled_x, train_data_labeled_y)

# AUC on the held-out part of the labeled subset (positive-class column).
scores_outcome_model = labeled_data_selection_model.predict_proba(test_data_labeled_x)
auc_labeled = roc_auc_score(test_data_labeled_y, scores_outcome_model[:, 1])
print("ROC AUC Score (Labeled Data):", auc_labeled)

# AUC on the 20% test split that was set aside right after data generation.
test_split_scores = labeled_data_selection_model.predict_proba(test_split_x)[:, 1]
auc_test_split = roc_auc_score(test_split_y, test_split_scores)
print("ROC AUC Score (all Data):", auc_test_split)