In [None]:
!pip install Faker
!pip install shap
!pip install explainerdashboard

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from faker import Faker
from explainerdashboard import ClassifierExplainer, ExplainerDashboard

# Fix NumPy's global random state as soon as the imports are done
np.random.seed(seed=42)

In [None]:
# Seed every random source used below so the synthetic dataset is reproducible
np.random.seed(42)
# Faker keeps its own random generator, independent of NumPy's; without seeding it
# the sampled education categories would differ on every run
Faker.seed(42)

# Number of rows in the dataset
num_rows = 5000

# Faker instance, used below to sample the categorical education columns
fake = Faker()

# Draw the target and the demographic features from NumPy's global random state.
# The order of the draws is kept fixed so the seeded stream yields the same values.
dropout = np.random.choice([0, 1], size=num_rows)
Køn = np.random.choice([0, 1], p=[0.6, 0.4], size=num_rows)
Alder_range = 18 + 32 * np.random.exponential(0.04, size=num_rows)
Alder = np.round(Alder_range).clip(18, 60).astype(int)

# Income depends on the age band: 18-25 draws from 0..50000,
# above 25 draws from -500000..3500000 (upper bounds of randint are exclusive)
is_young = (Alder >= 18) & (Alder <= 25)
is_older = Alder > 25
Indkomst = np.zeros(num_rows)
Indkomst[is_young] = np.random.randint(0, 50001, size=is_young.sum())
Indkomst[is_older] = np.random.randint(-500000, 3500001, size=is_older.sum())

# Vocational education programmes that can be sampled
vocational_education_categories = [
    'Electrician', 'Carpenter', 'Plumber', 'Hairdresser', 'Chef',
    'Mechanic', 'Nurse', 'Graphic Designer', 'Web Developer', 'Accountant',
]

# One randomly picked programme per row (drawn through Faker)
vocational_education = [
    fake.random_element(vocational_education_categories)
    for _ in range(num_rows)
]

# Integer code per programme: position in the category list, starting at 1
vocational_education_mapping = dict(
    zip(vocational_education_categories,
        range(1, len(vocational_education_categories) + 1))
)
Erhvervsuddannelse = list(
    map(vocational_education_mapping.__getitem__, vocational_education)
)

# Generate Karaktergennemsnit (grade per row), sampled with the weights below.
# All grade values are plain integers: mixing the strings '00'/'02' with ints makes
# np.random.choice return an array of strings, which turns the 'grade' column into
# object dtype instead of a numeric feature.
grade_values = [0, 2, 4, 7, 10, 12]
grade_weights = [0.05, 0.05, 0.2, 0.3, 0.15, 0.25]
Karaktergennemsnit = np.random.choice(grade_values, size=num_rows, p=grade_weights)

# Parent education levels that can be sampled
parent_education_categories = [
    'High School', 'Associate Degree', 'Bachelor Degree', 'Master Degree', 'PhD',
]

# One randomly picked level per row (drawn through Faker)
parent_education = [
    fake.random_element(parent_education_categories)
    for _ in range(num_rows)
]

# Integer code per level: position in the category list, starting at 1
parent_education_mapping = dict(
    zip(parent_education_categories,
        range(1, len(parent_education_categories) + 1))
)
Forældre_uddannelse = list(
    map(parent_education_mapping.__getitem__, parent_education)
)

# Collect the generated columns (in output order) and assemble the DataFrame
generated_columns = {}
generated_columns['dropout'] = dropout
generated_columns['Køn'] = Køn
generated_columns['Alder'] = Alder
generated_columns['Indkomst'] = Indkomst
generated_columns['vocational_education'] = vocational_education
generated_columns['Erhvervsuddannelse'] = Erhvervsuddannelse
generated_columns['grade'] = Karaktergennemsnit
generated_columns['parent_education'] = parent_education
generated_columns['Forældre_uddannelse'] = Forældre_uddannelse

data = pd.DataFrame(generated_columns)

In [None]:
# Drop the text versions of the education columns; their integer encodings remain.
# errors='ignore' keeps the cell re-runnable: a second run finds the columns
# already gone and would otherwise raise KeyError.
data = data.drop(columns=['vocational_education', 'parent_education'], errors='ignore')

In [None]:
# Work on an independent (deep) copy so `data` itself stays unchanged
df = data.copy(deep=True)

In [None]:
# Prepare the inputs for SMOTE (synthetic resampling): target y and feature frame X

# Feature columns that are passed to the resampler and the model
list_numerical = [
    'Køn',
    'Alder',
    'Indkomst',
    'Erhvervsuddannelse',
    'grade',
    'Forældre_uddannelse',
]

# Target: pop() removes 'dropout' from df and returns it as a Series,
# so df no longer contains the target column afterwards
y = df.pop('dropout')

# Features: the listed columns of df
X = df.loc[:, list_numerical]

In [None]:
# SMOTE oversampling: new rows are synthesised from the existing observations
# rather than created by copying existing rows.
# sampling_strategy='auto' resamples every class except the majority class.
smote = SMOTE(random_state=42, sampling_strategy='auto')

# Fit on the full data and return the resampled features and target
X_upsampled, y_upsampled = smote.fit_resample(X=X, y=y)

In [None]:
# Sanity check: the resampled features and target must have the same number of rows.
# Prints the compact info summary for X first, then for y.
for resampled_part in (X_upsampled, y_upsampled):
    resampled_part.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5008 entries, 0 to 5007
Columns: 6 entries, Køn to Forældre_uddannelse
dtypes: float64(1), int64(4), object(1)
memory usage: 234.9+ KB
<class 'pandas.core.series.Series'>
RangeIndex: 5008 entries, 0 to 5007
dtypes: int64(1)
memory usage: 39.2 KB


In [None]:
# Train and test sets.
# Split the original observations first and oversample only the training part.
# Oversampling before the split lets synthetic training rows be built from
# observations that land in the test set (data leakage); this way the test set
# contains only real, non-synthetic rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, y_train = SMOTE(sampling_strategy='auto', random_state=42).fit_resample(X_train, y_train)

In [None]:
# StandardScaler
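# Standardisation with StandardScaler: every feature is shifted to mean 0 and
# scaled to unit standard deviation, so all features share a comparable scale.
# The scaling code below is currently commented out, so the model is fitted on unscaled features.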
#scaler = StandardScaler().fit(X_train[list_numerical])

#X_train[list_numerical] = scaler.transform(X_train[list_numerical])
#X_test[list_numerical] = scaler.transform(X_test[list_numerical])

In [None]:
# Multi-layer perceptron with two hidden layers (10 and 20 units),
# trained for at most 100 iterations with a fixed random state
mlp_settings = {
    'hidden_layer_sizes': (10, 20),
    'max_iter': 100,
    'activation': 'relu',
    'solver': 'adam',
    'random_state': 42,
}
model = MLPClassifier(**mlp_settings)

In [None]:
# Train the classifier on the training split
model.fit(X=X_train, y=y_train)

In [None]:
# Build the explainer on the held-out test split rather than the training data:
# the dashboard's metrics (confusion matrix, ROC/PR AUC, lift) then measure
# performance on rows the model has not seen, and the SHAP computation covers
# the smaller split.
# labels are given in class order: 0 -> 'Bestået', 1 -> 'Frafaldet'
explainer = ClassifierExplainer(
    model,
    X_test,
    y_test,
    labels=['Bestået', 'Frafaldet'],
    target='dropout',
)

Note: shap values for shap='kernel' normally get calculated against X_background, but paramater X_background=None, so setting X_background=shap.sample(X, 50)...
Note: for ClassifierExplainer shap='kernel' defaults to model_output='probability
Generating self.shap_explainer = shap.KernelExplainer(model, X, link='identity')


In [None]:
# Build the dashboard for the explainer, then start serving it
dashboard = ExplainerDashboard(explainer, title='Erhvervsuddannelse Frafald')
dashboard.run()

Building ExplainerDashboard..
Detected google colab environment, setting mode='external'
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...


  0%|          | 0/4006 [00:00<?, ?it/s]

Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating predictions...
Calculating pred_percentiles...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard on http://172.28.0.12:8050
You can terminate the dashboard with ExplainerDashboard.terminate(8050)
Dash is running on http://127.0.0.1:8050/



INFO:dash.dash:Dash is running on http://127.0.0.1:8050/



Dash app running on:


<IPython.core.display.Javascript object>