# Training eines Neuronalen Netzes


## Aufbereitung der Daten


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
app_df = pd.read_csv('application_record.csv')
app_df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [3]:
app_df.shape

(438557, 18)

In [4]:
credit_df = pd.read_csv('credit_record.csv')
credit_df.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [5]:
credit_df.shape

(1048575, 3)

## Data Merging


In [6]:
#sorting IDs
app_df = app_df.sort_values('ID')
credit_df = credit_df.sort_values('ID')
#Map the status to bad or good
credit_df['STATUS_BINARY'] = credit_df['STATUS']
credit_df = credit_df.replace({'STATUS_BINARY' :{'C' : 'Good','X' : 'Good','0' : 'Bad', '1' : 'Bad','2' : 'Bad','3' : 'Bad','4' : 'Bad','5' : 'Bad'}}) #C und X sind Good, alle Zahlen 0-5 sind Schlecht
result_df = credit_df.value_counts(subset=['ID', 'STATUS_BINARY']).unstack(fill_value=0).reset_index() #für jede ID wird gezählt, wie viele Good und Bad Einträge es gibt
#decide the user is good or bad based on his/her history majority
result_df.loc[(result_df['Good'] / result_df['Bad'] >= 1), 'Status'] = 1 #Wenn eine ID mehr Good als Bad hat, wird der Gesamtstatus auf 1 gut gesetzt, sonst auf 0 schlecht
result_df.loc[(result_df['Good'] / result_df['Bad'] < 1 ), 'Status'] = 0
result_df['Status'] = result_df['Status'].astype(int)
df = app_df.merge(result_df, how='inner', on=['ID']) #Die zusammengefassten Kreditinformationen werden mit den Antragsdaten durch ID verknüpft
#define the rate of good debts for each user
df['Good rate'] = (df['Good']) / (df['Good'] + df['Bad']) #Spalte Rate wird erstellt, die das Verhältnis von Good zu Bad für jede ID darstellt. Wahrscheinlich wird +0.1 hinzugefügt, um Division durch 0 zu vermeiden
df.drop(['Good','Bad'],axis=1,inplace = True) #Spalten Good und Bad werden gelöscht
#Renaming columns to more friendly names
df.columns = ['ID', 'Gender', 'Car', 'Realty', 'Children', 'Income', 'Income_Type',
                        'Education_Type', 'Family_Status', 'Housing_Type',  'Age', 'Years_Experience' , 'Mobile_Phone', 'Work_Phone', 
                        'Phone', 'Email', 'Job_Title', 'Total_Family' , 'Status','Good rate']
df.head(5)

Unnamed: 0,ID,Gender,Car,Realty,Children,Income,Income_Type,Education_Type,Family_Status,Housing_Type,Age,Years_Experience,Mobile_Phone,Work_Phone,Phone,Email,Job_Title,Total_Family,Status,Good rate
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,1,0.875
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,1,0.866667
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,1,0.766667
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,1,0.6
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,1,1.0


## Handling Missing Values


In [7]:
df.isna().sum()

ID                      0
Gender                  0
Car                     0
Realty                  0
Children                0
Income                  0
Income_Type             0
Education_Type          0
Family_Status           0
Housing_Type            0
Age                     0
Years_Experience        0
Mobile_Phone            0
Work_Phone              0
Phone                   0
Email                   0
Job_Title           11323
Total_Family            0
Status                  0
Good rate               0
dtype: int64

In [8]:
df.dropna()

Unnamed: 0,ID,Gender,Car,Realty,Children,Income,Income_Type,Education_Type,Family_Status,Housing_Type,Age,Years_Experience,Mobile_Phone,Work_Phone,Phone,Email,Job_Title,Total_Family,Status,Good rate
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,1,0.766667
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,1,0.600000
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,1,1.000000
5,5008810,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,1,0.777778
6,5008811,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,1,0.846154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,5150482,F,Y,Y,1,157500.0,Working,Secondary / secondary special,Married,House / apartment,-10808,-1739,1,0,0,0,Core staff,3.0,0,0.333333
36453,5150483,F,Y,Y,1,157500.0,Working,Secondary / secondary special,Married,House / apartment,-10808,-1739,1,0,0,0,Core staff,3.0,1,1.000000
36454,5150484,F,Y,Y,1,157500.0,Working,Secondary / secondary special,Married,House / apartment,-10808,-1739,1,0,0,0,Core staff,3.0,0,0.076923
36455,5150485,F,Y,Y,1,157500.0,Working,Secondary / secondary special,Married,House / apartment,-10808,-1739,1,0,0,0,Core staff,3.0,0,0.000000


## Convert Age and Years of Experience to year format


In [9]:
df['Age'] = df['Age']/(-1*365)
df['Years_Experience'] = df['Years_Experience']/(-1*365)
df.head(10)

Unnamed: 0,ID,Gender,Car,Realty,Children,Income,Income_Type,Education_Type,Family_Status,Housing_Type,Age,Years_Experience,Mobile_Phone,Work_Phone,Phone,Email,Job_Title,Total_Family,Status,Good rate
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32.890411,12.443836,1,1,0,0,,2.0,1,0.875
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32.890411,12.443836,1,1,0,0,,2.0,1,0.866667
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58.832877,3.106849,1,0,0,0,Security staff,2.0,1,0.766667
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52.356164,8.358904,1,0,1,1,Sales staff,1.0,1,0.6
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52.356164,8.358904,1,0,1,1,Sales staff,1.0,1,1.0
5,5008810,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52.356164,8.358904,1,0,1,1,Sales staff,1.0,1,0.777778
6,5008811,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52.356164,8.358904,1,0,1,1,Sales staff,1.0,1,0.846154
7,5008812,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,61.545205,-1000.665753,1,0,0,0,,1.0,0,0.176471
8,5008813,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,61.545205,-1000.665753,1,0,0,0,,1.0,0,0.176471
9,5008814,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,61.545205,-1000.665753,1,0,0,0,,1.0,0,0.176471


## Behandlung Kategoriale Daten

Neuronale Netze können kategoriale Daten nicht direkt verbinden, sie müssen durch One-Hot-Encoding in eine numerische Darstellung umgewandelt werden


In [10]:
Cat_features = ['Realty', 'Gender','Car','Income_Type','Education_Type','Family_Status','Housing_Type','Job_Title']

for col in Cat_features: #Einblick in alle Kategorien, die in den Datenstehen
    print(col + ':', df[col].unique())

df.info()

Realty: ['Y' 'N']
Gender: ['M' 'F']
Car: ['Y' 'N']
Income_Type: ['Working' 'Commercial associate' 'Pensioner' 'State servant' 'Student']
Education_Type: ['Higher education' 'Secondary / secondary special' 'Incomplete higher'
 'Lower secondary' 'Academic degree']
Family_Status: ['Civil marriage' 'Married' 'Single / not married' 'Separated' 'Widow']
Housing_Type: ['Rented apartment' 'House / apartment' 'Municipal apartment'
 'With parents' 'Co-op apartment' 'Office apartment']
Job_Title: [nan 'Security staff' 'Sales staff' 'Accountants' 'Laborers' 'Managers'
 'Drivers' 'Core staff' 'High skill tech staff' 'Cleaning staff'
 'Private service staff' 'Cooking staff' 'Low-skill Laborers'
 'Medicine staff' 'Secretaries' 'Waiters/barmen staff' 'HR staff'
 'Realty agents' 'IT staff']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36457 entries, 0 to 36456
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID     

In [11]:
df_encoded = pd.get_dummies(df, columns=Cat_features, drop_first=True) #getDummies konvertiert so viele 0/1 Variabel als es verschiedene Values gibt, setzte drop_first = True damit keine doppelten oder überflüssigen Informationsspalten gibt

df_encoded.head(10)

Unnamed: 0,ID,Children,Income,Age,Years_Experience,Mobile_Phone,Work_Phone,Phone,Email,Total_Family,...,Job_Title_Laborers,Job_Title_Low-skill Laborers,Job_Title_Managers,Job_Title_Medicine staff,Job_Title_Private service staff,Job_Title_Realty agents,Job_Title_Sales staff,Job_Title_Secretaries,Job_Title_Security staff,Job_Title_Waiters/barmen staff
0,5008804,0,427500.0,32.890411,12.443836,1,1,0,0,2.0,...,False,False,False,False,False,False,False,False,False,False
1,5008805,0,427500.0,32.890411,12.443836,1,1,0,0,2.0,...,False,False,False,False,False,False,False,False,False,False
2,5008806,0,112500.0,58.832877,3.106849,1,0,0,0,2.0,...,False,False,False,False,False,False,False,False,True,False
3,5008808,0,270000.0,52.356164,8.358904,1,0,1,1,1.0,...,False,False,False,False,False,False,True,False,False,False
4,5008809,0,270000.0,52.356164,8.358904,1,0,1,1,1.0,...,False,False,False,False,False,False,True,False,False,False
5,5008810,0,270000.0,52.356164,8.358904,1,0,1,1,1.0,...,False,False,False,False,False,False,True,False,False,False
6,5008811,0,270000.0,52.356164,8.358904,1,0,1,1,1.0,...,False,False,False,False,False,False,True,False,False,False
7,5008812,0,283500.0,61.545205,-1000.665753,1,0,0,0,1.0,...,False,False,False,False,False,False,False,False,False,False
8,5008813,0,283500.0,61.545205,-1000.665753,1,0,0,0,1.0,...,False,False,False,False,False,False,False,False,False,False
9,5008814,0,283500.0,61.545205,-1000.665753,1,0,0,0,1.0,...,False,False,False,False,False,False,False,False,False,False


In [12]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36457 entries, 0 to 36456
Data columns (total 49 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   ID                                            36457 non-null  int64  
 1   Children                                      36457 non-null  int64  
 2   Income                                        36457 non-null  float64
 3   Age                                           36457 non-null  float64
 4   Years_Experience                              36457 non-null  float64
 5   Mobile_Phone                                  36457 non-null  int64  
 6   Work_Phone                                    36457 non-null  int64  
 7   Phone                                         36457 non-null  int64  
 8   Email                                         36457 non-null  int64  
 9   Total_Family                                  36457 non-null 

In [13]:
df_encoded.describe()

Unnamed: 0,ID,Children,Income,Age,Years_Experience,Mobile_Phone,Work_Phone,Phone,Email,Total_Family,Status,Good rate
count,36457.0,36457.0,36457.0,36457.0,36457.0,36457.0,36457.0,36457.0,36457.0,36457.0,36457.0,36457.0
mean,5078227.0,0.430315,186685.7,43.767598,-162.364207,1.0,0.225526,0.294813,0.089722,2.198453,0.55416,0.511533
std,41875.24,0.742367,101789.2,11.508356,377.126945,0.0,0.417934,0.455965,0.285787,0.911686,0.497065,0.352356
min,5008804.0,0.0,27000.0,20.517808,-1000.665753,1.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,5042028.0,0.0,121500.0,34.142466,1.117808,1.0,0.0,0.0,0.0,2.0,0.0,0.157895
50%,5074614.0,0.0,157500.0,42.638356,4.252055,1.0,0.0,0.0,0.0,2.0,1.0,0.545455
75%,5115396.0,1.0,225000.0,53.254795,8.638356,1.0,0.0,1.0,0.0,3.0,1.0,0.830508
max,5150487.0,19.0,1575000.0,68.909589,43.049315,1.0,1.0,1.0,1.0,20.0,1.0,1.0


## Definition von Features (X) und Zielvariable (y)


In [15]:
X = df_encoded.drop(['ID', 'Status'], axis=1) #ID und Status sind keine Features
y = df_encoded['Status'] #Zielvariable

#Aufteilung der Daten in Trainings- und Testsets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50, stratify=y) #stratify=y Versuche die prozentuale Verteilung der Klassen in y_train und y_test so ähnlich wie möglich zu halten wie in y

#Skalierung der numerischen Spalten (Ursprüngliche numerische Spalten in df, die nicht entfernt wurden)

numeric_features = ['Children', 'Income', 'Age', 'Years_Experience', 'Total_Family', 'Good rate']

scaler = StandardScaler()

#Numerische Spalten im Trainingsset fitten und transformieren
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])

# Die gleichen Statistiken des Trainingsset verwenden um das Testset zu transformieren
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

print("Shape of X_train after scalar preprocessing:",  X_train.shape)
print("\nShape of X_test after scalar preprocessing:",X_test.shape)

Shape of X_train after scalar preprocessing: (29165, 47)

Shape of X_test after scalar preprocessing: (7292, 47)


Pseudocode / weiteres Vorgehen


Modellbau, Training und Evaluierung des Neuronalen Netzwerks

- input_shape
- Dense: Schichten, Anzahl der Neuronen festlegen
- activation: softmax Funktion
- Dropout: layers.Dropout einfügen, um Overfitting zu reduzieren

Wie soll Dense eingestellt werden?


In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import layers

#tf.keras.backend.clear_session()




In [25]:
num_features = X_train.shape[1] #Gesamtanzahl der Features von X_train

model = keras.Sequential()
model.add(keras.Input(shape=(num_features,)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.25))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.25))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.25))
model.add(layers.Dense(1, activation='sigmoid')) #Sollte nicht lieber sigmoid als Aktivierungsfunktion benutzt werden und nicht softmax?
# wir wollen ja nur 2 mögliche Klassen ausgeben, bei SOftmax wird eine Liste von Wahrscheinlichkeiten produziert, die sich zu 1 summieren


model.summary()

Kompilieren des Modells mit Optimierer, Loss-Funktionen und Metriken wie Accuracy

- Optimizer: adam? Standardwahl
- loss: binary_crossentropy: Standard für binäre Klassifizierung
- metrics: accuracy


Training des Modelles durch Wiederholung der Trainingsdaten: Nutzung der Epochen und Batches als auch Validierung

- epochs
- batch_size
- validation_split
- history-Object zum Speichern der Trainings- und Validierungsmetriken jeder Epoche


Evaluation mithilfe von Loss und Accuracy

- Visualsierung mithilfe eines Plots
- Plot von loss vs. val_loss
- Plot ovn accuracy vs. val_accuracy
- wenn val_loss nach eine Weile wieder ansteigt, während loss weiter sinkt, ist das Zeichen für Overfitting


Beim Bemerken, dass der Trainings-Loss sinkt, aber der Validierungs Loss nach einer Weile wieder steigt, ist das ein Zeichen von Overfitting. Man kann Dropout in Keras verwenden direkt nach Dense-Schichten


Vorhersage treffen auf neue Daten, Finale Evaluierung des Modells (mit dem ungesehen Testset)

- ausschließlich X_test und y_test verwenden
- evaluate-Methode gibt den finalen Loss und die Metriken zurück


Einfache Methode um Layers linear zu stacken ist mithilfe der Sequential API:

Functional API bietet mehr Flexibilität um Modell zu defineiren mit mehreren Inputs/Outputs


In [None]:
'''
Sequential API

model = keras.Sequential()
model.add(keras.layers.Dense(16, activation='softmax', input_shape=(784,)))
model.add(keras.layers.Dense(8, activation='softmax'))
model.add(keras.layers.Dense(4, activation='softmax'))
print(model.summary())

Functional API

m1_layer1 = keras.layers.Dense(12, activation='softmax')(m1_inputs)
m1_layer2 = keras.layers.Dense(4, activation='softmax')(m1_layer1)
m2_layer1 = keras.layers.Dense(12, activation='softmax')(m2_inputs)
m2_layer2 = keras.layers.Dense(4, activation='softmax')(m2_layer1)
merged = keras.layers.add([m1_layer2, m2_layer2])
model = keras.Model(inputs=[m1_inputs, m2_inputs], outputs=merged)
print(model.summary())

'''

# Federated Machine Learning
