**1. Vorbereitung der Daten für die statistische Modellierung**

In [None]:
# Load the train, test and aim data sets from CSV and preview the first rows of each
import pandas as pd

csv_paths = ['breastcancer_train.csv', 'breastcancer_test.csv', 'breastcancer_aim.csv']
df_train, df_test, df_aim = [pd.read_csv(path) for path in csv_paths]

for frame in (df_train, df_test, df_aim):
    display(frame.head())

In [None]:
# Summary statistics of every data set, transposed so that each column becomes a row
for frame in (df_train, df_test, df_aim):
    display(frame.describe().T)

In [None]:
# Report the (rows, columns) shape of every data set
for label, frame in [("Train", df_train), ("Test", df_test), ("Aim", df_aim)]:
    print(label, frame.shape)

In [None]:
# Inspect dtypes and non-null counts of every data set;
# a blank gap is printed between consecutive reports
for position, frame in enumerate((df_train, df_test, df_aim)):
    if position > 0:
        print("\n")
    frame.info()

In [None]:
# Number of missing values per column in each data set
for frame in (df_train, df_test, df_aim):
    print(frame.isna().sum())

In [None]:
# Handle missing values: discard every training row that contains at least one NaN
df_train = df_train.dropna(axis='index', how='any')

In [None]:
# Verify the cleaned train set: its shape and the remaining NaN count per column
remaining_nans = df_train.isna().sum()
print("Train", df_train.shape)
print(remaining_nans)

In [None]:
# Convert column dtypes: 'id' and 'class' become categorical, 'bare_nucleoli' becomes integer.
#
# The frames are rebound to the result of DataFrame.astype on purpose.
# Assigning through `df.loc[:, col] = converted` writes the values into the
# existing column in place (pandas >= 2.0) and therefore keeps the OLD dtype,
# so the conversion would silently have no effect.
dtype_map = {'id': 'category',
             'bare_nucleoli': 'int',
             'class': 'category'}

df_train = df_train.astype(dtype_map)
df_test = df_test.astype(dtype_map)
df_aim = df_aim.astype(dtype_map)

In [None]:
# Re-inspect the dtypes after the conversion; reports are separated by a blank gap
frames_to_check = [df_train, df_test, df_aim]
frames_to_check[0].info()
for frame in frames_to_check[1:]:
    print("\n")
    frame.info()

In [None]:
# Feature matrices: every column except the label ('class') and the 'id' column.
# Target vectors: the 'class' column of the train and test set.
non_feature_cols = ['class', 'id']

features_train = df_train.drop(columns=non_feature_cols)
features_test = df_test.drop(columns=non_feature_cols)
features_aim = df_aim.drop(columns=non_feature_cols)

target_train = df_train['class']
target_test = df_test['class']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Pairwise correlation of all training features
correlation_matrix = features_train.corr()

# seaborn.heatmap hides every cell whose mask entry is True.
# The plot is meant to show the UPPER triangle, so the LOWER triangle has to be
# masked: np.tril keeps True on and below the main diagonal, i.e. the lower
# triangle and the (trivial, always 1.0) diagonal are hidden.
mask = np.tril(np.ones_like(correlation_matrix, dtype=bool))

# Figure size
plt.figure(figsize=(14, 10))

# Draw the heatmap
sns.heatmap(
    correlation_matrix,
    mask=mask,       # hide the lower triangle and the diagonal
    annot=True,      # print the coefficients inside the cells
    fmt='.2f',       # two decimal places
    vmin=-1,         # lower end of the colour scale
    vmax=1,          # upper end of the colour scale
    cmap='coolwarm'  # diverging palette that separates positive from negative values
)

# Title and rendering
plt.title('Korrelationsmatrix (Oberes Dreieck)')
plt.show()

In [None]:
# Absolute class counts in the train and in the test target vector
target_prop_train = pd.crosstab(target_train, 'count')
display(target_prop_train)

target_prop_test = pd.crosstab(target_test, 'count')
display(target_prop_test)

In [None]:
# Names of the highly correlated feature columns that are going to be transformed
col_correlated = [
    'clump_thickness',
    'size_uniformity',
    'shape_uniformity',
    'marginal_adhesion',
    'epithelial_size',
    'bare_nucleoli',
    'normal_nucleoli',
]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Two-step transformer: standardise the inputs, then run a PCA.
# n_components=0.8 lets the PCA choose how many components it keeps
# (enough to explain 80 % of the variance).
pca_steps = [
    ('std', StandardScaler()),
    ('pca', PCA(n_components=0.8)),
]
std_pca = Pipeline(steps=pca_steps)

In [None]:
# Fit scaler + PCA on the correlated training columns, then replace those
# columns with the first three principal components.
# NOTE(review): the number of components is chosen by the PCA itself
# (variance threshold), this cell assumes there are at least three - confirm.
arr_corr_train = std_pca.fit_transform(features_train[col_correlated])
features_train = features_train.drop(columns=col_correlated)
for component, column_name in enumerate(('pca_0', 'pca_1', 'pca_2')):
    features_train[column_name] = arr_corr_train[:, component]

In [None]:
# Apply the already fitted scaler + PCA to the correlated test columns (no
# re-fitting), then replace those columns with the first three components.
# NOTE(review): assumes the fitted PCA kept at least three components - confirm.
arr_corr_test = std_pca.transform(features_test[col_correlated])
features_test = features_test.drop(columns=col_correlated)
for component, column_name in enumerate(('pca_0', 'pca_1', 'pca_2')):
    features_test[column_name] = arr_corr_test[:, component]

In [None]:
# Sanity check of the newly added columns: last three rows of the correlation
# table and of the summary statistics, for the train and the test features
for frame in (features_train, features_test):
    display(frame.corr().tail(3))
for frame in (features_train, features_test):
    display(frame.describe().T.tail(3))

In [None]:
# Shape of the final training feature matrix, followed by a preview of its first rows
train_shape = features_train.shape
print(train_shape)
features_train.head(n=5)

**2. Ermittlung der Performance der statistischen Klassifikationsmodelle auf den Trainingsdaten**

**KNN**

In [None]:
# Best k-nearest-neighbours model on the train set.
# The uncorrelated features have to be standardised as well, so the pipeline
# standardises all features once more before the classifier.
from sklearn.neighbors import KNeighborsClassifier

knn_steps = [
    ('std', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
pipeline_knn = Pipeline(steps=knn_steps)

# 20 geometrically spaced values between 1 and 20, cast to int and de-duplicated
k = np.unique(np.geomspace(1, 20, 20, dtype='int'))

search_space_knn = {
    'knn__n_neighbors': k,                      # candidate numbers of neighbours
    'knn__weights': ['uniform', 'distance'],
}
search_space_knn

In [None]:
# 5-fold cross-validated grid search over the KNN search space, scored by F1
from sklearn.model_selection import GridSearchCV

model_knn = GridSearchCV(pipeline_knn, search_space_knn, scoring='f1', cv=5)
model_knn.fit(features_train, target_train)

# Winning pipeline and its mean cross-validated score
print(model_knn.best_estimator_)
print(model_knn.best_score_)

**Log Reg**

In [None]:
# Best logistic regression model on the train set.
# Features are standardised inside the pipeline before the classifier.
from sklearn.linear_model import LogisticRegression

pipeline_log = Pipeline([('std', StandardScaler()),
                         ('log', LogisticRegression(solver='saga',  # solver handles both l1 and l2
                                                    class_weight='balanced',
                                                    # must be an int: scikit-learn's parameter
                                                    # validation rejects the float 1e4
                                                    max_iter=10_000,
                                                    random_state=42))])

# 14 geometrically spaced regularisation strengths from 0.001 to 1000
C_values = np.geomspace(start=0.001, stop=1000, num=14)

search_space_log = {'log__penalty': ['l1', 'l2'],
                    'log__C': C_values
                   }
search_space_log

In [None]:
# 5-fold cross-validated grid search over the logistic regression search space,
# scored by F1; fit() returns the fitted search object itself
model_log = GridSearchCV(pipeline_log, search_space_log, scoring='f1', cv=5).fit(
    features_train, target_train
)

# Winning pipeline and its mean cross-validated score
print(model_log.best_estimator_)
print(model_log.best_score_)

**Random Forest**

In [None]:
# Best random forest model on the train set.
# A random forest does not need standardised features, hence no scaler here.
from sklearn.ensemble import RandomForestClassifier

# Geometrically spaced integer candidates. Truncating to int can produce
# repeated values (e.g. 1, 1, 3, ... for min_samples_leaf); np.unique removes
# them so the grid search does not fit identical candidates twice.
search_space_rf = {'max_depth': np.unique(np.geomspace(start=3, stop=50, num=10, dtype='int')),
                   'min_samples_leaf': np.unique(np.geomspace(start=1, stop=500, num=10, dtype='int'))}

In [None]:
# Base forest: 50 trees, balanced class weights, fixed seed for reproducibility
rf_base = RandomForestClassifier(n_estimators=50,
                                 class_weight='balanced',
                                 random_state=42)

# 5-fold cross-validated grid search over the random forest search space, scored by F1
model_rf = GridSearchCV(rf_base, search_space_rf, scoring='f1', cv=5)
model_rf.fit(features_train, target_train)

# Winning forest and its mean cross-validated score
print(model_rf.best_estimator_)
print(model_rf.best_score_)

**3. Evaluation der statistischen Klassifikationsmodelle auf den Testdaten**

In [None]:
# Evaluate the three tuned classifiers on the test set
from sklearn.metrics import precision_score, recall_score, f1_score

# Pair every fitted search object with a readable label so that each block of
# metrics in the output can be attributed to its model.
fitted_models = [('KNN', model_knn),
                 ('Log Reg', model_log),
                 ('Random Forest', model_rf)]

for model_name, clf in fitted_models:

    # GridSearchCV.predict delegates to the best estimator found during the search
    target_test_pred = clf.predict(features_test)

    print('\n' + model_name)
    print('Precision: ', precision_score(target_test, target_test_pred))
    print('Recall: ', recall_score(target_test, target_test_pred))
    print('F1: ', f1_score(target_test, target_test_pred))

**4. Vorhersage des Zielvektors auf den Zieldaten mithilfe des besten statistischen Klassifikationsmodells**

In [None]:
# Prepare the aim set for prediction: apply the already fitted scaler + PCA to
# the correlated columns and replace them with the first three components.
# NOTE(review): assumes the fitted PCA kept at least three components - confirm.
arr_corr_aim = std_pca.transform(features_aim[col_correlated])
features_aim = features_aim.drop(columns=col_correlated)
for component, column_name in enumerate(('pca_0', 'pca_1', 'pca_2')):
    features_aim[column_name] = arr_corr_aim[:, component]

In [None]:
# Store the random forest predictions for the aim set in a new column and show
# the sum of the predicted labels (equals the number of positive predictions
# if the classes are coded 0/1 - TODO confirm)
df_aim['Prediction'] = model_rf.predict(features_aim)
sum(df_aim['Prediction'])