In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sns.set()

In [None]:
# Load the dataset and drop the 'id' and 'Unnamed: 32' columns in one step;
# neither column is referenced anywhere else in the notebook.
data = pd.read_csv('data.csv').drop(columns=['id', 'Unnamed: 32']).copy()
data.head()

Check whether any data is missing, and get an idea of what the data looks like.

In [None]:
# List the columns, fail fast if any column contains a missing value,
# then show summary statistics.
print(data.columns)
missing_per_column = data.isnull().sum()
assert missing_per_column.max() == 0
data.describe()

In [None]:
# Number of observations per diagnosis class.
sns.countplot(data=data, x="diagnosis")

Exploratory analysis
---

Here we observe that malignant observations have a larger radius, and therefore also a larger area and perimeter. There is also a clear trend towards more concave points and higher concavity values when the observation is malignant.

In [None]:
# Pairwise relationships between the columns at positions 1, 3, 4, 7 and 8,
# coloured by diagnosis. `vars` receives the column names.
pairplot_columns = list(data.columns[[1, 3, 4, 7, 8]])
sns.pairplot(data=data, vars=pairplot_columns, hue='diagnosis', corner=True)

In [None]:
# Correlation heatmap of the first `size` feature columns.
size = 10

plt.figure(figsize=(size, size))

# seaborn hides the cells where the mask is True: mask the diagonal and the
# upper triangle so that every pair of features is shown exactly once.
mask = np.triu(np.ones((size, size), dtype=bool))
print(mask.shape)

# Column 0 is the diagnosis label, which has not been label-encoded yet at this
# point. Start at column 1 so that .corr() only sees the feature columns;
# otherwise recent pandas versions raise on the non-numeric column, and the
# result would not match the (size, size) mask.
correlated_data = data.iloc[:, 1:size + 1].corr()
sns.heatmap(correlated_data, annot=True, fmt='.0%', mask=mask)

Preprocessing
---
Label encode diagnosis

Split the data into features (`X`) and the target (`y`).

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the diagnosis column by its integer encoding. Assigning by column
# name swaps in a new integer column; writing through `.iloc[:, 0] = ...` can
# instead update the existing column in place and keep its object dtype,
# which scikit-learn classifiers may reject as an unknown label type.
data['diagnosis'] = LabelEncoder().fit_transform(data['diagnosis'])

# Features are every column except the label; the label is the target.
X = data.drop(columns='diagnosis').copy()
y = data['diagnosis']

In [None]:
# IMO it is debatable how useful it is to remove columns this way.

correlation = X.corr()
correlated_features = set()

# Walk the strict lower triangle of the correlation matrix. Whenever a column
# correlates with an earlier column above 0.9 in absolute value (but not
# exactly 1), mark the later column for removal.
for row, colname in enumerate(correlation.columns):
    for col in range(row):
        strength = abs(correlation.iloc[row, col])
        if strength > 0.9 and strength != 1:
            correlated_features.add(colname)

X = X.drop(columns=correlated_features).copy()

print(X.shape)

correlated_features

In [None]:
# We already have this in the exploratory analysis, on the x=-y axis

# # data distribution
# for column in data.columns[1:11]:
#     sns.displot(data[column], rug = True)
#     plt.show()

Outliers

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import make_pipeline

# Detect outliers on the measurements only. The target column must not
# influence which rows are flagged, and dropping a pre-existing 'outlier'
# column keeps the cell idempotent when it is re-run.
outlier_features = data.drop(columns=['diagnosis', 'outlier'], errors='ignore')

# LocalOutlierFactor is distance based, so the features are scaled first.
pipeline = make_pipeline(MinMaxScaler(), LocalOutlierFactor())

# fit_predict labels outliers with -1; store the flag as a boolean column.
# Assigning the array directly avoids index alignment, so it stays correct
# even if `data` does not have a default RangeIndex.
outliers = pipeline.fit_predict(outlier_features) == -1
data["outlier"] = outliers
data.head()

In [None]:
# This is quite obviously not very accurate; the LocalOutlierFactor settings
# probably need a closer look.
first_columns = list(data.columns[:8])
sns.pairplot(data, vars=first_columns, hue='outlier')

In [None]:
from sklearn.model_selection import train_test_split

# Only the measurements are features. The target is excluded, and so is the
# 'outlier' flag added by the outlier-detection cell (if it has been run),
# because it is a derived diagnostic column rather than a measurement.
X = data.drop(columns=['diagnosis', 'outlier'], errors='ignore')
y = data['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and reuse it for the test rows, so
# that no information about the test set leaks into preprocessing.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# X_train, X_validation, y_train, y_validation = train_test_split(
#     X_train, y_train, test_size=0.2, random_state=42
# )

In [None]:
# Where does this come from? What is it, anyway?
# Answer: SMOTE comes from the imbalanced-learn package. It oversamples the
# minority class with synthetic samples, so that the resampled training set
# (X_res, y_res) is class-balanced.

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=43, k_neighbors=30)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Pass the labels by keyword: the first positional parameter of countplot is
# `data` in current seaborn, so a bare positional Series is not read as `x`.
sns.countplot(x=y_res)

Decision tree model
---
Grid search

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values 1..18, each exactly once. (An integer linspace with 21
# points over this range repeats some depths, which makes the search fit the
# same candidate several times.)
range_depth = np.arange(1, 19)
grid_parameter = {'max_depth': range_depth}

# Seed the tree so that repeated runs of the search give the same scores.
dtc = DecisionTreeClassifier(random_state=0)

# This grid_clf_1 is here only for the plot, right?
grid_clf_1 = GridSearchCV(dtc, grid_parameter, scoring = 'f1', cv=10)
grid_clf_1.fit(X_train, y_train)
scores = grid_clf_1.cv_results_['mean_test_score']

# Mean cross-validated F1 as a function of the maximum tree depth.
plt.figure()
plt.plot(range_depth, scores, 'b')

plt.xlabel('max_depth')
plt.ylabel('F1')

plt.show()

In [None]:
# Joint search over tree depth, minimum leaf size and split criterion,
# ranked by F1 with 10-fold cross-validation.
grid_parameters = {
    'max_depth': range_depth,
    'min_samples_leaf': range_depth,
    'criterion': ['gini', 'entropy'],
}
grid_clf_2 = GridSearchCV(estimator=dtc, param_grid=grid_parameters, scoring='f1', cv=10)
grid_clf_2.fit(X_train, y_train)
grid_clf_2.best_params_

In [None]:
# Decision tree with hand-picked hyperparameters, wrapped in a one-step pipeline.
from sklearn.tree import DecisionTreeClassifier

fixed_tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=9,
    min_samples_leaf=2,
    random_state=0,
)
dtc_pipeline = make_pipeline(fixed_tree)
dtc_pipeline.fit(X_train, y_train)

Compare our models

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Same number of folds for every model so the cross-validated column is comparable.
CV_FOLDS = 10


def evaluate_model(model):
    """Score one fitted model on the held-out test set and by cross-validation.

    Parameters
    ----------
    model : fitted scikit-learn estimator (pipeline or grid search)

    Returns
    -------
    tuple
        (test F1, test recall, test accuracy, mean cross-validated accuracy
        on the training set).

    Accuracy is computed explicitly with ``accuracy_score`` and
    ``scoring='accuracy'``. A grid search created with ``scoring='f1'`` reports
    F1 from its own ``.score`` method, so relying on ``.score`` would mix
    metrics across models.
    """
    test_pred = model.predict(X_test)
    return (
        f1_score(y_test, test_pred),
        recall_score(y_test, test_pred),
        accuracy_score(y_test, test_pred),
        # For a grid-search object this re-runs the whole search inside every
        # fold (nested cross-validation), which can take a while.
        cross_val_score(model, X_train, y_train, cv=CV_FOLDS, scoring='accuracy').mean(),
    )


# Each model is scored on its own predictions; previously the accuracy entry of
# both grid searches was taken from dtc_pipeline.
dtc_scores = evaluate_model(dtc_pipeline)
grid_1_scores = evaluate_model(grid_clf_1)
grid_2_scores = evaluate_model(grid_clf_2)

# Later cells read `pred`; keep it bound to the predictions of the last model.
pred = grid_clf_2.predict(X_test)

In [None]:

decision_tree_methods_scores = [dtc_scores, grid_1_scores, grid_2_scores]
names_of_dtc_methods = ["dtc_scores", "grid_1_scores", "grid_2_scores"]

# Transpose the score tuples so that each iteration handles one metric across
# all methods, then print the name of the first method with the highest value.
for metric_scores in zip(*decision_tree_methods_scores):
    best_name = names_of_dtc_methods[0]
    best_score = 0
    for name, score in zip(names_of_dtc_methods, metric_scores):
        if score > best_score:
            best_name, best_score = name, score
    print(best_name)

In [None]:
# Confusion matrix for the predictions currently held in `pred`
# (when the notebook is run top to bottom: those of grid_clf_2).
confusion_matrix(y_test, pred)
###

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score

# Shallow random forest trained on the SMOTE-resampled training set.
# The forest is seeded so that the reported scores are reproducible between runs.
RFC_pipeline = make_pipeline(RandomForestClassifier(max_depth=3, random_state=0))
RFC_pipeline.fit(X_res, y_res)

In [None]:
# Random forest on the test set: the pipeline's score is printed, recall is the cell output.
pred = RFC_pipeline.predict(X_test)

rfc_test_score = RFC_pipeline.score(X_test, y_test)
print(round(rfc_test_score, 2))
recall_score(y_test, pred)

In [None]:
# F1 score on the test set for the predictions currently held in `pred`.
f1_score(y_test, pred)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with 7 neighbours, trained on the
# (non-resampled) training set.
KNN = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
pred = KNN.predict(X_test)

# The classifier's test score is printed; F1 is the cell output.
knn_test_score = KNN.score(X_test, y_test)
print(round(knn_test_score, 2))
f1_score(y_test, pred)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with decision stumps, trained on the SMOTE-resampled training set.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01,
    max_depth=1, min_samples_leaf=5, random_state=0).fit(X_res, y_res)
pred = clf.predict(X_test)

# Score on the original (non-resampled) training set, then on the test set.
print(round(clf.score(X_train, y_train), 2))
print(round(clf.score(X_test, y_test), 2))

# The labels are already integer encoded, so the metrics are computed on
# y_test and pred directly. Re-encoding both with separately fitted encoders
# could map the classes differently if one class were absent from `pred`.
# Recall is printed because only the last expression of a cell is displayed.
print(recall_score(y_test, pred))
f1_score(y_test, pred)