## Creating a decision tree


We will compile a report covering the following:

- descriptive statistics of the data set
- whether the data set is balanced
- whether cancer can be predicted from the available diagnostic measurements
- the accuracy of the model.

## 1. Prepare your workstation

In [None]:
# import all the necessary packages
import numpy as np
import pandas as pd

## 2. Import the data set

In [None]:
# Read the CSV file into a DataFrame; the 'id' column becomes the row index.
df = pd.read_csv('breast_cancer_data.csv', index_col='id')

# Summarise the columns, their dtypes and their non-null counts.
df.info()

In [None]:
# Count the missing (NaN) entries in every column.
df.isna().sum()

In [None]:
# descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Every value in the 'Unnamed: 32' column is null, so remove the column in place.
df.drop(columns=['Unnamed: 32'], inplace=True)

In [None]:
# relative frequency (proportion of rows) of each diagnosis class
df['diagnosis'].value_counts(normalize=True)

In [None]:
# determine if data set is balanced: absolute number of rows per diagnosis class
df['diagnosis'].value_counts()

## 3. Create a decision tree model

In [None]:
# import necessary packages
import imblearn
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

# The label to predict; every other column is used as a predictor.
target_col = 'diagnosis'
feature_cols = df.columns.drop(target_col).tolist()

y = df[target_col]
X = df[feature_cols]

# Split into training and test sets (default split size); the fixed seed
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Create the decision tree classifier object: Gini impurity as the split
# criterion, depth capped at 4, fixed seed for reproducible results.
dtc = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)

# Train the classifier on the training split only. Fitting on the full X/y
# would expose the test rows to the model (data leakage) and make the
# accuracy measured on X_test overly optimistic.
dtc = dtc.fit(X_train, y_train)

# Predict the response for the held-out test set
y_pred = dtc.predict(X_test)

## 4. Calculate accuracy of model

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Rows are the true classes, columns the predicted classes.
# The result is stored under its own name: assigning it to `confusion_matrix`
# would rebind the imported function to an array, so any later call to
# confusion_matrix(...) in the session would fail.
conf_matrix = confusion_matrix(y_test, y_pred)

print(conf_matrix)

In [None]:
# Same report as used for the previous models, so the models can be compared.
from sklearn.metrics import classification_report

# Text summary of the test-set predictions against the true labels.
print(classification_report(y_true=y_test, y_pred=y_pred))

## 5. Plot the decision tree

In [None]:
import matplotlib.pyplot as plt
from sklearn import tree

# Plot the fitted decision tree (splits chosen with the Gini index).
fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(
    dtc,
    # Label split nodes with the column names instead of x[i].
    feature_names=list(X.columns),
    # Label the majority class in each node; classes_ is already in the
    # sorted order that plot_tree expects.
    class_names=[str(label) for label in dtc.classes_],
    fontsize=10,
    # Draw on the axes created above rather than relying on the implicit
    # "current axes".
    ax=ax,
)

plt.show()