## Creating a random forest


We will compile a report of the following:

- descriptive statistics of the data set
- whether the data set is balanced
- whether cancer can be predicted from the measurements in the data set
- the accuracy of the model

## 1. Prepare your workstation

In [None]:
# import all the necessary packages
import numpy as np
import pandas as pd

## 2. Import the data set

In [None]:
# Read the CSV file into a DataFrame, using the 'id' column as the row index.
df = pd.read_csv('breast_cancer_data.csv', index_col='id')

# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count the missing (null/NaN) values in each column.
df.isna().sum()

In [None]:
# descriptive statistics: count, mean, std, min, quartiles and max
# of each numeric column
df.describe()

In [None]:
# Every value in the 'Unnamed: 32' column is null, so drop the column.
# errors='ignore' makes the cell safe to re-run: without it, a second run
# (or a CSV that lacks the column) raises KeyError.
df.drop(labels='Unnamed: 32', axis=1, inplace=True, errors='ignore')

In [None]:
# proportion of rows belonging to each diagnosis class
df['diagnosis'].value_counts(normalize=True)

In [None]:
# determine if data set is balanced: number of rows per diagnosis class
df['diagnosis'].value_counts()

## 3. Create a random forest model

In [None]:
# import necessary packages
import imblearn
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import warnings

# NOTE: this silences every warning from here on, including deprecation
# warnings raised by the libraries used in later cells.
warnings.filterwarnings("ignore")

# divide data into attributes (features) and labels: 'diagnosis' is the
# label and every other column is used as a feature.
# (An earlier positional split via df.iloc was removed: it was immediately
# overwritten by the name-based selection below and never took effect.)
target_col = 'diagnosis'
feature_cols = [c for c in df.columns if c != target_col]

X = df[feature_cols]
y = df[target_col]

# split into training and test sets; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build the random forest classifier.
# max_features='sqrt' replaces the former 'auto' setting: for classifiers
# 'auto' meant sqrt(n_features), but it was deprecated in scikit-learn 1.1
# and removed in 1.3, where passing it raises an error.
forest = RandomForestClassifier(
    n_estimators=200,
    criterion='gini',
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    n_jobs=-1,          # use all available CPU cores
    random_state=42,    # fixed seed for reproducible trees
)

# Fit on the training data, then predict labels for the test data.
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

## 4. Calculate accuracy of model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compare the test-set predictions with the true labels and print, in order:
# the confusion matrix, the per-class classification report, and the accuracy.
for metric_fn in (confusion_matrix, classification_report, accuracy_score):
    print(metric_fn(y_test, y_pred))

## 5. Plot a single tree from the random forest

In [None]:
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree import export_graphviz

# Pick the first decision tree of the fitted forest.
first_tree = forest.estimators_[0]

# Draw it with colour-filled nodes on a single 4x4-inch, 800-dpi figure
# and save the figure as a PNG file.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)
tree.plot_tree(first_tree, filled=True);
fig.savefig('rf_individualtree.png')