In [1]:
# Required Python Packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# File Path
INPUT_PATH = "PythonData/breast-cancer-wisconsin.data"

In [3]:
# Headers 
headers = ["CodeNumber", "ClumpThickness", "UniformityCellSize", "UniformityCellShape", "MarginalAdhesion",
               "SingleEpithelialCellSize", "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses",
               "CancerType"]

In [19]:
# Load the dataset into Pandas data frame
dataset = pd.read_csv(INPUT_PATH)

In [20]:
# Add the headers to the loaded dataset
dataset.columns = headers

In [23]:
 # This function simply prints out the descriptive stats for the dataset
print(dataset.describe())

         CodeNumber  ClumpThickness  UniformityCellSize  UniformityCellShape  \
count  6.980000e+02      698.000000          698.000000           698.000000   
mean   1.071807e+06        4.416905            3.137536             3.210602   
std    6.175323e+05        2.817673            3.052575             2.972867   
min    6.163400e+04        1.000000            1.000000             1.000000   
25%    8.702582e+05        2.000000            1.000000             1.000000   
50%    1.171710e+06        4.000000            1.000000             1.000000   
75%    1.238354e+06        6.000000            5.000000             5.000000   
max    1.345435e+07       10.000000           10.000000            10.000000   

       MarginalAdhesion  SingleEpithelialCellSize  BlandChromatin  \
count        698.000000                698.000000      698.000000   
mean           2.809456                  3.217765        3.438395   
std            2.856606                  2.215408        2.440056   
min

In [24]:
# We are missing values in the BareNuclie field - this command will remove those records
dataset = dataset[dataset[headers[6]] != '?']

In [25]:
print(dataset.describe())

         CodeNumber  ClumpThickness  UniformityCellSize  UniformityCellShape  \
count  6.820000e+02      682.000000          682.000000           682.000000   
mean   1.076833e+06        4.441349            3.153959             3.218475   
std    6.210926e+05        2.822751            3.066285             2.989568   
min    6.337500e+04        1.000000            1.000000             1.000000   
25%    8.774540e+05        2.000000            1.000000             1.000000   
50%    1.171820e+06        4.000000            1.000000             1.000000   
75%    1.238741e+06        6.000000            5.000000             5.000000   
max    1.345435e+07       10.000000           10.000000            10.000000   

       MarginalAdhesion  SingleEpithelialCellSize  BlandChromatin  \
count        682.000000                682.000000      682.000000   
mean           2.832845                  3.236070        3.445748   
std            2.865805                  2.224214        2.451435   
min

In [27]:
 # Split dataset into train and test dataset
train_x, test_x, train_y, test_y = train_test_split(dataset[headers[1:-1]], dataset[headers[-1]],
                                                        train_size=0.7)



In [28]:
# Train and Test dataset size details
print( "Train_x Shape :: ", train_x.shape)
print( "Train_y Shape :: ", train_y.shape)
print( "Test_x Shape :: ", test_x.shape)
print( "Test_y Shape :: ", test_y.shape)

Train_x Shape ::  (477, 9)
Train_y Shape ::  (477,)
Test_x Shape ::  (205, 9)
Test_y Shape ::  (205,)


In [30]:
# Here we create the Random Forest Classifier
  
clf = RandomForestClassifier()
trained_model = clf.fit(train_x, train_y)

In [31]:
print( "Trained model :: ", trained_model)

Trained model ::  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [32]:
# Check accuracy on test set
predictions = trained_model.predict(test_x)

In [33]:
print( "Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
print( "Test Accuracy  :: ", accuracy_score(test_y, predictions))

Train Accuracy ::  1.0
Test Accuracy  ::  0.941463414634


In [34]:
print( " Confusion matrix ", confusion_matrix(test_y, predictions))

 Confusion matrix  [[125   8]
 [  4  68]]


In [1]:
# Here is another dataset included in sklearn
from sklearn.datasets import load_wine

In [2]:
# We load the dataset ...
wine = load_wine()