<a href="https://colab.research.google.com/github/Daksh-Raghuvanshi/Machine-Learnig-Projects/blob/main/Decision_Tree_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decision Tree Classifier for Breast Cancer Detection


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Show every column when a wide DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load scikit-learn's built-in breast cancer dataset. The returned object exposes
# the pieces used below: .data, .target, .feature_names and .target_names.
data = load_breast_cancer()

In [None]:
# Feature matrix: one row per sample, one column per feature.
data.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [None]:
# Names of the feature columns; used below as the DataFrame column labels.
data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [None]:
# Class labels: target value 0 is 'malignant' and target value 1 is 'benign'.
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [None]:
# Combine the feature matrix and the target vector into one labelled DataFrame.
# The column labels must be a flat list: wrapping them in an extra list
# (`columns=[[...]]`) makes pandas build a one-level MultiIndex, which turns
# every column label into a tuple and hides the feature names from scikit-learn.
# np.c_ upcasts the integer target to float, hence 0.0 / 1.0 in the last column.
column_names = list(data.feature_names) + ['target']
df = pd.DataFrame(np.c_[data.data, data.target], columns=column_names)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [None]:
# Features are every column except the last; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)
# Sanity-check the shapes of the four resulting pieces.
print(f"{X_train.shape} ,{X_test.shape} , {y_train.shape} , {y_test.shape} ")

(455, 30) ,(114, 30) , (455,) , (114,) 


## **Let's try building the model without scaled data**

In [None]:
# Baseline: Gini-impurity tree trained on the raw (unscaled) features.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can score differently from one run to the next.
dtc = DecisionTreeClassifier(criterion='gini', random_state=51)
dtc.fit(X_train, y_train)
# Accuracy on the held-out test split, expressed as a percentage.
dtc.score(X_test, y_test) * 100



92.98245614035088

In [None]:
# Same experiment with the entropy (information gain) criterion, still on raw features.
# This rebinds `dtc`; the single-patient prediction further down uses this model.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can score differently from one run to the next.
dtc = DecisionTreeClassifier(criterion='entropy', random_state=51)
dtc.fit(X_train, y_train)
# Accuracy on the held-out test split, expressed as a percentage.
dtc.score(X_test, y_test) * 100



92.10526315789474

# And now with the scaled data

In [None]:
# Learn the per-feature mean and scale from the training split only, so that
# nothing from the test split influences the scaling.
sc = StandardScaler().fit(X_train)
# Show the learned statistics: means first, then scales, one array per line.
print(sc.mean_, sc.scale_, sep="\n")

[1.41470967e+01 1.94833626e+01 9.21961099e+01 6.57439780e+02
 9.69954725e-02 1.07245626e-01 9.26326191e-02 5.03446659e-02
 1.82349011e-01 6.32115165e-02 4.18128352e-01 1.24280374e+00
 2.96717604e+00 4.17664286e+01 7.13407912e-03 2.65736088e-02
 3.35115815e-02 1.21546022e-02 2.10194484e-02 3.92049912e-03
 1.63035714e+01 2.58970549e+01 1.07628132e+02 8.84022857e+02
 1.32800615e-01 2.59909516e-01 2.80797582e-01 1.16446024e-01
 2.91661978e-01 8.45212088e-02]
[3.54999241e+00 4.28490844e+00 2.44819239e+01 3.53257254e+02
 1.45825736e-02 5.38700064e-02 8.10907160e-02 3.89378424e-02
 2.84083913e-02 7.41578359e-03 2.87019742e-01 5.60393480e-01
 2.11272475e+00 4.79004633e+01 3.09766785e-03 1.88405177e-02
 3.24732099e-02 6.47725842e-03 8.78080011e-03 2.82722945e-03
 4.80787310e+00 6.16164154e+00 3.35092238e+01 5.65240059e+02
 2.36065041e-02 1.59786053e-01 2.13343562e-01 6.61843255e-02
 6.35296361e-02 1.88147890e-02]




In [None]:
# Apply the statistics learned from the training split to both splits.
X_train_sc, X_test_sc = (sc.transform(split) for split in (X_train, X_test))



Model Training and Testing using Decision Tree Classifier

In [None]:
# Gini-impurity tree trained on the standardized features.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split; without a seed, run-to-run noise would be mistaken for an effect
# of scaling when comparing against the unscaled model.
dtc_gini = DecisionTreeClassifier(criterion='gini', random_state=51)
dtc_gini.fit(X_train_sc, y_train)
# Accuracy on the standardized test split, expressed as a percentage.
dtc_gini.score(X_test_sc, y_test) * 100


93.85964912280701

In [None]:
# Entropy-criterion tree trained on the standardized features.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split; without a seed, run-to-run noise would be mistaken for an effect
# of scaling when comparing against the unscaled model.
dtc_entropy = DecisionTreeClassifier(criterion='entropy', random_state=51)
dtc_entropy.fit(X_train_sc, y_train)
# Accuracy on the standardized test split, expressed as a percentage.
dtc_entropy.score(X_test_sc, y_test) * 100

92.10526315789474

## **Scaling the data is not necessary for a decision tree: it splits on per-feature thresholds, so scaling does not meaningfully affect the accuracy.**

Let's try a prediction on actual data

In [None]:
# Raw (unscaled) measurements for one patient, in the dataset's feature order.
# These are the same values as row 0 of the DataFrame shown by df.head(), so the
# sample comes from the dataset itself rather than from a new, unseen patient.
patient_1 = [
    # "mean" features
    17.99, 10.38, 122.8, 1001.0, 0.1184,
    0.2776, 0.3001, 0.1471, 0.2419, 0.07871,
    # "error" features
    1.095, 0.9053, 8.589, 153.4, 0.006399,
    0.04904, 0.05373, 0.01587, 0.03003, 0.006193,
    # "worst" features
    25.38, 17.33, 184.6, 2019.0, 0.1622,
    0.6656, 0.7119, 0.2654, 0.4601, 0.1189,
]

In [None]:
# `dtc` was fit on the raw, unscaled training features, so the raw measurements
# are passed as-is (they must NOT be run through the StandardScaler first).
# Labelling the one-row sample with the training columns makes the expected
# feature order explicit and keeps the input consistent with what the model saw
# during fit.
patient_1_features = pd.DataFrame([patient_1], columns=X.columns)
prediction = dtc.predict(patient_1_features)

In [None]:
# Class 0 is 'malignant' and class 1 is 'benign' (see data.target_names).
message = (
    "Malignant - The patient has Cancer"
    if prediction[0] == 0
    else "Benign - The patient has NO Cancer"
)
print(message)

Malignant - The patient has Cancer
