# Using Balance-Scale dataset

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Column names to assign to the CSV; the class label is the first column.
col = [
    'Class Name',
    'Left weight',
    'Left distance',
    'Right weight',
    'Right distance',
]
# Passing `names=` supplies the header, so every row of the file is read as data.
df = pd.read_csv('balance-scale.data', names=col)

In [None]:
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()

In [8]:
# Separate the target label from the remaining predictor columns.
y = df['Class Name']
x = df.drop(columns=['Class Name'])

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data never influences the fit.
sc = StandardScaler()
scaler = sc.fit(x_train)  # fit() returns the scaler itself, so `scaler is sc`
x_train_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_test)
)

In [27]:
# Create the Decision Tree classifier object.
# A fixed random_state makes the fitted tree (and the accuracies reported
# below) reproducible across kernel restarts; 0 matches the seed used by the
# pruned trees later in this notebook.
dec = DecisionTreeClassifier(random_state=0)

In [28]:
# Train the Decision Tree classifier on the scaled training features.
# NOTE: scikit-learn's fit() returns the estimator itself, so `history` is
# just another reference to `dec`, not a record of the training run.
history = dec.fit(x_train_scaled,y_train)

In [29]:
# Predict the response for the training dataset. The trailing underscore marks
# the train-set predictions; `y_pred` (no underscore) holds the test-set ones.
y_pred_ = dec.predict(x_train_scaled)

In [30]:
# Predict the response for the held-out test dataset.
y_pred = dec.predict(x_test_scaled)

In [15]:
# Compare accuracy on the training split against the held-out test split;
# a large gap between the two numbers indicates overfitting.
print("Training accuracy:",accuracy_score(y_train, y_pred_))
print("Testing accuracy:",accuracy_score(y_test, y_pred))

Training accuracy: 1.0
Testing accuracy: 0.776


This model is overfitting: it fits the training set perfectly but scores noticeably lower on the held-out test set.

In [None]:
# Create a depth-limited Decision Tree classifier to curb overfitting.
# random_state is fixed so the fitted tree and the reported accuracies are
# reproducible; 0 matches the seed used by the pruned trees later on.
dec = DecisionTreeClassifier(max_depth=3, random_state=0)

# Train the Decision Tree classifier. fit() returns the estimator itself, so
# `history` is just another reference to `dec`.
history = dec.fit(x_train_scaled,y_train)

# Predict the response for the train (`y_pred_`) and test (`y_pred`) datasets.
y_pred_ = dec.predict(x_train_scaled)
y_pred = dec.predict(x_test_scaled)

print("Training accuracy:",accuracy_score(y_train, y_pred_))
print("Testing accuracy:",accuracy_score(y_test, y_pred))

In [None]:
# io.StringIO is the standard-library buffer; on Python 3 it is what
# six.StringIO resolves to, so the extra `six` dependency is not needed.
from io import StringIO

from IPython.display import Image
import pydotplus
from sklearn.tree import export_graphviz

feature_cols = ['Left weight','Left distance','Right weight','Right distance']

dot_data = StringIO()

# export_graphviz writes the fitted tree as DOT text into the buffer, and
# pydotplus renders that DOT text to a PNG.
# class_names must follow the order of the fitted estimator's classes, so it
# is taken from `dec.classes_` instead of being hard-coded.
export_graphviz(dec, out_file=dot_data,
                filled=True, rounded=True,
                feature_names=feature_cols,
                class_names=[str(label) for label in dec.classes_])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [22]:
# Compute the cost-complexity pruning path on a fresh estimator configured
# exactly like the trees trained from these alphas (unrestricted depth,
# random_state=0). Calling this on `dec` would use whatever hyperparameters
# that variable currently carries (e.g. the depth-limited tree), giving alphas
# that do not correspond to the trees being pruned.
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(
    x_train_scaled, y_train
)
ccp_alphas = path.ccp_alphas

In [23]:
# Fit one tree per candidate alpha, in the same order as `ccp_alphas`.
# A comprehension keeps the loop variables local, so the notebook-level `dec`
# is not silently rebound to the last (most heavily pruned) tree.
decs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(x_train_scaled, y_train)
    for alpha in ccp_alphas
]

In [None]:
import matplotlib.pyplot as plt

# Accuracy of every pruned tree on both splits (y_true first, y_pred second,
# matching the argument order used in the rest of the notebook).
train_acc = [accuracy_score(y_train, d.predict(x_train_scaled)) for d in decs]
test_acc = [accuracy_score(y_test, d.predict(x_test_scaled)) for d in decs]

# One line with point markers per split; labelled axes let the figure stand
# on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(ccp_alphas, train_acc, marker='o', label='train_accuracy')
ax.plot(ccp_alphas, test_acc, marker='o', label='test_accuracy')
ax.set_xlabel('ccp_alpha')
ax.set_ylabel('accuracy')
ax.set_title('Accuracy vs alpha')
ax.legend()
plt.show()

In [None]:
# Refit a single tree with a fixed cost-complexity pruning strength.
# NOTE(review): ccp_alpha=0.02 is presumably read off the accuracy-vs-alpha
# plot — confirm it still matches after re-running the notebook.
dec_ = DecisionTreeClassifier(random_state=0,ccp_alpha=0.02)
dec_.fit(x_train_scaled,y_train)

In [None]:
# Evaluate the pruned tree on both splits. This rebinds the `y_pred_` (train)
# and `y_pred` (test) names used by the earlier evaluations.
y_pred_ = dec_.predict(x_train_scaled)
y_pred = dec_.predict(x_test_scaled)

print("Training accuracy:",accuracy_score(y_train, y_pred_))
print("Testing accuracy:",accuracy_score(y_test, y_pred))

## Using wine quality dataset

In [None]:
# Load the wine quality data. NOTE: this rebinds `df`, replacing the
# balance-scale frame used in the previous section.
df = pd.read_csv('winequality.csv')

In [None]:
# Inspect dtypes and non-null counts to spot columns with missing values.
df.info()

In [None]:
# Drop every row that contains at least one missing value.
df = df.dropna()

In [None]:
# Use `type` as the target and every remaining column as a feature.
y = df['type']
x = df.drop(columns=['type'])

## Using cancer data

In [None]:
# Load the cancer data. NOTE: this rebinds `df`, replacing the wine quality
# frame used in the previous section.
df = pd.read_csv('Cancer.csv')

In [None]:
# Remove the columns that are not used as features.
# errors='ignore' keeps this cell idempotent: because it rebinds `df`, running
# it a second time would otherwise raise KeyError for the already-dropped columns.
# NOTE(review): 'Unnamed: 32' is presumably an empty column produced by a
# trailing delimiter in the CSV — confirm against the file.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Use `diagnosis` as the target and every remaining column as a feature.
y = df['diagnosis']
x = df.drop(columns='diagnosis')