In [90]:
# Importing the required packages 
import numpy as np 
import pandas as pd 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_score

In [5]:
# Function importing Dataset 
def importdata(source=None):
    """Load the balance-scale dataset and print a short summary of it.

    Parameters
    ----------
    source : str, path-like or file-like, optional
        Location of the comma-separated data. It is passed straight to
        ``pandas.read_csv``, so a URL, a local path or an open text buffer
        all work. When omitted, the UCI repository URL is used, which keeps
        existing ``importdata()`` calls behaving exactly as before.

    Returns
    -------
    pandas.DataFrame
        The table as read, with integer column labels because the file is
        read with ``header=None`` (no header row).
    """
    if source is None:
        source = ('https://archive.ics.uci.edu/ml/machine-learning-'
                  'databases/balance-scale/balance-scale.data')

    balance_data = pd.read_csv(source, sep=',', header=None)

    # Printing the dataset shape
    print ("Dataset Length: ", len(balance_data))
    print ("Dataset Shape: ", balance_data.shape)

    # Printing the first dataset observations
    print ("Dataset: ",balance_data.head())
    return balance_data

In [8]:
# Load the dataset once at notebook level; the inspection cells below
# (info, head, per-class counts) all read this `balance_data` frame.
balance_data = importdata()

Dataset Length:  625
Dataset Shape:  (625, 5)
Dataset:     0  1  2  3  4
0  B  1  1  1  1
1  R  1  1  1  2
2  R  1  1  1  3
3  R  1  1  1  4
4  R  1  1  1  5


In [9]:
# Show per-column dtypes and non-null counts.
balance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625 entries, 0 to 624
Data columns (total 5 columns):
0    625 non-null object
1    625 non-null int64
2    625 non-null int64
3    625 non-null int64
4    625 non-null int64
dtypes: int64(4), object(1)
memory usage: 24.5+ KB


In [11]:
# Preview the first three rows.
balance_data.head(3)

Unnamed: 0,0,1,2,3,4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3


In [24]:
# Number of rows whose label in column 0 is 'B'.
int(balance_data.iloc[:, 0].eq('B').sum())

49

In [25]:
# Number of rows whose label in column 0 is 'R'.
int(balance_data.iloc[:, 0].eq('R').sum())

288

In [26]:
# Number of rows whose label in column 0 is 'L'.
int(balance_data.iloc[:, 0].eq('L').sum())

288

In [40]:
# Function to split the dataset 
def splitdataset(balance_data):
    """Separate features from the target and make a 70/30 train/test split.

    Parameters
    ----------
    balance_data : pandas.DataFrame
        Table whose column 0 holds the class label and whose columns 1-4
        hold the features.

    Returns
    -------
    tuple
        ``(X, Y, X_train, X_test, y_train, y_test)`` where ``X``/``Y`` are
        the full feature matrix and label vector and the remaining four are
        the pieces returned by ``train_test_split``.
    """
    # Select columns on the DataFrame *before* converting to NumPy.
    # `balance_data.values` on a frame mixing string and integer columns is
    # a single object-dtype array, so slicing it afterwards would hand the
    # model object-dtype features instead of integers.
    X = balance_data.iloc[:, 1:5].to_numpy()
    Y = balance_data.iloc[:, 0].to_numpy()

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)

    return X, Y, X_train, X_test, y_train, y_test

In [84]:
# Function to perform training with giniIndex. 
def train_using_gini(X_train, X_test, y_train):
    """Fit a decision tree that uses the "gini" split criterion.

    ``X_test`` is accepted but not used by this function.
    Returns the fitted ``DecisionTreeClassifier``.
    """
    gini_settings = {
        "criterion": "gini",
        "random_state": 0,
        "max_depth": 5,
        "min_samples_leaf": 2,
    }
    gini_tree = DecisionTreeClassifier(**gini_settings)

    # Train on the training split only.
    gini_tree.fit(X_train, y_train)
    return gini_tree
  

In [85]:
# Function to perform training with entropy. 
def train_using_entropy(X_train, X_test, y_train):
    """Fit a decision tree that uses the "entropy" split criterion.

    ``X_test`` is accepted but not used by this function.
    Returns the fitted ``DecisionTreeClassifier``.
    """
    entropy_settings = {
        "criterion": "entropy",
        "random_state": 0,
        "max_depth": 5,
        "min_samples_leaf": 2,
    }
    entropy_tree = DecisionTreeClassifier(**entropy_settings)

    # Train on the training split only.
    entropy_tree.fit(X_train, y_train)
    return entropy_tree

In [86]:
# Function to make predictions 
def prediction(X_test, clf_object):
    """Predict labels for ``X_test`` with ``clf_object`` and echo them.

    ``clf_object`` only needs a ``predict`` method. The predictions are
    printed under a "Predicted values:" heading and then returned unchanged.
    """
    predicted = clf_object.predict(X_test)
    # A newline separator puts the heading and the values on separate lines.
    print("Predicted values:", predicted, sep="\n")
    return predicted

In [87]:
# Function to calculate accuracy 
def cal_accuracy(y_test, y_pred):
    """Print evaluation metrics for predictions against the true labels.

    Prints, in order: the confusion matrix, the accuracy as a percentage,
    and the classification report. Nothing is returned.
    """
    print("Confusion Matrix: ")
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    # accuracy_score is a fraction; scale it to a percentage for display.
    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    print("Accuracy : ", accuracy_pct)

    report_text = classification_report(y_test, y_pred)
    print("Report : ", report_text)
  

In [105]:
# Driver code 
def main():
    """Run the whole experiment: load, split, train both trees and report.

    For the gini tree and then the entropy tree, this prints the test-set
    predictions, the metrics from ``cal_accuracy``, and the 5-fold
    cross-validation accuracy on the full data and on the training split.
    Nothing is returned.
    """
    # Building Phase
    dataset = importdata()
    X, Y, X_train, X_test, y_train, y_test = splitdataset(dataset)
    clf_gini = train_using_gini(X_train, X_test, y_train)
    clf_entropy = train_using_entropy(X_train, X_test, y_train)

    def report(heading, model, cv_runs):
        """Print hold-out metrics, then one score array per (X, y, caption)."""
        print('')
        print(heading)
        cal_accuracy(y_test, prediction(X_test, model))
        for features, labels, caption in cv_runs:
            # Scores are computed before the caption is printed, so any
            # output emitted during cross-validation appears above it.
            fold_scores = cross_val_score(
                model, features, labels, cv=5, scoring='accuracy')
            print(caption)
            print(fold_scores)

    # Operational Phase
    report("Results Using Gini Index:", clf_gini, (
        (X, Y, 'Cross Validation Accuracy Score:'),
        (X_train, y_train, 'Cross Validation Accuracy Score (train):'),
    ))
    report("Results Using Entropy:", clf_entropy, (
        (X, Y, 'Cross Validation Accuracy Score'),
        (X_train, y_train, 'Cross Validation Accuracy Score (Train)'),
    ))

In [106]:
# Calling main function: run the pipeline when this code executes as the
# top-level module; the guard skips it if the code is imported instead.
if __name__=="__main__": 
    main() 

Dataset Length:  625
Dataset Shape:  (625, 5)
Dataset:     0  1  2  3  4
0  B  1  1  1  1
1  R  1  1  1  2
2  R  1  1  1  3
3  R  1  1  1  4
4  R  1  1  1  5

Results Using Gini Index:
Predicted values:
['L' 'R' 'L' 'L' 'L' 'R' 'L' 'L' 'R' 'R' 'R' 'R' 'R' 'R' 'L' 'L' 'L' 'L'
 'L' 'L' 'L' 'R' 'R' 'L' 'L' 'L' 'L' 'L' 'R' 'R' 'L' 'L' 'L' 'L' 'R' 'L'
 'L' 'L' 'R' 'B' 'L' 'L' 'L' 'L' 'L' 'B' 'L' 'R' 'R' 'R' 'R' 'R' 'L' 'L'
 'R' 'B' 'R' 'L' 'L' 'L' 'L' 'R' 'L' 'L' 'R' 'L' 'R' 'R' 'L' 'R' 'R' 'R'
 'R' 'B' 'L' 'R' 'L' 'R' 'L' 'R' 'R' 'R' 'L' 'R' 'L' 'R' 'L' 'L' 'L' 'R'
 'L' 'L' 'L' 'R' 'L' 'R' 'L' 'L' 'R' 'B' 'L' 'L' 'L' 'B' 'L' 'R' 'L' 'R'
 'R' 'L' 'L' 'R' 'L' 'R' 'L' 'R' 'L' 'L' 'R' 'B' 'L' 'R' 'L' 'L' 'R' 'L'
 'L' 'R' 'R' 'R' 'L' 'L' 'R' 'R' 'R' 'R' 'R' 'R' 'L' 'L' 'R' 'R' 'R' 'L'
 'L' 'R' 'L' 'L' 'R' 'L' 'L' 'L' 'R' 'R' 'R' 'R' 'L' 'R' 'L' 'L' 'L' 'R'
 'L' 'L' 'R' 'L' 'R' 'L' 'L' 'B' 'L' 'R' 'L' 'B' 'R' 'R' 'L' 'R' 'R' 'L'
 'R' 'L' 'R' 'L' 'R' 'R' 'L' 'L']
Confusion Matrix: 
[[ 2  4  5]
 [