<a href="https://colab.research.google.com/github/RinzCSGO/machine-learning/blob/learn/DecisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Title**:  Balance Scale Weight & Distance Database

[data source](https://archive.ics.uci.edu/ml/datasets/balance+scale)

 **Relevant Information**

This data set was generated to model psychological
	experimental results.  Each example is classified as having the
	balance scale tip to the right, tip to the left, or be
	balanced.  The attributes are the left weight, the left
	distance, the right weight, and the right distance.  The
	correct way to find the class is the greater of 
	(left-distance * left-weight) and (right-distance *
	right-weight).  If they are equal, it is balanced.


  **Number of Instances**: 625 (49 balanced, 288 left, 288 right)

  **Number of Attributes**: 4 (numeric) + class name = 5

**Attribute Information**:

	1. Class Name: 3 (L, B, R)
	2. Left-Weight: 5 (1, 2, 3, 4, 5)
	3. Left-Distance: 5 (1, 2, 3, 4, 5)
	4. Right-Weight: 5 (1, 2, 3, 4, 5)
	5. Right-Distance: 5 (1, 2, 3, 4, 5)

**Missing Attribute Values**: none

**Class Distribution**: 

     1. 46.08 percent are L
     2. 07.84 percent are B
     3. 46.08 percent are R


In [None]:
# import
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

In [None]:
# Function import Dataset
def importdata(url=None):
  """Load the balance-scale CSV into a DataFrame and print a short summary.

  Args:
    url: Path or URL of a comma-separated file that has no header row.
      When omitted, the balance-scale file hosted on GitHub is read.

  Returns:
    The loaded DataFrame. Because the file is read with ``header=None``
    the columns are labelled with integers starting at 0.
  """
  if url is None:
    url = 'https://raw.githubusercontent.com/RinzCSGO/machine-learning/main/data/ml-basics/balance-scale.csv'
  balance_data = pd.read_csv(url, sep=',', header=None)

  # Print the dataset size
  print("Dataset Length: ", len(balance_data))
  print("Dataset Shape: ", balance_data.shape)

  # Print the first observations. The preview spans several lines, so it
  # starts on its own line; printing it after the label on the same line
  # shifts the column header out of alignment with the rows.
  print("Dataset: ")
  print(balance_data.head())

  return balance_data

In [None]:
# Function split the dataset
def splitdataset(balance_data, test_size=0.3, random_state=100):
  """Separate features from the target and make a train/test split.

  Args:
    balance_data: DataFrame whose column 0 holds the class label and
      whose columns 1-4 hold the features.
    test_size: Forwarded to ``train_test_split``; defaults to 0.3.
    random_state: Seed forwarded to ``train_test_split``; defaults to 100.

  Returns:
    Tuple ``(X, Y, X_train, X_test, y_train, y_test)``.
  """
  # Slice by position before converting to NumPy. Calling ``.values`` on
  # the whole frame first mixes the label column with the numeric columns
  # and yields an object-dtype array for the features as well.
  X = balance_data.iloc[:, 1:5].to_numpy()
  Y = balance_data.iloc[:, 0].to_numpy()

  # Split the dataset into train and test
  X_train, X_test, y_train, y_test = train_test_split(
      X, Y, test_size=test_size, random_state=random_state)

  return X, Y, X_train, X_test, y_train, y_test

In [None]:
# Function to perform training with giniIndex
def train_using_gini(X_train, X_test, y_train):
  """Fit a decision tree that uses the Gini criterion.

  Args:
    X_train: Training features passed to ``fit``.
    X_test: Not used by this function.
    y_train: Training labels passed to ``fit``.

  Returns:
    The fitted ``DecisionTreeClassifier``.
  """
  # Hyper-parameters gathered in one mapping and unpacked into the constructor.
  tree_params = {
      "criterion": "gini",
      "random_state": 100,
      "max_depth": 3,
      "min_samples_leaf": 5,
  }
  gini_tree = DecisionTreeClassifier(**tree_params)

  gini_tree.fit(X_train, y_train)
  return gini_tree

In [None]:
# Function to perform training with entropy
def train_using_entropy(X_train, X_test, y_train):
  """Fit a decision tree that uses the entropy criterion.

  Args:
    X_train: Training features passed to ``fit``.
    X_test: Not used by this function.
    y_train: Training labels passed to ``fit``.

  Returns:
    The fitted ``DecisionTreeClassifier``.
  """
  # Hyper-parameters gathered in one mapping and unpacked into the constructor.
  tree_params = {
      "criterion": "entropy",
      "random_state": 100,
      "max_depth": 3,
      "min_samples_leaf": 5,
  }
  entropy_tree = DecisionTreeClassifier(**tree_params)

  entropy_tree.fit(X_train, y_train)
  return entropy_tree

In [None]:
# Function to make predictions
def prediction(X_test, clf_object):
  """Predict labels for ``X_test`` with the given classifier and print them.

  Args:
    X_test: Samples handed to ``clf_object.predict``.
    clf_object: Object exposing a ``predict`` method (either of the
      trained trees works; nothing here is specific to one criterion).

  Returns:
    Whatever ``clf_object.predict(X_test)`` returns.
  """
  predicted = clf_object.predict(X_test)
  # The newline separator puts the values on the line after the label.
  print("Predicted values: ", predicted, sep="\n")
  return predicted

In [None]:
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
  """Print the confusion matrix, the accuracy in percent, and the class report.

  Args:
    y_test: True labels.
    y_pred: Predicted labels to compare against ``y_test``.

  Returns:
    None; the metrics are only printed.
  """
  # The matrix and the report span several lines, so each starts on its own
  # line; printing them after the label misaligns the first row.
  print("Confusion Matrix: ")
  print(confusion_matrix(y_test, y_pred))

  print("Accuracy :",
        accuracy_score(y_test, y_pred)*100)

  # A class that is never predicted has undefined precision. The default
  # setting reports 0 and raises a warning per metric; zero_division=0
  # reports the same 0 without the warnings.
  print("Report : ")
  print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# main
def main():
  """Run the pipeline: load data, split, train both trees, report on each."""
  # Building Phase
  dataset = importdata()
  # The full X and Y arrays are returned by the split but not needed here.
  _, _, X_train, X_test, y_train, y_test = splitdataset(dataset)
  fitted_trees = (
      ("Results Using Gini Index:", train_using_gini(X_train, X_test, y_train)),
      ("Results Using Entropy:", train_using_entropy(X_train, X_test, y_train)),
  )

  # Operational Phase: predict on the test set and score each tree in turn
  for heading, tree in fitted_trees:
    print(heading)
    cal_accuracy(y_test, prediction(X_test, tree))
                              

In [None]:
# Calling main function 
if __name__ == "__main__":
  # Entry point: run the full load / train / evaluate pipeline.
  main()

Dataset Length:  625
Dataset Shape:  (625, 5)
Dataset:     0  1  2  3  4
0  B  1  1  1  1
1  R  1  1  1  2
2  R  1  1  1  3
3  R  1  1  1  4
4  R  1  1  1  5
Results Using Gini Index:
Predicted values: 
['R' 'L' 'R' 'R' 'R' 'L' 'R' 'L' 'L' 'L' 'R' 'L' 'L' 'L' 'R' 'L' 'R' 'L'
 'L' 'R' 'L' 'R' 'L' 'L' 'R' 'L' 'L' 'L' 'R' 'L' 'L' 'L' 'R' 'L' 'L' 'L'
 'L' 'R' 'L' 'L' 'R' 'L' 'R' 'L' 'R' 'R' 'L' 'L' 'R' 'L' 'R' 'R' 'L' 'R'
 'R' 'L' 'R' 'R' 'L' 'L' 'R' 'R' 'L' 'L' 'L' 'L' 'L' 'R' 'R' 'L' 'L' 'R'
 'R' 'L' 'R' 'L' 'R' 'R' 'R' 'L' 'R' 'L' 'L' 'L' 'L' 'R' 'R' 'L' 'R' 'L'
 'R' 'R' 'L' 'L' 'L' 'R' 'R' 'L' 'L' 'L' 'R' 'L' 'R' 'R' 'R' 'R' 'R' 'R'
 'R' 'L' 'R' 'L' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'R' 'L' 'R' 'L' 'L' 'L' 'L'
 'L' 'L' 'L' 'R' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'L' 'L' 'R' 'L' 'R' 'L' 'R'
 'L' 'L' 'R' 'L' 'L' 'R' 'L' 'R' 'L' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'R' 'R'
 'L' 'L' 'R' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'L' 'R' 'L' 'L' 'L' 'L' 'R' 'R'
 'L' 'R' 'R' 'L' 'L' 'R' 'R' 'R']
Confusion Matrix:  [[ 0  6  7]
 [

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
