In [43]:
#Run this cell
#Importing necessary libraries 
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt
import json
#ans has one slot per answer (indices 0-4); later cells fill the slots and the final cell dumps the list to ans.json
ans=[None]*5

In [45]:
#Download Dataset from https://www.kaggle.com/uciml/zoo-animal-classification
#Load the zoo table from the working directory; the feature/target, entropy and information-gain cells all read from `data`
data = pd.read_csv('zoo.csv')

In [46]:
#Preview the first rows to check that the columns loaded as expected
data.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [47]:
#Load the class lookup table (class number, class name and member animals) and preview it
#NOTE(review): `animals` is not referenced by any other cell visible in this notebook
animals = pd.read_csv('class.csv')
animals.head()

Unnamed: 0,Class_Number,Number_Of_Animal_Species_In_Class,Class_Type,Animal_Names
0,1,41,Mammal,"aardvark, antelope, bear, boar, buffalo, calf,..."
1,2,20,Bird,"chicken, crow, dove, duck, flamingo, gull, haw..."
2,3,5,Reptile,"pitviper, seasnake, slowworm, tortoise, tuatara"
3,4,13,Fish,"bass, carp, catfish, chub, dogfish, haddock, h..."
4,5,4,Amphibian,"frog, frog, newt, toad"


In [48]:
#Separate the loaded table into the model inputs (X) and the label column (y), both as NumPy arrays
#'class_type' is the label; 'animal_name' only identifies the row, so it is not a good feature to split the data on
non_feature_columns = ['class_type', 'animal_name']
X = np.array(data.loc[:, ~data.columns.isin(non_feature_columns)])
#y is selected with a column mask, so it stays a single-column 2-D array
y = np.array(data.loc[:, data.columns == 'class_type'])

In [131]:
#Write a function to find the entropy on a split "target_col"
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Anything ``np.array`` accepts (list, Series, single-column DataFrame,
        ndarray). The input is flattened, so every element counts as one
        observation. Values must be sortable by ``np.unique``.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. Empty and single-valued inputs give ``0.0``.
    """
    values = np.array(target_col).ravel()
    if values.size == 0:
        #Nothing to measure; also avoids dividing the counts by zero
        return np.float64(0.0)
    #Occurrences of each distinct value (replaces the manual dict/list bookkeeping)
    _, counts = np.unique(values, return_counts=True)
    probabilities = counts / values.size
    #Only observed values are counted, so every probability is > 0 and log2 stays finite
    bits = -np.sum(probabilities * np.log2(probabilities))
    #Adding 0.0 turns the -0.0 produced for a single-valued column into a plain 0.0
    return bits + 0.0

In [132]:
#Names of the 16 attribute columns, listed in the order they appear in the zoo table
feature_names=[
    'hair','feathers','eggs','milk','airborne','aquatic','predator','toothed',
    'backbone','breathes','venomous','fins','legs','tail','domestic','catsize',
]
#Entropy of each attribute column taken on its own, keyed by attribute name
entropy_value = {
    feature: entropy(data.loc[:, data.columns == feature])
    for feature in feature_names
}
print(entropy_value)

{'hair': 0.9840304711717018, 'feathers': 0.7179499765002912, 'eggs': 0.9794662187017298, 'milk': 0.9743197211096903, 'airborne': 0.7910662980902585, 'aquatic': 0.9396846718728562, 'predator': 0.9914266810680206, 'toothed': 0.9685867165455516, 'backbone': 0.6761627418829198, 'breathes': 0.7374895672137456, 'venomous': 0.3993820824245975, 'fins': 0.653839880626333, 'legs': 2.0338113440641234, 'tail': 0.8228368841492257, 'domestic': 0.5538976334852962, 'catsize': 0.9880162151534646}


In [133]:
#Find the entropy of the feature "toothed"
#Answer slot 0: looked up from the per-feature entropy dictionary built in the previous cell
ans[0]=entropy_value["toothed"]

In [148]:
#Write a function to calculate Information Gain on a split attribute and a target column
def InfoGain(data,split_attribute_name,target_name="class"):
    """Return the information gain from splitting ``target_name`` on ``split_attribute_name``.

    The gain is the entropy of the target column minus the weighted average
    entropy of the target inside each group of rows that share one value of
    the split attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both columns.
    split_attribute_name : str
        Column whose distinct values partition the rows.
    target_name : str, default "class"
        Column whose entropy is measured. The zoo table in this notebook has
        no "class" column (its label is "class_type"), so pass the name
        explicitly there.

    Returns
    -------
    numpy.float64
        ``entropy(target) - sum(weight_v * entropy(target | split == v))``.

    Raises
    ------
    KeyError
        If either column name is missing from ``data``. Previously a missing
        split column silently returned the plain target entropy.
    """
    #Validate both names up front so a typo cannot produce a plausible-looking number
    for column_name in (split_attribute_name, target_name):
        if column_name not in data.columns:
            raise KeyError("column {!r} not found in data".format(column_name))
    split_values = np.array(data.loc[:, data.columns == split_attribute_name]).ravel()
    target_values = np.array(data.loc[:, data.columns == target_name]).ravel()
    #Calculate the entropy of the total dataset
    original_entropy = entropy(target_values)
    #Bucket the target values by the split-attribute value found on the same row
    partitions = {}
    for attribute, label in zip(split_values, target_values):
        partitions.setdefault(attribute, []).append(label)
    #Calculate the weighted entropy: each bucket is weighted by its share of the rows
    total_size = split_values.size
    weights = np.array([len(labels)/total_size for labels in partitions.values()])
    entropies = np.array([entropy(labels) for labels in partitions.values()])
    #Calculate the information gain
    info_gain = original_entropy - np.sum(np.multiply(entropies, weights))
    return info_gain

In [150]:
#Find the information gain having split attribute "hair" and the target feature name "milk"
#Answer slot 1; the target name is passed explicitly because InfoGain's default ("class") is not a column of `data`
ans[1]=InfoGain(data, "hair", "milk")

In [154]:
#Information gain for every other attribute when the table is partitioned on "milk"
IG = {
    target: InfoGain(data, "milk", target)
    for target in feature_names
    if target != 'milk'
}
#Report one line per target attribute, in feature_names order
report_line = "Information Gain taking milk as split attribute and {target_name} as target feature = {val}"
for target, gain in IG.items():
    print(report_line.format(target_name = target, val = gain))
        

Information Gain taking milk as split attribute and hair as target feature = 0.6599660577558698
Information Gain taking milk as split attribute and feathers as target feature = 0.17242769884415887
Information Gain taking milk as split attribute and eggs as target feature = 0.7870598185734241
Information Gain taking milk as split attribute and airborne as target feature = 0.11370352314621812
Information Gain taking milk as split attribute and aquatic as target feature = 0.10181386403185955
Information Gain taking milk as split attribute and predator as target feature = 0.0006367772440212249
Information Gain taking milk as split attribute and toothed as target feature = 0.3465412540071715
Information Gain taking milk as split attribute and backbone as target feature = 0.15262359382508262
Information Gain taking milk as split attribute and breathes as target feature = 0.18259765312929555
Information Gain taking milk as split attribute and venomous as target feature = 0.06284178150207947
I

In [49]:
#Import Decision Tree Classifier from sklearn
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
#Split the given data into 80 percent training data and 20 percent testing data
#A fixed random_state keeps the split, and therefore the reported accuracy, identical across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [50]:
#Fit the given data
X_train = np.array(X_train)
y_train = np.array(y_train)
#Seed the classifier: ties between equally good splits are otherwise broken randomly,
#so the fitted tree (and the answers derived from it) could differ between runs
tree = DecisionTreeClassifier(random_state = 42)
tree.fit(X_train, y_train)

In [106]:
#Predict the held-out rows and store the accuracy as a percentage in answer slot 2
from sklearn.metrics import accuracy_score
X_test, y_test = np.array(X_test), np.array(y_test)
y_pred = tree.predict(X_test)
#accuracy_score returns a fraction in [0, 1]; scale it to percent
accuracy_fraction = accuracy_score(y_test, y_pred)
ans[2]=accuracy_fraction*100

100.0

In [119]:
#Run this cell to visualize the decision tree
#io.StringIO replaces sklearn.externals.six.StringIO, which does not exist in the scikit-learn release installed in this notebook
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

#NOTE: graph.create_png() runs the Graphviz executables; the pip packages do not provide them,
#so Graphviz itself must be installed and on PATH, otherwise pydotplus raises InvocationException
dot_data = StringIO()
export_graphviz(tree, out_file=dot_data, feature_names=feature_names,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

InvocationException: GraphViz's executables not found

In [155]:
#Use sklearn to make a classification report and a confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
#Both helpers take (y_true, y_pred) in that order; passing them swapped exchanges precision and recall in the report
cr = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [None]:
#Find the recall,f1-score for class type '3'
from sklearn.metrics import f1_score, recall_score
#labels=[3] with average=None yields a one-element array holding the score for class 3 only;
#zero_division=0 reports 0 instead of warning when class 3 is absent from the test split
recall_3 = float(recall_score(y_test, y_pred, labels=[3], average=None, zero_division=0)[0])
f1_3 = float(f1_score(y_test, y_pred, labels=[3], average=None, zero_division=0)[0])
#NOTE(review): the expected layout of ans[3] is not stated in this notebook; [recall, f1] mirrors the
#[mae,rmse] list requested for ans[4] -- confirm against the grader
ans[3]=[recall_3, f1_3]

In [None]:
#Calculate Mean Absolute Error,Mean Squared Error and Root Mean Squared Error
from sklearn.metrics import mean_absolute_error, mean_squared_error
#The sklearn helpers reconcile the column-shaped y_test with the flat y_pred; a plain
#NumPy subtraction of the two would broadcast to an n-by-n matrix instead
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [None]:
#Find the mean absolute error and root mean square error, save them in a list [mae,rmse]
from sklearn.metrics import mean_absolute_error, mean_squared_error
#Both metrics are computed inline from y_test / y_pred and cast to plain floats for json.dump
ans[4]=[float(mean_absolute_error(y_test, y_pred)),
        float(np.sqrt(mean_squared_error(y_test, y_pred)))]

In [None]:
ans = [item for item in ans]
with open('ans.json', 'w') as f:
    json.dump(ans, f)
! ~/submit

In [None]:
pip install sklearn

In [None]:
#Upgrade pip itself before installing the packages below
pip install --upgrade pip

In [None]:
pip install sklearn

In [8]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp310-cp310-win_amd64.whl (8.3 MB)
     ---------------------------------------- 8.3/8.3 MB 4.0 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.3.2
  Downloading scipy-1.10.0-cp310-cp310-win_amd64.whl (42.5 MB)
     ---------------------------------------- 42.5/42.5 MB 4.5 MB/s eta 0:00:00
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     -------------------------------------- 298.0/298.0 kB 4.6 MB/s eta 0:00:00
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.1 scipy-1.10.0 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [107]:
#six is imported by the cell that registers it in sys.modules as 'sklearn.externals.six'
pip install six

Note: you may need to restart the kernel to use updated packages.


In [110]:
!pip install --upgrade scikit-learn==0.20.3

Collecting scikit-learn==0.20.3
  Downloading scikit-learn-0.20.3.tar.gz (11.8 MB)
     ---------------------------------------- 11.8/11.8 MB 3.4 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.1
    Uninstalling scikit-learn-1.2.1:
      Successfully uninstalled scikit-learn-1.2.1
  Running setup.py install for scikit-learn: started
  Running setup.py install for scikit-learn: finished with status 'error'
  Rolling back uninstall of scikit-learn
  Moving to c:\users\user\appdata\local\programs\python\python310\lib\site-packages\scikit_learn-1.2.1.dist-info\
   from C:\Users\User\AppData\Local\Programs\Python\Python310\Lib\site-packages\~cikit_learn-1.2.1.dist-info
  Moving to c:\users\user\appdata\local\programs\python\python310\lib\site-packages\sklearn\
   from C:\Users\User\AppData\L

  DEPRECATION: scikit-learn is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559
  error: subprocess-exited-with-error
  
  Running setup.py install for scikit-learn did not run successfully.
  exit code: 1
  
  [682 lines of output]
  Partial import of sklearn during the build process.
  INFO: blas_opt_info:
  INFO: blas_armpl_info:
  INFO: No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
  INFO: customize MSVCCompiler
  INFO:   libraries armpl_lp64_mp not found in ['C:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python310\\lib', 'C:\\', 'C:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python310\\libs']
  INFO:   NOT AVAILABLE
  INFO:
  INFO: blas_mkl_info:
  INFO:   libr

  creating build\lib.win-amd64-3.10\sklearn\feature_selection
  copying sklearn\feature_selection\base.py -> build\lib.win-amd64-3.10\sklearn\feature_selection
  copying sklearn\feature_selection\from_model.py -> build\lib.win-amd64-3.10\sklearn\feature_selection
  copying sklearn\feature_selection\mutual_info_.py -> build\lib.win-amd64-3.10\sklearn\feature_selection
  copying sklearn\feature_selection\rfe.py -> build\lib.win-amd64-3.10\sklearn\feature_selection
  copying sklearn\feature_selection\univariate_selection.py -> build\lib.win-amd64-3.10\sklearn\feature_selection
  copying sklearn\feature_selection\variance_threshold.py -> build\lib.win-amd64-3.10\sklearn\feature_selection
  copying sklearn\feature_selection\__init__.py -> build\lib.win-amd64-3.10\sklearn\feature_selection
  creating build\lib.win-amd64-3.10\sklearn\feature_selection\tests
  copying sklearn\feature_selection\tests\test_base.py -> build\lib.win-amd64-3.10\sklearn\feature_selection/tests
  copying sklearn\feat

  copying sklearn\externals\joblib\hashing.py -> build\lib.win-amd64-3.10\sklearn\externals\joblib
  copying sklearn\externals\joblib\logger.py -> build\lib.win-amd64-3.10\sklearn\externals\joblib
  copying sklearn\externals\joblib\memory.py -> build\lib.win-amd64-3.10\sklearn\externals\joblib
  copying sklearn\externals\joblib\my_exceptions.py -> build\lib.win-amd64-3.10\sklearn\externals\joblib
  copying sklearn\externals\joblib\numpy_pickle.py -> build\lib.win-amd64-3.10\sklearn\externals\joblib
  copying sklearn\externals\joblib\numpy_pickle_compat.py -> build\lib.win-amd64-3.10\sklearn\externals\joblib
  copying sklearn\externals\joblib\numpy_pickle_utils.py -> build\lib.win-amd64-3.10\sklearn\externals\joblib
  copying sklearn\externals\joblib\parallel.py -> build\lib.win-amd64-3.10\sklearn\externals\joblib
  copying sklearn\externals\joblib\pool.py -> build\lib.win-amd64-3.10\sklearn\externals\joblib
  copying sklearn\externals\joblib\testing.py -> build\lib.win-amd64-3.10\sklea

In [112]:
#Compatibility shim: register the standalone `six` package under the module path 'sklearn.externals.six'
#so that `from sklearn.externals.six import StringIO` in the visualisation cell can be resolved.
#This must run before that import is executed.
import six
import sys
sys.modules['sklearn.externals.six'] = six

In [114]:
#pydotplus is imported by the tree-visualisation cell (graph_from_dot_data / create_png)
pip install pydotplus

Collecting pydotplus
  Downloading pydotplus-2.0.2.tar.gz (278 kB)
     ------------------------------------ 278.7/278.7 kB 818.4 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: pydotplus
  Running setup.py install for pydotplus: started
  Running setup.py install for pydotplus: finished with status 'done'
Successfully installed pydotplus-2.0.2
Note: you may need to restart the kernel to use updated packages.


  DEPRECATION: pydotplus is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559


In [116]:
#NOTE(review): this installs only the Python `graphviz` package. The visualisation cell still failed with
#"GraphViz's executables not found", so the Graphviz program itself has to be installed separately and put on PATH
pip install graphviz

Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     -------------------------------------- 47.0/47.0 kB 261.0 kB/s eta 0:00:00
Installing collected packages: graphviz
Successfully installed graphviz-0.20.1
Note: you may need to restart the kernel to use updated packages.


    animal_name  hair  feathers  eggs  milk  airborne  aquatic  predator  \
0      aardvark     1         0     0     1         0        0         1   
1      antelope     1         0     0     1         0        0         0   
2          bass     0         0     1     0         0        1         1   
3          bear     1         0     0     1         0        0         1   
4          boar     1         0     0     1         0        0         1   
..          ...   ...       ...   ...   ...       ...      ...       ...   
96      wallaby     1         0     0     1         0        0         0   
97         wasp     1         0     1     0         1        0         0   
98         wolf     1         0     0     1         0        0         1   
99         worm     0         0     1     0         0        0         0   
100        wren     0         1     1     0         1        0         0   

     toothed  backbone  breathes  venomous  fins  legs  tail  domestic  \
0          1 

0      0
1      0
2      0
3      0
4      0
      ..
96     0
97     1
98     0
99     0
100    0
Name: venomous, Length: 101, dtype: int64
