In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Load the zoo table from the shared data folder
df = pd.read_csv(filepath_or_buffer="shared/zoo.csv")

In [2]:
# Display the loaded frame to inspect its columns and a sample of rows
df

Unnamed: 0,animal_name,species,class_type,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
0,aardvark,Mammal,1,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
1,antelope,Mammal,1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1
2,bass,Fish,4,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0
3,bear,Mammal,1,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
4,boar,Mammal,1,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,vulture,Bird,2,0,1,1,0,1,0,1,0,1,1,0,0,2,1,0,1
96,wallaby,Mammal,1,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,1
97,wasp,Bug,6,1,0,1,0,1,0,0,0,0,1,1,0,6,0,0,0
98,wolf,Mammal,1,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1


In [3]:
# Shift class_type from the 1-based codes stored in the CSV to 0-based labels (0 to 6).
# animal_name is NOT dropped here; it is excluded later, when the predictors X are built.
# The guard keeps this cell idempotent: re-running it must not shift the labels a
# second time (which would produce a -1 label).
if df['class_type'].min() == 1:
    df['class_type'] = df['class_type'] - 1
df

Unnamed: 0,animal_name,species,class_type,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
0,aardvark,Mammal,0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
1,antelope,Mammal,0,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1
2,bass,Fish,3,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0
3,bear,Mammal,0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
4,boar,Mammal,0,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,vulture,Bird,1,0,1,1,0,1,0,1,0,1,1,0,0,2,1,0,1
96,wallaby,Mammal,0,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,1
97,wasp,Bug,5,1,0,1,0,1,0,0,0,0,1,1,0,6,0,0,0
98,wolf,Mammal,0,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1


##### 我们要预测的是每个动物的分类，所以目标变量（target variable）是 class_type。animal_name 和 species 不作为预测器（predictors）：animal_name 本身就几乎包含了关于物种的所有信息；而从上表来看，species 只是 class_type 对应的文字标签，用它来预测 class_type 会造成标签泄漏。除这两列和 class_type 之外，其余的列都是预测器。

In [4]:
# Target: the 0-based class label of each animal
y = df['class_type']

# Predictors: every remaining column except the target itself and the two
# text columns (animal_name, species)
X = df.drop(
    ['class_type', 'animal_name', 'species'],
    axis='columns',
)

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [6]:
# Check the dtype of every predictor column in the training split
X_train.dtypes

hair        int64
feathers    int64
eggs        int64
milk        int64
airborne    int64
aquatic     int64
predator    int64
toothed     int64
backbone    int64
breathes    int64
venomous    int64
fins        int64
legs        int64
tail        int64
domestic    int64
catsize     int64
dtype: object

In [7]:
# Build and train the model.
# class_type takes the labels 0 to 6, i.e. this is a multi-class problem, so the
# objective is declared as 'multi:softprob' rather than 'binary:logistic'.
# NOTE(review): the scikit-learn wrapper appears to switch to a multi-class objective
# on its own when it sees more than two classes, so the previous setting was most
# likely being overridden silently — confirm against the installed xgboost version.
model = XGBClassifier(use_label_encoder=False, objective='multi:softprob')
model.fit(X_train, y_train)



In [8]:
# Predict a class label for every held-out animal
y_pred = model.predict(X_test)

# Fraction of held-out animals whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [9]:
# Show the test-set accuracy computed in the previous cell
accuracy

0.95

In [11]:
# Describe the new animal by feature NAME instead of by position, so the values
# cannot silently drift out of step with the training columns.
new_animal_traits = {
    'hair': 0,
    'feathers': 1,
    'eggs': 1,
    'milk': 0,
    'airborne': 1,
    'aquatic': 0,
    'predator': 0,
    'toothed': 0,
    'backbone': 1,
    'breathes': 1,
    'venomous': 0,
    'fins': 0,
    'legs': 2,
    'tail': 1,
    'domestic': 0,
    'catsize': 0,
}
# Selecting by X_train.columns puts the features in training order and raises a
# KeyError if any predictor is missing from the description above.
new_animal = pd.DataFrame([new_animal_traits])[X_train.columns]
# Use the model to predict the 0-based class_type of the new animal
# (not the species column, which was excluded from training)
prediction = model.predict(new_animal)
prediction

array([1])