In [156]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from random import seed
from random import randrange
from csv import reader
from math import sqrt

# Load the Iris measurements into a DataFrame and append the class id
# as an integer 'label' column (kept last so row[-1] is the label).
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target.astype(int)

def TransformToList(data):
    """Convert a DataFrame into a list of rows, each row being a plain list.

    Rows are read positionally (``data.iloc[i]``), so the frame's index
    labels do not matter; the output keeps the frame's row and column order.
    """
    return [list(data.iloc[position]) for position in range(len(data))]

# Hold out 20% of the rows for testing (fixed random_state for a repeatable split).
train_data, test_data = train_test_split(df, train_size=0.8, random_state=123)

# Row-list versions of the full, training and test frames for the
# from-scratch implementation below.
dataset, train_list, test_list = (
    TransformToList(frame) for frame in (df, train_data, test_data)
)

In [158]:
def data_split(index, value, dataset):
    """Partition ``dataset`` on one feature and a threshold.

    Rows whose ``row[index]`` is strictly less than ``value`` go to the
    left group; every other row goes to the right group. Row order is
    preserved within each group.

    Returns:
        A ``(left, right)`` tuple of row lists.
    """
    below, at_or_above = [], []
    for record in dataset:
        # Route each row to exactly one side of the threshold.
        target = below if record[index] < value else at_or_above
        target.append(record)
    return below, at_or_above

In [159]:
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: The row groups produced by a split (left and right); the
            class label is the last element of every row.
        classes: The class values to score against.

    Returns:
        The sum over groups of ``(1 - sum(p_c ** 2)) * (group size / total)``,
        where ``p_c`` is the share of class ``c`` inside the group. Empty
        groups contribute nothing.
    """
    total = float(sum(len(members) for members in groups))
    weighted = 0.0
    for members in groups:  # left subtree, then right subtree
        count = float(len(members))
        if count == 0:
            # An empty side has no impurity and would divide by zero below.
            continue
        labels = [record[-1] for record in members]
        purity = 0.0
        for label in classes:
            share = labels.count(label) / count
            purity += share * share
        # Weight each side's impurity by its share of all samples.
        weighted += (1.0 - purity) * (count / total)
    return weighted

In [160]:
def get_split(dataset, n_features):
    """Find the lowest-Gini split of ``dataset`` over a random subset of features.

    Args:
        dataset: Non-empty list of rows; the last element of each row is the
            class label and the preceding elements are the features.
        n_features: How many distinct feature columns to draw as split
            candidates. Values larger than the number of feature columns are
            clamped to that number.

    Returns:
        A dict with keys ``'index'`` (chosen feature column), ``'value'``
        (threshold) and ``'groups'`` (the ``(left, right)`` row lists from
        ``data_split``). If no candidate was evaluated, the initial
        placeholders (999, 999, None) are returned.
    """
    class_values = list(set(row[-1] for row in dataset))
    best_index, best_value, best_gini, best_groups = 999, 999, 999, None

    # The last column is the label, so only the columns before it are features.
    n_available = len(dataset[0]) - 1
    # Clamp the request: asking for more distinct features than exist would
    # make the rejection-sampling loop below spin forever.
    n_wanted = min(n_features, n_available)

    features = list()
    while len(features) < n_wanted:  # draw distinct feature columns at random
        index = randrange(n_available)
        if index not in features:
            features.append(index)

    for index in features:
        # Each distinct threshold only needs scoring once; because the
        # comparison below is a strict '<', a repeated value could never
        # replace the first occurrence anyway.
        tried = set()
        for row in dataset:
            value = row[index]
            if value in tried:
                continue
            tried.add(value)
            groups = data_split(index, value, dataset)
            gini = gini_index(groups, class_values)
            if gini < best_gini:
                best_index = index
                best_value = value
                best_gini = gini
                best_groups = groups
    return {'index':best_index, 'value':best_value, 'groups':best_groups}

In [161]:
def to_terminal(group):
    """Return the leaf prediction for ``group``: its most frequent class label.

    The label is the last element of each row. ``group`` must be non-empty.
    """
    labels = []
    for record in group:
        labels.append(record[-1])
    # Majority vote over the distinct labels seen in this group.
    return max(set(labels), key=labels.count)
 
def split(node, max_depth, min_size, n_features, depth):
    """Grow the subtree under ``node`` in place.

    ``node`` is a dict produced by ``get_split``. Its ``'groups'`` entry is
    consumed and replaced by ``'left'`` and ``'right'`` entries, each either
    a class label (leaf) or another node dict.

    Args:
        node: Node dict to expand; mutated in place.
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A child with at most this many rows becomes a leaf.
        n_features: Number of candidate features passed on to ``get_split``.
        depth: Depth of ``node`` itself.
    """
    left, right = node['groups']
    del node['groups']

    # One side empty means the split separated nothing: both children
    # share a single leaf built from all the rows.
    if not (left and right):
        leaf = to_terminal(left + right)
        node['left'] = leaf
        node['right'] = leaf
        return

    # Depth limit reached: stop growing and turn both sides into leaves.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return

    # Left child first, then right; each side is either small enough to be
    # a leaf or is split again one level deeper.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
            continue
        node[side] = get_split(rows, n_features)
        split(node[side], max_depth, min_size, n_features, depth + 1)

In [162]:
def decision_tree(train, max_depth, min_size, n_features):
    """Build a decision tree from ``train`` and return its root node dict.

    The root split is chosen with ``get_split`` and then expanded in place
    by ``split``, starting at depth 1.
    """
    tree = get_split(train, n_features)
    split(tree, max_depth, min_size, n_features, depth=1)
    return tree
 
def predict(tree, row):
    """Return the class label that ``tree`` assigns to ``row``.

    Starting at the root, go left when the row's value for the node's
    feature is strictly below the node's threshold and right otherwise,
    until a child that is not a dict (a leaf label) is reached.
    """
    node = tree
    while True:
        goes_left = row[node['index']] < node['value']
        child = node['left'] if goes_left else node['right']
        if not isinstance(child, dict):
            # Anything that is not a node dict is a leaf label.
            return child
        node = child

In [163]:
def subsample(dataset, ratio):
    """Draw a bootstrap sample (with replacement) from ``dataset``.

    The sample holds ``round(len(dataset) * ratio)`` rows, each picked
    independently with ``randrange``; the same row may appear several times.
    """
    n_sample = round(len(dataset) * ratio)
    return [dataset[randrange(len(dataset))] for _ in range(n_sample)]
 
def bagging_predict(trees, row):
    """Return the majority vote of ``trees`` for ``row``.

    Every tree predicts a label via ``predict``; the most frequent label
    among those predictions is returned.
    """
    votes = []
    for tree in trees:
        votes.append(predict(tree, row))
    return max(set(votes), key=votes.count)
 
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    """Train a random forest on ``train`` and predict a label for each row of ``test``.

    Args:
        train: Training rows (label last).
        test: Rows to classify.
        max_depth: Depth limit for every tree.
        min_size: Row count at or below which a child becomes a leaf.
        sample_size: Fraction of ``train`` drawn (with replacement) per tree.
        n_trees: Number of trees to grow.
        n_features: Number of candidate features per split.

    Returns:
        A list with one majority-vote prediction per test row.
    """
    # Each tree is grown on its own bootstrap sample of the training rows.
    forest = [
        decision_tree(subsample(train, sample_size), max_depth, min_size, n_features)
        for _ in range(n_trees)
    ]
    predictions = []
    for row in test:
        predictions.append(bagging_predict(forest, row))
    return predictions

In [164]:
def accuracy_metric(actual, predicted):
    """Return the percentage (0-100) of predictions that equal the true labels.

    Labels are compared by position, not by index label, so pandas Series
    with a non-default index work as well as lists and NumPy arrays.

    Args:
        actual: Iterable of true labels.
        predicted: Iterable of predicted labels, in the same order.

    Returns:
        ``correct / len(actual) * 100.0`` as a float.

    Raises:
        ValueError: If the two inputs differ in length.
        ZeroDivisionError: If both inputs are empty.
    """
    # Materialise both inputs so they are iterated positionally; indexing a
    # pandas Series with actual[i] would look up index *labels* instead.
    truth = list(actual)
    guesses = list(predicted)
    if len(truth) != len(guesses):
        raise ValueError(
            "actual and predicted must have the same length "
            "(got %d and %d)" % (len(truth), len(guesses))
        )
    correct = sum(1 for expected, guess in zip(truth, guesses) if expected == guess)
    return correct / float(len(truth)) * 100.0

In [165]:
import numpy as np

# Feature matrix and label vector for the scikit-learn comparison, split
# with the same train_size and random_state as the row-list split above.
Y = df['label']
X = df.drop(columns=['label'])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=123, train_size=0.8)
# Keep the held-out labels as a plain ndarray.
Y_test = np.array(Y_test)

In [166]:
# Hyperparameters for the from-scratch random forest.
max_depth = 10
min_size = 1
sample_size = 0.8
n_trees = 100
# Candidate features per split: sqrt of the feature count (the last column is the label).
n_features = int(sqrt(len(dataset[0])-1))

# Single-tree baseline (uncomment to compare against the forest):
#tree = decision_tree(train_list, max_depth, min_size, n_features)
#pred = [predict(tree, row) for row in test_list]
#accuracy_metric(Y_test, pred)

# Seed Python's global RNG so the bootstrap samples and random feature
# draws (both use randrange) repeat on every run of this cell.
seed(123)
pred = random_forest(train_list, test_list, max_depth, min_size, sample_size, n_trees, n_features)
accuracy_metric(Y_test, pred)

96.66666666666667

In [167]:
from sklearn.ensemble import RandomForestClassifier

# Reference model: scikit-learn's random forest with the same depth limit,
# tree count and sqrt feature sampling as the from-scratch run.
rf = RandomForestClassifier(
    max_depth=10,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features='sqrt',
    n_estimators=100,
    random_state=123,  # fix bootstrap/feature sampling so the score is reproducible
)
rf.fit(X_train, Y_train)
test_pred = rf.predict(X_test)
accuracy_metric(Y_test, test_pred)

96.66666666666667