In [None]:
import codecademylib3
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Flags dataset, read straight from the UCI repository:
# https://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data
# Column names are supplied explicitly, so pandas treats every line of the
# file as data (header=None).
cols = [
    'name', 'landmass', 'zone', 'area', 'population', 'language', 'religion',
    'bars', 'stripes', 'colours',
    'red', 'green', 'blue', 'gold', 'white', 'black', 'orange',
    'mainhue', 'circles', 'crosses', 'saltires', 'quarters', 'sunstars',
    'crescent', 'triangle', 'icon', 'animate', 'text', 'topleft', 'botright',
]
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data",
    header=None,
    names=cols,
)

# Variable names to use as predictors (order here fixes the column order of
# every frame later selected with df[var]).
var = [
    'red', 'green', 'blue', 'gold', 'white', 'black', 'orange',
    'mainhue',
    'bars', 'stripes', 'circles', 'crosses', 'saltires', 'quarters',
    'sunstars', 'triangle', 'animate',
]

# Print number of countries by landmass, or continent
print("Number of countries by continent")
print(df['landmass'].value_counts())

# Create a new dataframe with only flags from Europe and Oceania
# (landmass codes 3 and 6)
df_36 = df[df['landmass'].isin([3, 6])]

# Print the average values of the predictors for Europe and Oceania.
# The predictor list may contain non-numeric columns (e.g. 'mainhue');
# numeric_only=True skips them explicitly. Without it pandas >= 2.0 raises a
# TypeError instead of silently dropping such columns as older versions did.
print("Average values of predictors for Europe and Oceania")
df_avg_36 = df_36.groupby('landmass')[var].mean(numeric_only=True)
print(df_avg_36)

# Binary target built over the full dataset: True where the landmass code is
# 3 or 6 (Europe / Oceania), False for every other landmass.
# NOTE(review): the rows from other continents are kept as the negative class
# rather than being dropped as in df_36 -- confirm this is the intended target.
labels = df['landmass'].isin([3, 6])

# Show the dtype of each predictor column (taken from the Europe/Oceania subset)
dtypes = df_36[var].dtypes
print("Data types of predictor variables:", dtypes, sep="\n")

# One-hot encode the categorical predictors; numeric columns pass through as-is
data = pd.get_dummies(df[var])
print("One hot encoding of predictor variables:", data.head(), sep="\n")

# Hold out 40% of the rows as a test set; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.4,
    random_state=1,
)

# Fit a decision tree for max_depth values 1-20 and save each model's accuracy
# on the held-out test split in acc_depth.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; without a seed, tied splits can be resolved
# differently from run to run and the accuracy curve is not reproducible.
acc_depth = []
depths = range(1, 21)
for depth in depths:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=1)
    dt.fit(X_train, y_train)
    acc = dt.score(X_test, y_test)
    acc_depth.append(acc)

# Plot the accuracy vs depth
plt.plot(depths, acc_depth)
plt.xlabel("Depth of Decision Tree")
plt.ylabel("Accuracy of Decision Tree")
plt.show()

# Find the largest accuracy and the depth at which it occurs.
# argmax returns the first maximum, i.e. the shallowest of the best trees.
idx = int(np.argmax(acc_depth))
print("IDX: %d" % idx)
# Look the depth up in `depths` instead of assuming the sweep starts at 1.
optimal_depth = depths[idx]
print("Optimal depth:")
print(optimal_depth)

# Refit the decision tree at the depth with the highest accuracy and plot it.
# The seed makes the refit tree reproducible (scikit-learn permutes features
# at each split, so unseeded fits can differ when splits tie).
dt = DecisionTreeClassifier(max_depth=optimal_depth, random_state=1)
dt.fit(X_train, y_train)
# Label the nodes with the columns the model was actually trained on. X_train
# comes from the one-hot encoded frame, whose columns differ in number and
# order from the raw predictor list, so passing that list would mislabel
# features or index past its end.
tree.plot_tree(dt, feature_names=list(X_train.columns))
plt.show()
# Sweep the cost-complexity pruning parameter: for each ccp_alpha in
# 0.01, 0.02, ..., 0.99 fit a tree at the chosen depth and record its accuracy
# on the test split in acc_pruned.
# random_state is pinned so the sweep is reproducible (scikit-learn permutes
# features at each split, so unseeded fits can differ when splits tie).
acc_pruned = []
ccp = [x/100 for x in range(1, 100)]
for value in ccp:
    dt = DecisionTreeClassifier(max_depth=optimal_depth, ccp_alpha=value, random_state=1)
    dt.fit(X_train, y_train)
    acc = dt.score(X_test, y_test)
    acc_pruned.append(acc)

# Plot the accuracy vs ccp_alpha
plt.figure()
plt.plot(ccp, acc_pruned)
plt.xlabel("CCP alpha Value")
plt.ylabel("Accuracy")
plt.title("Accuracy vs CCP Alpha Value")
plt.show()

# Find the largest accuracy and the ccp value at which it occurs
# (list.index returns the first match, i.e. the smallest such alpha).
largest = np.max(acc_pruned)
optimal_ccp = ccp[acc_pruned.index(largest)]
print("Optimal CCP:")
print(optimal_ccp)

# Fit a decision tree model with the values for max_depth and ccp_alpha found above.
# The seed makes the final tree reproducible (scikit-learn permutes features
# at each split, so unseeded fits can differ when splits tie).
dt = DecisionTreeClassifier(max_depth=optimal_depth, ccp_alpha=optimal_ccp, random_state=1)
dt.fit(X_train, y_train)

# Plot the final decision tree.
# Node labels come from the training columns: X_train is the one-hot encoded
# frame, whose columns differ in number and order from the raw predictor list,
# so passing that list would mislabel features or index past its end.
plt.figure()
tree.plot_tree(dt, feature_names=list(X_train.columns))
plt.show()
