# DECISION TREE

# LOAD THE DEPENDANCIES

## Pandas

In [49]:
import pandas as pd
from pandas import set_option
from pandas.plotting import scatter_matrix
from pandas_profiling import ProfileReport

## Numpy

In [50]:
import numpy as np
from numpy import set_printoptions

## Matplotlib & Seaborn

In [51]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import graphviz

## Sklearn

In [52]:
# from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
# from sklearn.tree.export import 
from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score,recall_score
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA 

## Math & Statistics

In [53]:
from scipy import stats
from scipy.stats import norm
import math

## System

In [54]:
import os
import sys
import pprint
# os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

## notebook widgets

In [55]:
# import ipywidgets as widgets
from IPython.display import Image
from IPython.display import display, Math, Latex
from IPython.core.interactiveshell import InteractiveShell  
InteractiveShell.ast_node_interactivity = "all"

In [56]:
import configparser

# FUNCTIONS

## Label Encoding

In [57]:
def label_encoding(dataset,input_headers):
    
    for i in input_headers:
        
        the_data_type=dataset[i].dtype.name
        if (the_data_type=='object'):
            lable_enc=preprocessing.LabelEncoder()
            lable_enc.fit(dataset[i])
            labels=lable_enc.classes_   #this is an array
            labels=list(labels) #converting the labels array to a list
            print(labels)
            dataset[i]=lable_enc.transform(dataset[i])

            return labels
    
        else:
            c=list(np.unique(dataset[i]))
            return [str(x) for x in c]

## Feature Scaling

In [58]:
def feature_scaling(X_train,X_test):
    sc_X=StandardScaler()
    X_train=sc_X.fit_transform(X=X_train,y=None)
    X_test=sc_X.fit_transform(X=X_test,y=None)

    print(sc_X.fit(X_train))
    print(X_train[0:5])
    
    
    
    return X_train, X_test

## Visualization

### Plot the data space (scatter)

In [59]:
def plot_of_data_space(dataset,data,labels,input_headers):
    
    
    xx_1=pd.DataFrame(data[:,0]) 
    xx_2=pd.DataFrame(data[:,1]) 
    y=pd.DataFrame(labels)
    
   
    plt.figure(figsize=(15,10)) 
    b=plt.scatter(xx_1[y==0],xx_2[y==0],color='b') 
    r=plt.scatter(xx_1[y==1],xx_2[y==1],color='r')
    g=plt.scatter(xx_1[y==2],xx_2[y==2],color='g') 
    bl=plt.scatter(xx_1[y==3],xx_2[y==3],color='black')
    
    
#     for i in range(0,len(xx_1)):
#         print(y[i])
#         if (y[i]==0):
#             a=plt.scatter(xx_1[i],xx_2[i],marker='o',color='blue',s=30)
#         if (y[i]==1):
#             b=plt.scatter(xx_1[i],xx_2[i],marker='o',color='red',s=30)
#         if (y[i]==2):
#             c=plt.scatter(xx_1[i],xx_2[i],marker='o',color='green',s=30)
#         if (y[i]==3):
#             d=plt.scatter(xx_1[i],xx_2[i],marker='o',color='black',s=30)
        
#     plt.xlabel(f1);plt.ylabel(f2);
#     plt.legend((a,b),tuple(np.unique(labels)))

    plt.xlabel(input_headers[0])
    plt.ylabel(input_headers[1])

    plt.grid()
    plt.legend((b,r,g,bl),tuple(np.unique(labels)))
    plt.show()


### Feature Distributions (histograms)

In [60]:
def feature_distributions(df,target_header,*args):
    
    
    data=df.drop(target_header,axis=1,inplace=False)

    num_plot_rows=len(data.columns)

    print (f'classes {classes}')
    
    label_encoder = preprocessing.LabelEncoder()
    df[target_header]=label_encoder.fit_transform(df[target_header])
    labels=label_encoder.classes_   #this is an array
    labels=list(labels) #converting the labels array to a list
    print (f'labels {labels}')

    fig = plt.figure(figsize = (20,num_plot_rows*4))
    j = 0

    ax=[]
    colors=['b','r','g','black']
    for i in data.columns:
        plt.subplot(num_plot_rows, 4, j+1)
        j += 1
        for k in range(len(labels)):
    #         print(k)
            a=sns.distplot(data[i][df[target_header]==k], color=colors[k], label = str(labels[k])+classes[k]);
            ax.append(a)
        plt.legend(loc='best')
    
    fig.suptitle(target_header+ ' Data Analysis')
    fig.tight_layout()
    # fig.subplots_adjust(top=0.95)
    plt.show()










## Preprocessing: Splitting the dataset

In [61]:
def split_the_dataset(dataset,input_headers,target_header):
    
    X=dataset[input_headers]
    y=dataset[target_header]
    
    X.head()
    
    return X,y


## Replacing Zeros

In [62]:
def replacing_zeros(dataset,the_headers):
    """Function used to remove zeros from numeric features when 0 is not practical"""

    for header in the_headers:
        dataset[header]=dataset[header].replace(0,np.nan)
        mean=int(dataset[header].mean(skipna=True))
        dataset[header]=dataset[header].replace(np.nan,mean)
        
    return dataset

## Feature Correlations

In [63]:
def correlation_matrix(dataset,input_headers,target_header):
    
    feature_matrix=dataset[input_headers]
    corr=feature_matrix.corr()
    corr
    
    plt.figure(figsize=(10,10))
    corr_plot=sns.heatmap(corr,cmap="Reds",annot=True)
    
    corr_pair=sns.pairplot(dataset,hue=target_header[0])
    plt.show()
    
    return corr,corr_plot,corr_pair 

## Drop Unwanted Features

In [64]:


def feature_drop(dataset,headers_to_drop):
    
    dataset.drop(labels=headers_to_drop,axis=1,inplace=True)
    dataset.head()

## Principal Component Analysis (PCA)

In [65]:
def pca(dataset,input_headers,target_header,*args):
    
    feature_matrix=dataset[input_headers]
    model = PCA(n_components=2)            # 2. Instantiate the model with hyperparameters
    model.fit(feature_matrix)  # 3. Fit to data. Notice y is not specified!
    X_2D = model.transform(feature_matrix)         # 4. Transform the data to two dimensions


    dataset['PCA1'] = X_2D[:, 0]
    dataset['PCA2']= X_2D[:, 1]

    sns.lmplot("PCA1", "PCA2", hue=target_header[0], data=dataset, fit_reg=False);
    
    return dataset['PCA1'],dataset['PCA2']
    
#     sns.distplot(dataset['PCA1'][dataset[target_header[0]]==0], color='b', label = '0')
#     sns.distplot(dataset['PCA1'][dataset[target_header[0]]==1], color='r', label = '1')
#     # sns.distplot(df['PCA1'][df[target_header]==2], color='r', label = '2')
#     plt.legend(loc='best')
#     plt.show()

ACQUIRE PROGRAM PARAMETERS & SETTINGS FROM Model_Parameters.ini FILE

In [66]:
def get_program_settings():
    config = configparser.ConfigParser()
    config.read('C:/Users/Crystal/Desktop/Programs/machine_learning/Machine-Learning-Classification-scikit-learn/model_parameters.ini')

    para=dict()
    para['location']=config['Data Select']['data file']
    para['dataset report']=config['Report Option']['dataset report']
    para['feature report']=config['Report Option']['feature report']
    para['selected features']=(config['Data Select']['features']).split(',')
    para['target']=config['Data Select']['target']

    para['test size']=float(config['Train-Test Data']['test size'])
    para['random state']=int(config['Train-Test Data']['random state'])

    para['cv']=int(config['Cross Validation']['cv'])



    print(para)

    return para



# MAIN PROGRAM

## Get Data

In [67]:
if __name__ == "__main__":

    # parameters=get_program_settings()
    # location=parameters['location']
    
    # dataset=pd.read_csv(location)

    dataset=load_iris()
    
    # dataset.info()
    # dataset.head()
    # dataset.describe()
    
    


In [68]:
dataset

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [None]:
# if (parameters['dataset report']=='YES'):
    dataset_report = ProfileReport(dataset,minimal=True)
    dataset_report.to_file(output_file='all_data_eda.html')

## Drop unwanted features (columns)

In [None]:
all_cols=list(dataset.columns)
all_cols

In [None]:
the_target=[]
the_target.append(parameters['target'])
selected_cols=parameters['selected features']+the_target
# selected_cols=['time','ejection_fraction',the_target[0]]
selected_cols

In [None]:
drop_these=list(set(all_cols).difference(set(selected_cols)))
drop_these

In [None]:
drop_columns=drop_these
if (drop_columns!=[]):
    q1=input('Do you need to drop any columns in the dataset?')
    if (q1.lower()=='y'):
        feature_drop(dataset,drop_columns)

In [None]:
dataset.head()

## Selecting inputs and targets

In [None]:
target_header=the_target
selected_cols.remove(target_header[0])
input_headers=selected_cols
print(target_header)
print(input_headers)
target_label=label_encoding(dataset,target_header)

classes=target_label
print (classes)
test_label=label_encoding(dataset,input_headers)

dataset=dataset[input_headers+target_header]
X,y=split_the_dataset(dataset,input_headers,target_header)

print(X.head())

## Replace zeros with the mean where needed.


In [None]:
rz=input('Do you need to replace any zeros in the dataset?')
if (rz.lower()=='y'):
    the_headers=b
    dataset=replacing_zeros(dataset,the_headers)
    dataset.head()

## Data Visualizations

### Data space

In [None]:
if (X.values.shape[1]==2):
    plot_of_data_space(dataset,X.values,y.values,input_headers)
else:
    x1,x2=pca(dataset,input_headers,target_header)
target_header[0]



### Feature distributions

In [None]:
dataset.head()
feature_distributions(dataset,target_header[0],classes)

# X.head()

## Correlation Matrix

In [None]:
correlation_matrix(dataset,input_headers,target_header)
y.head()

## Splitting the Train-Test data

In [None]:
X.head()

In [None]:
if (parameters['feature report']=='YES'):
    feature_report = ProfileReport(X)
    feature_report.to_file(output_file='feature_eda.html')

In [None]:
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=parameters['test size'],random_state=parameters['random state'])
ytest.shape

In [None]:
Xtest[0:5]

## Scale the data  

In [None]:
Xtrain, Xtest=feature_scaling(Xtrain,Xtest)
ytest.head()

## Decision Tree Model

In [None]:
model = tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Fit model to training data

In [None]:
model.fit(Xtrain,ytrain)

## Model prediction on test data

In [None]:
y_model=model.predict(Xtest)
y_model

In [None]:
# y_model_prob=model.predict_proba(Xtest)
# y_model_prob

## Model score & performance

In [None]:
accuracy_score(ytest,y_model)

In [None]:
recall_score(ytest, y_model,average=None)

In [None]:
precision_score(ytest, y_model,average=None)

### Confusion Matrix

In [None]:
cm=confusion_matrix(ytest, y_model)

In [None]:
cm

In [None]:
fig, ax = plt.subplots()
cmap=plt.cm.Blues
im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
ax.figure.colorbar(im, ax=ax)

ax.set(xticks=np.arange(cm.shape[1]),
       yticks=np.arange(cm.shape[0]),
       xticklabels=classes,
       yticklabels=classes,
       title="confusion",
       ylabel='True label',
       xlabel='Predicted label')



# Loop over data dimensions and create text annotations.
normalize=False
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, format(cm[i, j], fmt),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()



plt.show()

### Cross Validation

In [None]:
score=cross_val_score(model,X,y,cv=parameters['cv'])

In [None]:
score

In [None]:
score.mean()

In [None]:
sns.boxplot(x=score,orient='v')
plt.show()

In [None]:
sns.heatmap(cm,square=True,annot=True,cbar=False)
plt.xlabel('predicted value')
plt.ylabel('true value')
plt.show()

## Decision Tree Chart

In [None]:
dot_data=tree.export_graphviz(model, out_file=None, 
                     feature_names=input_headers,  
                     class_names=classes,  
                     filled=True, rounded=True,  
                     special_characters=True) 
graph = graphviz.Source(dot_data) 

In [None]:
graph.render(r'C:\Users\Crystal\Desktop\Programs\machine_learning\results')

In [None]:
# graph

In [None]:
# decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
# decision_tree = decision_tree.fit(Xtrain,ytrain)
# r = export_text(model, feature_names=input_headers)
# print(r)

# EXTRAS