# Classification Problem on Urban Tree Dataset

## PreProcessing

### Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import time
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score, roc_curve, auc,confusion_matrix, RocCurveDisplay

### Importing the Data set, Cleaning Data and Handling Missing Values

In [None]:
df = pd.read_csv('sgl-arbres-urbains-wgs84.csv')

In [None]:
# Remove identifier / free-text observation columns before modelling.
columns_to_drop = [
    "ID_ARBRE", "commune", "controle", "insecte_collet", "insecte_tronc",
    "insecte_houppier", "circonference (en cm)", "observation_collet",
    "observation_tronc", "observation_houppier",
]
df = df.drop(columns=columns_to_drop)

# Handling Missing Values
# Every column listed here is filled with its most frequent value (mode).
# NOTE(review): this list includes the label `classification_diagnostic`, so
# rows with a missing label are assigned the majority class -- confirm intended.
mode_imputed_columns = [
    "cote_voirie", "espece_arbre", "situation", "plaie_collet",
    "champignon_tronc", "fissure_tronc", "rejet_tronc", "plaie_tronc",
    "champignon_houppier", "fissure_houppier", "ecorce_incluse_houppier",
    "bois_mort_houppier", "plaie_houppier", "contrainte",
    "classification_diagnostic",
]
fill_values = {column: df[column].mode()[0] for column in mode_imputed_columns}
# The only median-imputed column.
fill_values["esperance_maintien"] = df["esperance_maintien"].median()

df = df.fillna(fill_values)

In [None]:
df.head()

In [None]:
#define function to swap columns so that all numerical data will be in the end columns
def swap_columns(df, col1, col2):
    """Return a view of ``df`` with the positions of two columns exchanged.

    Args:
        df: DataFrame whose column order should be changed.
        col1: Label of the first column.
        col2: Label of the second column.

    Returns:
        ``df`` re-indexed with ``col1`` and ``col2`` swapped; the input frame
        itself keeps its original column order.

    Raises:
        ValueError: If either label is not among ``df.columns``.
    """
    order = list(df.columns)
    first = order.index(col1)
    second = order.index(col2)
    order[first], order[second] = order[second], order[first]
    return df[order]

# Swapping columns: each pair exchanges positions, applied in this order.
column_swaps = [
    ('matricule_arbre', 'contrainte'),
    ('surf_permeable', 'plaie_houppier'),
    ('date_plantation', 'bois_mort_houppier'),
    ('hauteur', 'ecorce_incluse_houppier'),
    ('diametre', 'fissure_houppier'),
]
for left_column, right_column in column_swaps:
    df = swap_columns(df, left_column, right_column)

df.info()

In [None]:
_ = df.hist(figsize=(20, 14))

In [None]:
# `df` still holds object (categorical) columns at this point; recent pandas
# versions raise on non-numeric columns in DataFrame.corr(), so restrict the
# correlation to numeric/boolean columns explicitly.
cor_matrix = df.select_dtypes(include=["number", "bool"]).corr()
cor_matrix

In [None]:
df.nunique()

In [None]:
df['classification_diagnostic'].value_counts()

In [None]:
target_name = "classification_diagnostic"
target = df[target_name]

### One Hot Encoding for Categorical features and Standard Scaling for Numerical Features

In [None]:
data= df

In [None]:
from sklearn.compose import make_column_selector as selector
# Separating Numerical and Categorical columns
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# `data` is the full frame and therefore still contains the label column.
# Keep the label out of both feature lists: the ColumnTransformer built from
# these lists drops every column it is not told about (remainder="drop" is
# the default), so the model no longer sees its own target as a feature.
numerical_columns = [
    column for column in numerical_columns_selector(data)
    if column != target_name
]
categorical_columns = [
    column for column in categorical_columns_selector(data)
    if column != target_name
]

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

In [None]:
from sklearn.compose import ColumnTransformer
## Creating separate Transformers
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

## Building Model Pipeline

### Logistic Regression Model with One Hot Encoding

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model_lr_ohe = make_pipeline(preprocessor, LogisticRegression(max_iter=500, random_state=0))

In [None]:
from sklearn import set_config
set_config(display='diagram')
model_lr_ohe

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data, target, random_state=0, test_size=0.1)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
%%time
logistic_regression_ohe_model = model_lr_ohe.fit(X_train, y_train)

In [None]:
logistic_regression_ohe_model.score(X_test, y_test)

In [None]:
logistic_regression_ohe_model.predict(X_test)[:5]

#### Model Evaluation with Cross-Validation

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation on the training split; time.process_time() reports
# CPU time of this process, not wall-clock time.
start = time.process_time()
cv_results_lrm_ohe = cross_validate(
    logistic_regression_ohe_model, X_train, y_train, cv=5
)
scores = cv_results_lrm_ohe["test_score"]
print("Test Scores for the Logistic Regression Model with 5-Fold Cross-Validation:")
print(scores)
mean_accuracy, accuracy_std = scores.mean(), scores.std()
print(
    f"The Mean Cross-Validation Accuracy is: {mean_accuracy:.3f} "
    f"with Standard Deviation of +/- {accuracy_std:.3f}"
)
elapsed = time.process_time() - start
print(f"Total time taken: {elapsed}")

### Logistic Regression with Ordinal Encoding

In [None]:
from sklearn.compose import make_column_selector as selector
#Separating Numerical and Categorical columns
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

categorical_columns = categorical_columns_selector(data)
data_categorical = data[categorical_columns]
data_categorical.head()

In [None]:
print(f"The dataset is composed of {data_categorical.shape[1]} Categorical features")

In [None]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

categorical_encoder = OrdinalEncoder()
data_encoded = categorical_encoder.fit_transform(data_categorical)
data_encoded.shape

In [None]:
data_encoded[:5]

In [None]:
print(
    f"The dataset encoded contains {data_encoded.shape[1]} features")

In [None]:
len(categorical_columns)

In [None]:
categorical_columns

In [None]:
# Wrap the ordinal-encoded NumPy array in a DataFrame.
# Column labels and row index are taken from `data_categorical` (the frame the
# encoder was fitted on) instead of a hand-copied list of names, so the labels
# cannot drift out of sync with the selector output and the later
# axis=1 concat with the numerical columns aligns row by row.
data_cat_encoded_to_num = pd.DataFrame(
    data_encoded,
    columns=data_categorical.columns,
    index=data_categorical.index,
)

In [None]:
data_cat_encoded_to_num.head()

In [None]:
data_numerical = data[numerical_columns]
data_numerical_all_features= pd.concat([data_cat_encoded_to_num, data_numerical], axis=1)

In [None]:
data_numerical_all_features.info()

In [None]:
data_numerical_all_features.head()

In [None]:
_ = data_numerical_all_features.hist(figsize=(16, 15))

In [None]:
# Correlate the fully numeric frame (ordinal-encoded categoricals + numerical
# columns) inspected in this section; `df` still holds object columns and was
# already correlated earlier in the notebook.
cor_matrix = data_numerical_all_features.corr()
cor_matrix

In [None]:
# `data_numerical_all_features` contains the ordinal-encoded label column.
# Leave it out of the scaled feature list: the ColumnTransformer drops any
# column not listed (remainder="drop" is the default), which keeps the target
# from being fed to the classifier as a feature.
numerical_columns = [
    column for column in numerical_columns_selector(data_numerical_all_features)
    if column != target_name
]
numerical_preprocessor = StandardScaler()
from sklearn.compose import ColumnTransformer
## Creating separate Transformers
preprocessor = ColumnTransformer([
    ('Standard_scaler', numerical_preprocessor, numerical_columns)])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model_lr_oe = make_pipeline(preprocessor, LogisticRegression(max_iter=1000, random_state=0))

In [None]:
from sklearn import set_config
set_config(display='diagram')
model_lr_oe

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data_numerical_all_features, target, random_state=0, test_size=0.1)

In [None]:
%%time
logistic_regression_oe_model = model_lr_oe.fit(X_train, y_train)

In [None]:
logistic_regression_oe_model.score(X_test, y_test)

In [None]:
logistic_regression_oe_model.predict(X_test)[:5]

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the ordinal-encoded logistic regression pipeline;
# time.process_time() reports CPU time of this process.
start = time.process_time()
cv_results_lrm_oe = cross_validate(
    logistic_regression_oe_model, X_train, y_train, cv=5
)
scores = cv_results_lrm_oe["test_score"]
print("Test Scores for the Logistic Regression Model with 5-Fold Cross-Validation:")
print(scores)
mean_accuracy, accuracy_std = scores.mean(), scores.std()
print(
    f"The Mean Cross-Validation Accuracy is: {mean_accuracy:.3f} "
    f"with Standard Deviation of +/- {accuracy_std:.3f}"
)
elapsed = time.process_time() - start
print(f"Total time taken: {elapsed}")

### Decision Tree Model (Without Pruning)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# `data` still contains the label column; build the feature frame without it
# so the tree cannot split on its own target. errors="ignore" keeps this cell
# working if the label has already been removed from `data`.
features = data.drop(columns=[target_name], errors="ignore")

categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(features)

# Unknown categories seen at predict time are mapped to -1 instead of raising.
categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
# Ordinal-encode the object columns; every other column is passed through as is.
preprocessor = ColumnTransformer([
    ('cat_preprocessor', categorical_preprocessor, categorical_columns)],
    remainder='passthrough', sparse_threshold=0)
model_dt = make_pipeline(preprocessor, DecisionTreeClassifier(random_state=0, criterion='entropy'))
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=0, test_size=0.1)

In [None]:
from sklearn import set_config
set_config(display='diagram')
model_dt

In [None]:
%%time
decision_tree_model_no_pruning = model_dt.fit(X_train, y_train)

In [None]:
decision_tree_model_no_pruning.score(X_test, y_test)

In [None]:
decision_tree_model_no_pruning.predict(X_test)[:5]

In [None]:
# 5-fold cross-validation of the unpruned decision tree pipeline;
# time.process_time() reports CPU time of this process.
start = time.process_time()
cv_results_dtmnp = cross_validate(
    decision_tree_model_no_pruning, X_train, y_train, cv=5
)
scores = cv_results_dtmnp["test_score"]
print("Test Scores for the Decision Tree Model with 5-Fold Cross-Validation:")
print(scores)
mean_accuracy, accuracy_std = scores.mean(), scores.std()
print(
    f"The Mean Cross-Validation Accuracy is: {mean_accuracy:.3f} "
    f"with Standard Deviation of +/- {accuracy_std:.3f}"
)
elapsed = time.process_time() - start
print(f"Total time taken: {elapsed}")

### Pruning decision trees with cost complexity pruning and analysing results
Total impurity of leaves vs effective alphas of pruned tree
Minimal cost complexity pruning recursively finds the node with the “weakest link”. The weakest link is characterized by an effective alpha, where the nodes with the smallest effective alpha are pruned first. To get an idea of what values of ccp_alpha could be appropriate, scikit-learn provides DecisionTreeClassifier.cost_complexity_pruning_path that returns the effective alphas and the corresponding total leaf impurities at each step of the pruning process. As alpha increases, more of the tree is pruned, which increases the total impurity of its leaves.

In [None]:
# NOTE(review): data_numerical_all_features is assembled from every object
# column of `data` (which is `df`), so it appears to still carry the
# ordinal-encoded `classification_diagnostic` label as a feature -- confirm,
# and exclude it consistently in every later cell that reuses this X_train.
X_train, X_test, y_train, y_test = train_test_split(
    data_numerical_all_features,
    target,
    random_state=0,
    test_size=0.1,
)

# Fit an unpruned entropy tree, then ask it for the effective alphas and the
# matching total leaf impurities along the cost-complexity pruning path.
classifier = DecisionTreeClassifier(random_state=0, criterion="entropy")
classifier.fit(X_train, y_train)
path = classifier.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

The following plot shows every effective alpha value, including the maximum one, which corresponds to the trivial tree with only one node.

In [None]:
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:], impurities[:], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

Next, we train one decision tree per effective alpha. The last value in ccp_alphas is the alpha value that prunes the whole tree, leaving the last tree, classifiers[-1], with a single node.

In [None]:
# Train one entropy tree per effective alpha, in the order of `ccp_alphas`.
classifiers = [
    DecisionTreeClassifier(
        random_state=0, criterion="entropy", ccp_alpha=alpha
    ).fit(X_train, y_train)
    for alpha in ccp_alphas
]

# The tree trained with the largest alpha is the most heavily pruned one.
last_tree = classifiers[-1]
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        last_tree.tree_.node_count, ccp_alphas[-1]
    )
)

The last element in classifiers and ccp_alphas is the trivial tree with only one node; in this notebook it is not removed, so it still appears in the plots below. Here we show that the number of nodes and the tree depth decrease as alpha increases.

In [None]:
# The last (single-node) tree is not trimmed from classifiers / ccp_alphas here.
node_counts = [tree.tree_.node_count for tree in classifiers]
depth = [tree.tree_.max_depth for tree in classifiers]

# Two stacked panels sharing the same x values: node count on top, depth below.
fig, ax = plt.subplots(2, 1)
panels = [
    (node_counts, "number of nodes", "Number of nodes vs alpha"),
    (depth, "depth of tree", "Depth vs alpha"),
]
for axis, (values, y_label, title) in zip(ax, panels):
    axis.plot(ccp_alphas, values, marker="o", drawstyle="steps-post")
    axis.set_xlabel("alpha")
    axis.set_ylabel(y_label)
    axis.set_title(title)
fig.tight_layout()

Accuracy vs alpha for training and testing sets
When ccp_alpha is set to zero and keeping the other default parameters of DecisionTreeClassifier, the tree overfits, leading to a 100% training accuracy and 88% testing accuracy. As alpha increases, more of the tree is pruned, thus creating a decision tree that generalizes better. In this example, setting ccp_alpha=0.015 maximizes the testing accuracy.

In [None]:
train_scores = [classifier.score(X_train, y_train) for classifier in classifiers]
test_scores = [classifier.score(X_test, y_test) for classifier in classifiers]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
train_scores = [classifier.score(X_train, y_train) for classifier in classifiers]
test_scores = [classifier.score(X_test, y_test) for classifier in classifiers]
node_counts = [classifier.tree_.node_count for classifier in classifiers]
fig, ax = plt.subplots()
ax.set_xlabel("total no. of nodes")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs Total no.of nodes for training and testing sets")
ax.plot(node_counts, train_scores, marker="o", label="train", drawstyle="steps-post")
ax.plot(node_counts, test_scores, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
train_scores = [classifier.score(X_train, y_train) for classifier in classifiers]
test_scores = [classifier.score(X_test, y_test) for classifier in classifiers]
depth = [classifier.tree_.max_depth for classifier in classifiers]
fig, ax = plt.subplots()
ax.set_xlabel("tree depth")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs Tree depth for training and testing sets")
ax.plot(depth, train_scores, marker="o", label="train", drawstyle="steps-post")
ax.plot(depth, test_scores, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
test_scores = np.asarray(test_scores)
test_scores.shape

In [None]:
# Two-column table: effective alpha next to the test accuracy of its tree.
print(np.column_stack((ccp_alphas, test_scores)))

### Tuning for Best Pruning Parameters with randomized-search

In [None]:
from scipy.stats import loguniform


class loguniform_int:
    """Log-uniform distribution on [a, b] whose samples are cast to integers."""

    def __init__(self, a, b):
        # Frozen scipy distribution that produces the underlying float samples.
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        """Draw samples and truncate them to ``int``.

        All arguments are forwarded unchanged to the wrapped distribution's
        ``rvs`` (e.g. ``size`` and ``random_state``).
        """
        samples = self._distribution.rvs(*args, **kwargs)
        return samples.astype(int)

from scipy.stats import uniform
class uniform_int:
    """Uniform distribution whose samples are cast to integers.

    The arguments are handed to ``scipy.stats.uniform`` positionally, i.e.
    ``a`` is the location and ``b`` the scale, so the float samples lie in
    ``[a, a + b]`` before truncation.
    """

    def __init__(self, a, b):
        # Frozen scipy distribution that produces the underlying float samples.
        self._distribution = uniform(a, b)

    def rvs(self, *args, **kwargs):
        """Draw samples and truncate them to ``int``.

        All arguments are forwarded unchanged to the wrapped distribution's
        ``rvs`` (e.g. ``size`` and ``random_state``).
        """
        samples = self._distribution.rvs(*args, **kwargs)
        return samples.astype(int)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# `data` still contains the label column, so it must be kept out of the
# encoded feature list; otherwise the tree is trained on its own target.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = [
    column for column in categorical_columns_selector(data)
    if column != target_name
]

# Unknown categories seen at predict time are mapped to -1 instead of raising.
categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)


def _target_columns(frame):
    """Return the label column of ``frame`` as a list (empty if it is absent)."""
    return [column for column in frame.columns if column == target_name]


# remainder='passthrough' would forward the label column untouched, so it is
# dropped explicitly. The callable selector resolves against the frame being
# fitted, which keeps this working whether or not that frame has the label.
preprocessor = ColumnTransformer([
    ('drop_target', 'drop', _target_columns),
    ('cat_preprocessor', categorical_preprocessor, categorical_columns)],
    remainder='passthrough', sparse_threshold=0)

In [None]:
from sklearn.pipeline import Pipeline

model_dtpp = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier(criterion= "entropy", random_state=0)),
])


In [None]:
from sklearn import set_config
set_config(display='diagram')
model_dtpp

In [None]:
%%time
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    # scipy.stats.uniform(loc, scale) samples from [loc, loc + scale], so this
    # draws ccp_alpha from [0.005, 0.025].
    # NOTE(review): confirm 0.02 was meant as a width, not as the upper bound.
    'classifier__ccp_alpha': uniform(0.005, 0.02),
    # Samples are truncated to int: the upper bound itself is essentially
    # never drawn.
    'classifier__max_depth': loguniform_int(5, 11),
    'classifier__max_leaf_nodes': loguniform_int(34, 76),
}

# 2000 sampled candidates x 5 folds = 10000 pipeline fits.
search_settings = dict(n_iter=2000, cv=5, verbose=1, random_state=0)
model_random_search_dtpp = RandomizedSearchCV(
    model_dtpp,
    param_distributions=param_distributions,
    **search_settings,
)
model_random_search_dtpp.fit(X_train, y_train)

In [None]:
accuracy = model_random_search_dtpp.score(X_test, y_test)

print(f"The test accuracy score of the best model is "
      f"{accuracy:.2f}")

In [None]:
from pprint import pprint

print("The best parameters are:")
pprint(model_random_search_dtpp.best_params_)


We can inspect the results using the `cv_results_` attribute, as we did
previously.

In [None]:
# get the parameter names
column_results = [
    f"param_{name}" for name in param_distributions.keys()]
column_results += [
    "mean_test_score", "std_test_score", "rank_test_score"]

cv_results = pd.DataFrame(model_random_search_dtpp.cv_results_)
cv_results = cv_results[column_results].sort_values(
    "mean_test_score", ascending=False)

def shorten_param(param_name):
    """Return the part of ``param_name`` after its last ``"__"`` separator.

    Names without ``"__"`` are returned unchanged, e.g.
    ``"param_classifier__ccp_alpha"`` -> ``"ccp_alpha"`` and
    ``"mean_test_score"`` -> ``"mean_test_score"``.
    """
    # rpartition yields ("", "", param_name) when the separator is missing,
    # so the last element is the right answer in both cases.
    return param_name.rpartition("__")[2]

cv_results = cv_results.rename(shorten_param, axis=1)
cv_results.head(200)

In [None]:
def shorten_param(param_name):
    """Return the part of ``param_name`` after its last ``"__"`` separator.

    Names without ``"__"`` are returned unchanged, e.g.
    ``"param_classifier__ccp_alpha"`` -> ``"ccp_alpha"`` and
    ``"mean_test_score"`` -> ``"mean_test_score"``.
    """
    # rpartition yields ("", "", param_name) when the separator is missing,
    # so the last element is the right answer in both cases.
    return param_name.rpartition("__")[2]

cv_results = cv_results.rename(shorten_param, axis=1)
cv_results

In [None]:
import seaborn as sns

# Colour each sampled candidate by the bin its mean CV score falls into.
# bins=6 lets pandas derive six equal-width bins from the observed score
# range; the previous hard-coded edges came from one particular run, and any
# score outside them was silently turned into NaN (and vanished from the hue).
df_tuning = pd.DataFrame(
    {
        "max_leaf_nodes": cv_results["max_leaf_nodes"],
        "ccp_alpha": cv_results["ccp_alpha"],
        "score_bin": pd.cut(cv_results["mean_test_score"], bins=6),
    }
)
sns.set_palette("YlGnBu_r")
ax = sns.scatterplot(
    data=df_tuning,
    x="ccp_alpha",
    y="max_leaf_nodes",
    hue="score_bin",
    s=28,
    color="k",
    edgecolor=None,
)

# Put the legend outside the axes, to the right of the plot.
_ = ax.legend(title="mean_test_score", loc="center left", bbox_to_anchor=(1, 0.5))

In [None]:
import seaborn as sns

# Colour each sampled candidate by the bin its mean CV score falls into.
# bins=6 lets pandas derive six equal-width bins from the observed score
# range; the previous hard-coded edges came from one particular run, and any
# score outside them was silently turned into NaN (and vanished from the hue).
df_tuning = pd.DataFrame(
    {
        "max_depth": cv_results["max_depth"],
        "ccp_alpha": cv_results["ccp_alpha"],
        "score_bin": pd.cut(cv_results["mean_test_score"], bins=6),
    }
)
sns.set_palette("YlGnBu_r")
ax = sns.scatterplot(
    data=df_tuning,
    x="ccp_alpha",
    y="max_depth",
    hue="score_bin",
    s=15,
    color="k",
    edgecolor=None,
)

# Put the legend outside the axes, to the right of the plot.
_ = ax.legend(title="mean_test_score", loc="center left", bbox_to_anchor=(1, 0.5))

In [None]:
# Convert the sampled hyper-parameter columns to numeric dtypes, one at a time.
for tuned_column in ("max_depth", "ccp_alpha", "max_leaf_nodes"):
    cv_results[tuned_column] = pd.to_numeric(cv_results[tuned_column])
cv_results.info()

In [None]:
cv_results_plot= cv_results.drop(["std_test_score"], axis=1)
import plotly.express as px

fig = px.parallel_coordinates(
    cv_results_plot,
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.show()

### Final Decision Tree Model with Pruning

In [None]:
model_dtp = make_pipeline(preprocessor, DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=8, max_leaf_nodes= 90, ccp_alpha=0.00667))

In [None]:
from sklearn import set_config
set_config(display='diagram')
model_dtp

In [None]:
%%time
decision_tree_model_pruning = model_dtp.fit(X_train, y_train)

In [None]:
decision_tree_model_pruning.score(X_test, y_test)

In [None]:
# 5-fold cross-validation of the final pruned decision tree pipeline;
# time.process_time() reports CPU time of this process.
start = time.process_time()
cv_results_dtmp = cross_validate(
    decision_tree_model_pruning, X_train, y_train, cv=5
)
scores = cv_results_dtmp["test_score"]
print("Test Scores for the Final Decision Tree Model (Pruning) with 5-Fold Cross-Validation:")
print(scores)
mean_accuracy, accuracy_std = scores.mean(), scores.std()
print(
    f"The Mean Cross-Validation Accuracy is: {mean_accuracy:.3f} "
    f"with Standard Deviation of +/- {accuracy_std:.3f}"
)
elapsed = time.process_time() - start
print(f"Total time taken: {elapsed}")