In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import sklearn
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier   
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier 
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt


%matplotlib inline
plt.rcParams['font.size'] = 24


from Classification import *

# 1. Basic Exploration

#### (1) Import Data and get basic information

In [None]:
# Import data
data=pd.read_csv("ObesityData.csv")
data

In [None]:
data.dtypes

In [None]:
data.describe()

In [None]:
# List of all variables
num_var=['Age','FCVC','NCP','CH2O','FAF','TUE']
cat_var=['Gender','family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS']
all_var=num_var+cat_var
NObeyesdad_labels=[
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]
NObeyesdad_labels_for_exploration={}
for i in range(len(NObeyesdad_labels)):
    NObeyesdad_labels_for_exploration[NObeyesdad_labels[i]]=str(i)+'_'+NObeyesdad_labels[i]
for i in range(len(data)):
    data.loc[i,'NObeyesdad']=NObeyesdad_labels_for_exploration[data.loc[i,'NObeyesdad']]

In [None]:
single_trees=[]
for x in all_var:
    single_trees.append(Classification(data,x,'NObeyesdad'))

#### (2) Explore the target 'NObeyesdad'

In [None]:
y_order=NObeyesdad_labels_for_exploration.values()
single_trees[0].y_diagram(y_order)

#### (3) Explore each predictor

In [None]:
for t in single_trees:
    if t.x in num_var:
        t.x_diagram()
    else:
        t.x_diagram(numerical=False)

#### (4) Explore relations between each predictor and the target

In [None]:
for t in single_trees:
    if t.x in num_var:
        t.xy_diagram(y_order)
    else:
        t.xy_diagram(order=y_order,numerical=False)


# 2. Data Cleaning & Preprocessing

#### (1) Process invalid data

In [None]:
data1=data[all_var+['NObeyesdad']]
# Remove invalid values
data1.replace(to_replace=r'^\s*$',value=np.nan,regex=True,inplace=True)
data1.drop_duplicates()
data1.dropna()

#### (2) Encode categorical predictors to numerical

In [None]:
# Encode all categorical variables to numerical
oe = OrdinalEncoder()
data1[cat_var] = oe.fit_transform(data1[cat_var]) 
data1

# 3.Single Decision Tree on Each Predictor

#### (1) Create trees on each predictor

In [None]:
# Through experiments, depth=10 is the best for single decision trees

In [None]:
single_trees=[]
for x in all_var:
    t=Classification(data1,x,'NObeyesdad')
    t.apply_tree(max_depth=6)
    single_trees.append(t)

#### (2) Print goodness of each tree

In [None]:
print("Goodness of each model:")
for t in single_trees:
    t.print_goodness()

#### (3) Draw confusion matrices for each tree

In [None]:
print("Confusion_matrices of each model:")
for t in single_trees:
    t.draw_matrix()

# 4. Multi-Variate Decision Tree

#### (1) Create the multi-variate decision tree and configure the parameter "max_depth"

In [None]:
multitree=Classification(data1,all_var,'NObeyesdad')
best_depth=0
best_test_accuracy=0
accuracy_list=[0]
for d in range(1,21):    
    multitree.apply_tree(max_depth=d)
    train_accuracy=round(multitree.tree.score(multitree.x_train, multitree.y_train)*100,1)
    test_accuracy=round(multitree.tree.score(multitree.x_test, multitree.y_test)*100,1)
    accuracy_list.append((train_accuracy,test_accuracy))
    if test_accuracy>best_test_accuracy:
        best_test_accuracy=test_accuracy
        best_depth=d
for i in range(1,len(accuracy_list)):
    print(f"Depth: {i}, Accuracy: {accuracy_list[i]}")


#### (2) Apply the best depth (with highest prediction accuracy on test set)

In [None]:
multitree.apply_tree(max_depth=best_depth)
print("Best depth:",best_depth)
multitree.print_goodness()

#### (3) Draw the confusion matrix 

In [None]:
multitree.draw_matrix()

# 5. Random Forest

#### (1) Create the random forest and configure the parameter "max_depth"

In [None]:
multitree=Classification(data1,all_var,'NObeyesdad')
best_depth=0
best_test_accuracy=0
accuracy_list=[0]
for d in range(1,21):    
    multitree.apply_RandomForest(n_estimators=50,max_depth=d)
    train_accuracy=round(multitree.tree.score(multitree.x_train, multitree.y_train)*100,1)
    test_accuracy=round(multitree.tree.score(multitree.x_test, multitree.y_test)*100,1)
    accuracy_list.append((train_accuracy,test_accuracy))
    if test_accuracy>best_test_accuracy:
        best_test_accuracy=test_accuracy
        best_depth=d
for i in range(1,len(accuracy_list)):
    print(f"Depth: {i}, Accuracy: {accuracy_list[i]}")

In [24]:
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
import multiprocessing
from copy import deepcopy
multitree=Classification(data1,all_var,'NObeyesdad')

best_n=[0]
best_depth=[0]
best_split=[0]
best_leaf=[0]

best_test_accuracy=[0]

#args=multitree,n,d,s,l,lists=[best_n,best_depth,best_split,best_leaf,best_test_accuracy]
def training(args):
    multitree,n,d,s,l,lists=args
    best_n,best_depth,best_split,best_leaf,best_test_accuracy=lists

    tree_copy=deepcopy(multitree)
    tree_copy.apply_RandomForest(
        n_estimators=n,
        max_depth=d,
        min_samples_split=s,
        min_samples_leaf=l)
    test_accuracy=round(tree_copy.tree.score(multitree.x_test, multitree.y_test)*100,1)
    if test_accuracy>best_test_accuracy[0]:
        best_n[0]=n
        best_depth[0]=d
        best_split[0]=s
        best_leaf[0]=l
        best_test_accuracy[0]=test_accuracy

def multi_process(inputs):
    with ProcessPoolExecutor() as pool:
        pool.map(training,inputs)


inputs=[]
for n in range(40,60):  
    for d in range(1,21):  
        for s in range(2,6):
            for l in range(1,6):
                '''
                multitree.apply_RandomForest(
                    n_estimators=50,
                    max_depth=d,
                    min_samples_split=s,
                    min_samples_leaf=l)
                train_accuracy=round(multitree.tree.score(multitree.x_train, multitree.y_train)*100,1)
                test_accuracy=round(multitree.tree.score(multitree.x_test, multitree.y_test)*100,1)
                if test_accuracy>best_test_accuracy:
                    best_test_accuracy=test_accuracy
                    best_n=n
                    best_depth=d
                    best_split=s
                    best_leaf=l
                '''
                inputs.append([multitree,n,d,s,l,[best_n,best_depth,best_split,best_leaf,best_test_accuracy]])

multi_process(inputs)


Process SpawnProcess-12:
Traceback (most recent call last):
  File "/Users/apple/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/apple/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/apple/anaconda3/lib/python3.11/concurrent/futures/process.py", line 244, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/apple/anaconda3/lib/python3.11/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'training' on <module '__main__' (built-in)>
Process SpawnProcess-13:
Traceback (most recent call last):
  File "/Users/apple/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/apple/anaconda3/lib/python3.11/multiprocessing/process.py"

BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore

In [None]:
print("Best depth:",best_depth)
multitree.apply_RandomForest(n_estimators=50,max_depth=best_depth)
multitree.print_goodness()

In [None]:
'''
n_estimators= [10, 50, 100, 200, 400]
max_depth= [None, 10, 20, 30, 50]
min_samples_split= [2, 5, 10]
min_samples_leaf= [1, 2, 4]
max_features=['log2', 'sqrt']

max_scores=[0,0]
max_combinations=[0,0,0,0,0]
for a in n_estimators:
    for b in max_depth:
        for c in min_samples_split:
            for d in min_samples_leaf:
                for e in max_features:
                    multitree.apply_RandomForest(
                        n_estimators=a,
                        max_depth=b,
                        min_samples_split=c,
                        min_samples_leaf=d,
                        max_features=e,
                    )
                    train_score=round(multitree.tree.score(multitree.x_train, multitree.y_train),2)
                    test_score=round(multitree.tree.score(multitree.x_test, multitree.y_test),2)
                    multitree.print_goodness()
                    if test_score>max_scores[1]:
                        max_scores=[train_score,test_score]
                        max_combinations=[a,b,c,d,e]
multitree.apply_RandomForest(
    n_estimators=max_combinations[0],
    max_depth=max_combinations[1],
    min_samples_split=max_combinations[2],
    min_samples_leaf=max_combinations[3],
    max_features=max_combinations[4],
)
multitree.print_goodness()
'''

#### (2) Draw the confusion matrix 

In [None]:
multitree.draw_matrix()

# 6. More models

#### (1) These are some other models for classification

In [None]:
models = [
    ('DecisionTree', DecisionTreeClassifier(random_state=42)),
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('GradientBoost', GradientBoostingClassifier(random_state=42)),
    ('LogisticRegression', LogisticRegression(max_iter=1000, random_state=42)),
    ('SVM', SVC(random_state=42)),
]

#### (2) Apply each model (very slow)

In [None]:
multitree.apply_more_models(models)

#### (3) Print the result of each model

In [None]:
multitree.print_more_models_result()