In [1]:
#Imports
import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv(f"./data/penguins_train.csv")
df = df.rename(columns={"Clutch Completion": "Clutch_Completion"})
print(df.info)


<bound method DataFrame.info of         Island Clutch_Completion  Culmen Length (mm)  Culmen Depth (mm)  \
0       Biscoe               Yes                38.8               17.2   
1    Torgersen               Yes                41.1               18.6   
2       Biscoe               Yes                39.0               17.5   
3        Dream                No                39.7               17.9   
4       Biscoe               Yes                47.5               14.2   
..         ...               ...                 ...                ...   
269     Biscoe               Yes                44.4               17.3   
270      Dream               Yes                36.4               17.0   
271      Dream               Yes                42.2               18.5   
272     Biscoe               Yes                37.8               18.3   
273  Torgersen               Yes                42.9               17.6   

     Flipper Length (mm)  Body Mass (g)     Sex  Delta 15 N (o/oo) 

In [3]:
#Data Preprocessing
def check_null_rows(df):
    df_count_row_nan = df[df.isnull().any(axis=1)]
    # print(f"No.of rows with NaN values: {df_count_row_nan}")
    #Contains only 15 rows with NaN values
    df = df.dropna(axis=0)
    df.index = range(0, df.shape[0])
    print(df.isnull().sum().sum())
    return df 
    
df = check_null_rows(df)
df.columns

0


Index(['Island', 'Clutch_Completion', 'Culmen Length (mm)',
       'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)', 'Sex',
       'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)', 'Species'],
      dtype='object')

In [15]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import accuracy_score

def train_test_split(df, train_frac):
    #Shuffle the dataframe
    df = df.sample(frac=1)
    df.index = range(0, df.shape[0])
    train_df = df.loc[0:int(train_frac * len(df)) - 1,:]
    test_df = df.loc[int(train_frac * len(df)):, :]
    return train_df, test_df 

def encode_features(df, features):
    label_encoder = LabelEncoder()
    ohe = OneHotEncoder(sparse=True)
    #Label Encoder followed by One Hot Encoding
    for feature in features:
        integer_encoded = label_encoder.fit_transform(df[feature]).reshape(-1,1)
        # print(label_encoder.classes_)
        array_hot_encoded = ohe.fit_transform(integer_encoded).toarray()
        # print(ohe.categories_)
        for i_iter, categories in enumerate(label_encoder.classes_):
            df[f"ohe_" + feature + "_" + categories] = array_hot_encoded[:, i_iter]
    # df_check = df.loc[:, ['Island', 'ohe_Island_Biscoe', 'ohe_Island_Dream', 'ohe_Island_Torgersen']]
    # print(df_check)
    # df_check_1 = df.loc[:, ['Sex', 'ohe_Sex_MALE', 'ohe_Sex_FEMALE']]
    # print(df_check_1)
    # print(f"Waiting for input...")
    return df 

def encode_label(df, labelname):
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(df[labelname]).reshape(-1,1)
    df['encoded_' + labelname] = integer_encoded
    return df 



train_frac = 0.8
df_subset = df.loc[:,:]
df_subset = encode_features(df_subset, ["Island", "Sex", "Clutch_Completion"])
df_subset = encode_label(df_subset, "Species")
#Train Test Split
df_train, df_test = train_test_split(df_subset, train_frac)



In [22]:
def create_bootstrap_train(df):
    t_k = df.sample(n = df.shape[0], replace=True)
    return t_k

def create_dt(X_train, Y_train):
    clf = DecisionTreeClassifier(criterion="gini", max_features="log2")
    clf.fit(X_train, Y_train)
    return clf 

#Creating the decision tree
# #Testing the accuracy on the test set
# yhat = clf.predict(X_test)
# print(f"Waiting for input...")
# accuracy_score(Y_test, yhat)
n = 100
feature_ls = ['ohe_Island_Biscoe', 'ohe_Island_Dream', 'ohe_Island_Torgersen', 'ohe_Sex_FEMALE', 'ohe_Sex_MALE',\
               'ohe_Clutch_Completion_No', 'ohe_Clutch_Completion_Yes', 'Culmen Length (mm)', 'Culmen Depth (mm)',\
                'Flipper Length (mm)', 'Body Mass (g)', 'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)']
label_ls = ['encoded_Species']
clf_ls = []
for i_iter in range(0, n):
    df_bootstrap_train = create_bootstrap_train(df_train)
    X_train, Y_train = df_bootstrap_train.loc[:, feature_ls].to_numpy(), df_bootstrap_train.loc[:, label_ls].to_numpy()
    clf = create_dt(X_train, Y_train)
    clf_ls.append(clf)


#Prediction
X_test, Y_test = df_test.loc[:, feature_ls].to_numpy(), df_test.loc[:, label_ls].to_numpy()
for i_iter, classifier in enumerate(clf_ls):
    Y_pred = classifier.predict(X_test).reshape(1,-1)
    if i_iter == 0:
        Y_pred_arr = Y_pred
    else:
        Y_pred_arr = np.concatenate((Y_pred_arr, Y_pred), axis=0)

Y_pred_best_ls = []
for j_iter in range(0, Y_pred_arr.shape[1]):
    counts = np.bincount(Y_pred_arr[:,j_iter]) #count the number of elements of each value
    Y_pred_best_ls.append(np.argwhere(counts == np.max(counts))[-1][0])


print(accuracy_score(Y_test, Y_pred_best_ls))

1.0
