In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
from scipy.stats import mode
import time

In [2]:
data = sns.load_dataset('titanic')

In [3]:
df = data.drop(['deck','class','who','adult_male','embark_town','alive','alone'],axis=1)

In [4]:
df['age'] = df['age'].fillna(df.groupby('sex')['age'].transform('mean'))

In [5]:
df['fam'] = df['parch'] + df['sibsp']

In [6]:
df = pd.get_dummies(df, columns=['sex','embarked','pclass'], dtype=int)

In [8]:
y = df['survived']
X = df.drop('survived', axis=1)

In [9]:
df.head()

Unnamed: 0,survived,age,sibsp,parch,fare,fam,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,pclass_1,pclass_2,pclass_3
0,0,22.0,1,0,7.25,1,0,1,0,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,1,0,1,0,0,1,0,0
2,1,26.0,0,0,7.925,0,1,0,0,0,1,0,0,1
3,1,35.0,1,0,53.1,1,1,0,0,0,1,1,0,0
4,0,35.0,0,0,8.05,0,0,1,0,0,1,0,0,1


In [10]:
all_columns = df.columns.to_list()
random_columns = [np.random.choice(all_columns, 3, replace=True).tolist() for i in range (10)]

In [11]:
random_columns

[['age', 'pclass_3', 'pclass_2'],
 ['age', 'survived', 'fare'],
 ['pclass_1', 'embarked_S', 'survived'],
 ['age', 'embarked_C', 'fare'],
 ['pclass_3', 'embarked_Q', 'fare'],
 ['survived', 'fam', 'pclass_2'],
 ['pclass_1', 'pclass_1', 'parch'],
 ['sex_female', 'sibsp', 'sex_male'],
 ['pclass_2', 'pclass_3', 'pclass_1'],
 ['pclass_1', 'survived', 'embarked_S']]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
def build_decision_trees(X_train, y_train, num_models=10, num_columns=3):
    all_columns = X_train.columns.to_list()
    models = []
    for i in range(num_models):
        selected_columns = np.random.choice(all_columns, num_columns, replace=True).tolist()
        X_train_subset = X_train[selected_columns]
        tree = DecisionTreeClassifier(random_state=42)
        tree.fit(X_train_subset, y_train)
        models.append((tree, selected_columns))
    return models

In [14]:
models = build_decision_trees(X_train, y_train, num_models=10, num_columns=3)

In [15]:
models

[(DecisionTreeClassifier(random_state=42), ['fam', 'sex_female', 'sex_male']),
 (DecisionTreeClassifier(random_state=42), ['embarked_S', 'sex_male', 'fare']),
 (DecisionTreeClassifier(random_state=42), ['sex_male', 'parch', 'parch']),
 (DecisionTreeClassifier(random_state=42), ['embarked_C', 'parch', 'age']),
 (DecisionTreeClassifier(random_state=42), ['fare', 'pclass_3', 'pclass_3']),
 (DecisionTreeClassifier(random_state=42),
  ['embarked_Q', 'parch', 'embarked_Q']),
 (DecisionTreeClassifier(random_state=42), ['pclass_3', 'fare', 'pclass_3']),
 (DecisionTreeClassifier(random_state=42),
  ['embarked_C', 'age', 'embarked_Q']),
 (DecisionTreeClassifier(random_state=42), ['fam', 'sibsp', 'pclass_1']),
 (DecisionTreeClassifier(random_state=42), ['embarked_C', 'fare', 'fam'])]

In [16]:
def measure_accuracy(models, X_train, y_train):
    predictions = []
    for tree, selected_columns in models:
        X_train_subset = X_train[selected_columns]
        pred = tree.predict(X_train_subset)
        predictions.append(pred)
    predictions = np.array(predictions).T
    final_predictions, i = mode(predictions, axis=1)
    final_predictions = final_predictions.flatten()
    accuracy = accuracy_score(y_train, final_predictions)
    #print(predictions)
    #print(final_predictions)
    return accuracy