In [143]:


import numpy as np
import pandas as pd

# Columns named 'zero', 'zero.1', ..., 'zero.18' are discarded in a single drop,
# and the '2urvived' column is renamed to 'Survived'.
zero_columns = ['zero'] + [f'zero.{i}' for i in range(1, 19)]
data = (
    pd.read_csv('train_and_test2.csv')
    .drop(columns=zero_columns)
    .rename(columns={'2urvived': 'Survived'})
)
print(data)


class DecisionTree:
    """Classification tree built from binary threshold splits (``x[feature] <= threshold``).

    At every split a subset of the features can be sampled at random (see
    ``max_features``), which is what a bagged ensemble of these trees relies on.

    Parameters
    ----------
    criterion : {'gini', 'entropy'}, default 'gini'
        Impurity measure used to score candidate splits.
    max_depth : int or None, default None
        Maximum depth of the tree; ``None`` means unlimited.
    min_samples_leaf : int, default 1
        Minimum number of samples each side of a split must keep (clamped to >= 1).
    max_features : 'sqrt', int or None, default 'sqrt'
        Number of features examined per split. ``'sqrt'`` samples
        ``max(1, int(sqrt(n_features)))`` features, an int samples that many
        (capped at ``n_features``), and ``None`` examines every feature in
        index order without drawing random numbers.
    random_state : int, numpy.random.RandomState or None, default None
        Source of randomness for the feature sampling. ``None`` draws from
        NumPy's global generator (``np.random``), so ``np.random.seed`` applies.

    Raises
    ------
    ValueError
        If ``criterion`` or ``max_features`` has an unsupported value.
    """

    class Node:
        """One tree node: a split (feature_index, threshold, left, right) or a leaf (value)."""

        def __init__(self, *, feature_index=None, threshold=None, left=None, right=None, value=None):
            self.feature_index = feature_index
            self.threshold = threshold
            self.left = left
            self.right = right
            # Predicted label for a leaf; stays None on internal nodes.
            self.value = value

    def __init__(self, criterion='gini', max_depth=None, min_samples_leaf=1,
                 max_features='sqrt', random_state=None):
        if criterion not in ('gini', 'entropy'):
            raise ValueError("criterion must be 'gini' or 'entropy'")
        if max_features is not None and not (isinstance(max_features, str) and max_features == 'sqrt'):
            is_positive_int = (
                isinstance(max_features, (int, np.integer))
                and not isinstance(max_features, bool)
                and max_features >= 1
            )
            if not is_positive_int:
                raise ValueError("max_features must be 'sqrt', a positive int, or None")

        self.criterion = criterion
        self.max_depth = float('inf') if max_depth is None else int(max_depth)
        self.min_samples_leaf = max(1, int(min_samples_leaf))
        self.max_features = max_features
        self.random_state = random_state
        # A private generator is only created when a seed/generator is supplied;
        # otherwise the global np.random stream is used at split time.
        if random_state is None:
            self._rng = None
        elif isinstance(random_state, np.random.RandomState):
            self._rng = random_state
        else:
            self._rng = np.random.RandomState(random_state)
        self.tree_ = None

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return 1.0 - np.sum(probs ** 2)

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        # Every count returned by np.unique is >= 1, so log2 never sees zero.
        probs = counts / counts.sum()
        return -np.sum(probs * np.log2(probs))

    def _impurity(self, y):
        """Dispatch to the impurity measure selected by ``self.criterion``."""
        return self._gini(y) if self.criterion == 'gini' else self._entropy(y)

    def _majority_class(self, y):
        """Return the most frequent label in ``y``; ties go to the smallest label.

        Uses ``np.unique`` so float, negative and string labels are supported.
        """
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _n_candidate_features(self, n_features):
        """Number of features to examine at one split, derived from ``max_features``."""
        if self.max_features is None:
            return n_features
        if isinstance(self.max_features, str):  # only 'sqrt' passes validation
            return max(1, int(np.sqrt(n_features)))
        return min(n_features, int(self.max_features))

    def _best_split(self, X, y):
        """Search the candidate features for the threshold with the largest impurity gain.

        Returns
        -------
        tuple
            ``(feature_index, threshold, left_mask, right_mask, gain)``. The
            first four entries are ``None`` and ``gain`` is ``0.0`` when no
            split with positive gain satisfies ``min_samples_leaf``.
        """
        m, n_features = X.shape
        if m < 2 * self.min_samples_leaf:
            return None, None, None, None, 0.0

        parent_impurity = self._impurity(y)
        best_gain = 0.0
        best_feat = None
        best_thresh = None
        best_left_mask = None
        best_right_mask = None

        if self.max_features is None:
            # Deterministic path: every feature, in index order, no RNG draw.
            feature_indices = np.arange(n_features)
        else:
            rng = np.random if self._rng is None else self._rng
            feature_indices = rng.choice(
                n_features, self._n_candidate_features(n_features), replace=False
            )

        for feat in feature_indices:
            values = X[:, feat]
            # Candidate thresholds are midpoints between consecutive distinct values.
            levels = np.unique(values)
            for lower, upper in zip(levels[:-1], levels[1:]):
                thresh = (lower + upper) / 2.0
                left_mask = values <= thresh
                n_left = np.count_nonzero(left_mask)
                n_right = m - n_left

                # Also discards NaN thresholds: they leave the left side empty.
                if n_left < self.min_samples_leaf or n_right < self.min_samples_leaf:
                    continue

                right_mask = ~left_mask
                gain = parent_impurity - (
                    (n_left / m) * self._impurity(y[left_mask]) +
                    (n_right / m) * self._impurity(y[right_mask])
                )

                if gain > best_gain:
                    best_gain = gain
                    best_feat = feat
                    best_thresh = thresh
                    best_left_mask = left_mask
                    best_right_mask = right_mask

        return best_feat, best_thresh, best_left_mask, best_right_mask, best_gain

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the tree and return its root ``Node``.

        A leaf is produced when the depth limit is reached, the node holds at
        most ``min_samples_leaf`` samples, the labels are pure, or no split
        with positive gain exists.
        """
        if depth >= self.max_depth or len(y) <= self.min_samples_leaf or np.all(y == y[0]):
            return DecisionTree.Node(value=self._majority_class(y))

        feat, thresh, left_mask, right_mask, gain = self._best_split(X, y)
        if feat is None or gain <= 0.0:
            return DecisionTree.Node(value=self._majority_class(y))

        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return DecisionTree.Node(feature_index=feat, threshold=thresh, left=left, right=right)

    def fit(self, X, y):
        """Grow the tree on ``X`` (n_samples x n_features) and labels ``y``.

        Returns
        -------
        DecisionTree
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``X`` and ``y`` disagree in length, or the
            training set is empty.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        X = np.asarray(X)
        y = np.asarray(y).ravel()

        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if y.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty training set")

        self.tree_ = self._build_tree(X, y)
        return self

    def _check_is_fitted(self):
        """Raise a descriptive ``AttributeError`` when ``fit`` has not been called."""
        if getattr(self, 'tree_', None) is None:
            raise AttributeError(
                "DecisionTree is not fitted yet; call fit(X, y) before predict/to_dict."
            )

    def _predict_one(self, x, node):
        """Walk from ``node`` down to a leaf for the sample ``x`` and return its label."""
        while node.value is None:
            node = node.left if x[node.feature_index] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``.

        Raises
        ------
        AttributeError
            If the tree has not been fitted.
        """
        self._check_is_fitted()
        if isinstance(X, pd.DataFrame):
            X = X.values
        return np.array([self._predict_one(row, self.tree_) for row in np.asarray(X)])

    # def _print_tree(self, node=None, feature_names=None, depth=0):
    #     if node is None:
    #         node = self.tree_
    #     prefix = "  " * depth
    #     if node.value is not None:
    #         print(f"{prefix}Leaf: predict = {node.value}")
    #     else:
    #         fname = feature_names[node.feature_index] if feature_names else f"X{node.feature_index}"
    #         print(f"{prefix}If {fname} <= {node.threshold:.4f}:")
    #         self._print_tree(node.left, feature_names, depth+1)
    #         print(f"{prefix}Else (if {fname} > {node.threshold:.4f}):")
    #         self._print_tree(node.right, feature_names, depth+1)

    def to_dict(self, node=None):
        """Serialise the (sub)tree rooted at ``node`` into nested plain dicts.

        ``node=None`` starts from the fitted root. Leaves become
        ``{'leaf': True, 'value': ...}``; splits carry ``feature_index``,
        ``threshold``, ``left`` and ``right``.

        Raises
        ------
        AttributeError
            If called without ``node`` before the tree has been fitted.
        """
        if node is None:
            self._check_is_fitted()
            node = self.tree_
        if node.value is not None:
            value = node.value
            # Unwrap NumPy scalars to native Python values (np.int64 -> int, ...).
            if isinstance(value, np.generic):
                value = value.item()
            return {'leaf': True, 'value': value}
        return {
            'leaf': False,
            'feature_index': int(node.feature_index),
            'threshold': float(node.threshold),
            'left': self.to_dict(node.left),
            'right': self.to_dict(node.right)
        }

    def confusion_matrix(self, y_true, y_pred):
        """Count (actual, predicted) label pairs.

        Rows index the actual class and columns the predicted class, both in
        the sorted order of the labels present in ``y_true`` or ``y_pred``.

        Raises
        ------
        ValueError
            If ``y_true`` and ``y_pred`` differ in length.
        """
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred).ravel()
        if y_true.shape[0] != y_pred.shape[0]:
            raise ValueError("y_true and y_pred must have the same length")

        n = y_true.shape[0]
        classes, encoded = np.unique(np.concatenate((y_true, y_pred)), return_inverse=True)
        encoded = encoded.ravel()
        matrix = np.zeros((len(classes), len(classes)), dtype=int)
        # np.add.at accumulates correctly when the same (row, col) pair repeats.
        np.add.at(matrix, (encoded[:n], encoded[n:]), 1)
        return matrix

    def classification_metrics(self, cm):
        """Compute accuracy, precision, recall and F1 from a 2x2 confusion matrix.

        ``cm`` is laid out as ``[[TN, FP], [FN, TP]]`` (rows actual, columns
        predicted, second class treated as positive). Any ratio whose
        denominator is zero is reported as ``0.0``.

        Raises
        ------
        ValueError
            If ``cm`` is not 2x2.
        """
        cm = np.asarray(cm)
        if cm.shape != (2, 2):
            raise ValueError(
                f"classification_metrics expects a 2x2 confusion matrix, got shape {cm.shape}"
            )

        TP = cm[1, 1]
        TN = cm[0, 0]
        FP = cm[0, 1]
        FN = cm[1, 0]
        total = TP + TN + FP + FN

        accuracy = (TP + TN) / total if total > 0 else 0.0
        precision = TP / (TP + FP) if TP + FP > 0 else 0.0
        recall = TP / (TP + FN) if TP + FN > 0 else 0.0
        f1_score = 2 * ((precision * recall) / (precision + recall)) if precision + recall > 0 else 0.0

        return {
            "accuracy": float(accuracy),
            "precision": float(precision),
            "recall": float(recall),
            "f1_score": float(f1_score)
        }




      Passengerid   Age      Fare  Sex  sibsp  Parch  Pclass  Embarked  \
0               1  22.0    7.2500    0      1      0       3       2.0   
1               2  38.0   71.2833    1      1      0       1       0.0   
2               3  26.0    7.9250    1      0      0       3       2.0   
3               4  35.0   53.1000    1      1      0       1       2.0   
4               5  35.0    8.0500    0      0      0       3       2.0   
...           ...   ...       ...  ...    ...    ...     ...       ...   
1304         1305  28.0    8.0500    0      0      0       3       2.0   
1305         1306  39.0  108.9000    1      0      0       1       0.0   
1306         1307  38.5    7.2500    0      0      0       3       2.0   
1307         1308  28.0    8.0500    0      0      0       3       2.0   
1308         1309  28.0   22.3583    0      1      1       3       0.0   

      Survived  
0            0  
1            1  
2            1  
3            1  
4            0  
...      

In [144]:
import numpy as np

def bootstrap_sampling(X, y, n_bootstraps=10, random_state=None):
    """Draw ``n_bootstraps`` bootstrap replicates of ``(X, y)``.

    Each replicate has as many rows as ``X`` and is sampled with replacement,
    using the same row indices for ``X`` and ``y``.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Samples; rows are selected positionally.
    y : pandas.Series or array-like
        Labels aligned with the rows of ``X``.
    n_bootstraps : int, default 10
        Number of replicates to draw.
    random_state : int or None, default None
        When given, seeds NumPy's *global* generator, so later ``np.random``
        draws are affected as well. When ``None`` the global generator is
        left untouched and sampling continues from its current state.

    Returns
    -------
    list of tuple
        ``[(X_boot, y_boot), ...]``. pandas inputs stay pandas objects;
        anything else is returned as a NumPy array.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    # Only reseed when a seed is supplied: np.random.seed(None) would replace
    # whatever global seed the caller configured with fresh entropy.
    if random_state is not None:
        np.random.seed(random_state)

    pandas_types = (pd.DataFrame, pd.Series)
    # Plain lists cannot be fancy-indexed, so convert non-pandas inputs to arrays.
    if not isinstance(X, pandas_types):
        X = np.asarray(X)
    if not isinstance(y, pandas_types):
        y = np.asarray(y)

    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")

    def take_rows(obj, idx):
        # Positional row selection for pandas objects and NumPy arrays alike.
        return obj.iloc[idx] if isinstance(obj, pandas_types) else obj[idx]

    bootstraps = []
    for _ in range(n_bootstraps):
        # Sample row indices with replacement.
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        bootstraps.append((take_rows(X, indices), take_rows(y, indices)))

    return bootstraps



def majority_vote(predictions):
    """Combine per-estimator predictions by majority vote.

    Parameters
    ----------
    predictions : array-like of shape (n_estimators, n_samples)
        Row ``k`` holds the labels predicted by estimator ``k``.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        The most frequent label in each column, in the dtype of the input
        labels. Ties are resolved in favour of the smallest label.

    Raises
    ------
    ValueError
        If ``predictions`` is not 2-dimensional or contains no estimators.
    """
    predictions = np.asarray(predictions)
    if predictions.ndim != 2:
        raise ValueError("predictions must have shape (n_estimators, n_samples)")

    n_estimator, n_sample = predictions.shape
    if n_estimator == 0:
        raise ValueError("predictions must contain at least one estimator")
    if n_sample == 0:
        return np.empty(0, dtype=predictions.dtype)

    # Encode labels as 0..K-1 so negative, float or string labels can be counted.
    labels, encoded = np.unique(predictions, return_inverse=True)
    encoded = encoded.reshape(predictions.shape)

    # votes[k, j] = number of estimators that predicted labels[k] for sample j.
    votes = np.zeros((labels.size, n_sample), dtype=int)
    columns = np.broadcast_to(np.arange(n_sample), predictions.shape)
    np.add.at(votes, (encoded, columns), 1)

    # labels is sorted, so argmax's first-maximum rule picks the smallest label on ties.
    return labels[np.argmax(votes, axis=0)]



In [145]:
import numpy as np
from sklearn.model_selection import train_test_split
# Predictors and target taken from the frame prepared in the first cell.
features = ['Age', 'Fare', 'Sex', 'sibsp', 'Parch', 'Pclass', 'Embarked']
X = data[features]
Y = data['Survived']

# Hold out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

n_estimator = 10
bootstraps = bootstrap_sampling(
    x_train, y_train, n_bootstraps=n_estimator, random_state=42
)

# Train one tree per bootstrap replicate and collect its test-set predictions.
all_pred = []
for boot_x, boot_y in bootstraps:
    tree_model = DecisionTree(criterion='gini', max_depth=5, min_samples_leaf=5)
    tree_model = tree_model.fit(boot_x, boot_y)  # fit returns the same instance
    pred = tree_model.predict(x_test)
    all_pred.append(pred)

# Shape (n_estimator, n_test): one row of predictions per tree.
all_pred = np.asarray(all_pred)
final_y_pred = majority_vote(all_pred)

# The metric helpers are methods on DecisionTree; the last trained tree is
# used only as a handle to call them.
cm = tree_model.confusion_matrix(y_test, final_y_pred)
metrics = tree_model.classification_metrics(cm)

print(metrics)








{'accuracy': 0.7442748091603053, 'precision': 0.5882352941176471, 'recall': 0.273972602739726, 'f1_score': 0.37383177570093457}
