<a href="https://colab.research.google.com/github/Anirudh11011/Decision-Trees-and-Boosting-for-Predictive-Modeling-/blob/main/ML_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: I want to unzip titanic.zip file

!unzip titanic.zip


Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
# pandas is imported here because no earlier cell imports it; without this the
# cell raises NameError on a fresh kernel (Restart & Run All).
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Store our test passenger IDs for easy access
PassengerId = test['PassengerId']

# Showing overview of the train dataset
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
import numpy as np

class DecisionTree:
    """Classification tree grown greedily by maximising impurity reduction.

    Internal nodes are dicts with the keys 'feature', 'threshold', 'left' and
    'right'; leaves are bare class labels. A sample goes left when
    ``x[feature] <= threshold`` and right otherwise.

    Parameters
    ----------
    criterion : {'gini', 'entropy', 'misclassification'}
        Impurity measure used to score candidate splits.
    max_depth : int or None
        Maximum depth of the tree. ``None`` means unlimited; ``0`` yields a
        single leaf holding the majority class.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    min_samples_leaf : int
        Minimum number of samples each child of a split must receive.
    """

    def __init__(self, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    @staticmethod
    def _class_probabilities(y):
        """Return the relative frequency of every class present in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        return counts / counts.sum()

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in ``y`` (smallest label wins ties).

        Uses ``np.unique`` rather than ``np.bincount`` so negative labels
        (e.g. -1/+1) and non-integer labels are supported.
        """
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _gini(self, y):
        """Gini impurity: 1 - sum(p_k ** 2)."""
        p = self._class_probabilities(y)
        return 1 - np.sum(p ** 2)

    def _entropy(self, y):
        """Shannon entropy in bits: -sum(p_k * log2(p_k))."""
        # Every class returned by np.unique has a positive count, so p > 0 and
        # no epsilon is needed; a pure node therefore scores exactly 0.
        p = self._class_probabilities(y)
        return -np.sum(p * np.log2(p))

    def _misclassification(self, y):
        """Misclassification error: 1 - max(p_k)."""
        p = self._class_probabilities(y)
        return 1 - np.max(p)

    def _best_split(self, X, y):
        """Find the (feature, threshold) pair with the largest impurity gain.

        Every distinct value of every feature is tried as a threshold.
        Returns ``(None, None)`` when no split satisfies the leaf-size limit.
        """
        best_feature, best_threshold, best_gain = None, None, -np.inf
        parent_impurity = self._compute_impurity(y)
        n_total = len(y)
        # A child must always receive at least one sample; an empty child
        # would make the recursion in _build_tree degenerate.
        min_leaf = max(1, self.min_samples_leaf)

        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                left_idx = column <= threshold
                n_left = left_idx.sum()
                n_right = n_total - n_left
                if n_left < min_leaf or n_right < min_leaf:
                    continue

                left_impurity = self._compute_impurity(y[left_idx])
                right_impurity = self._compute_impurity(y[~left_idx])
                weighted_impurity = (n_left * left_impurity + n_right * right_impurity) / n_total
                gain = parent_impurity - weighted_impurity

                if gain > best_gain:
                    best_feature, best_threshold, best_gain = feature, threshold, gain

        return best_feature, best_threshold

    def _compute_impurity(self, y):
        """Dispatch to the impurity function selected by ``self.criterion``.

        Raises
        ------
        ValueError
            If ``self.criterion`` is not one of the supported names.
        """
        if self.criterion == 'gini':
            return self._gini(y)
        elif self.criterion == 'entropy':
            return self._entropy(y)
        elif self.criterion == 'misclassification':
            return self._misclassification(y)
        else:
            raise ValueError("Invalid criterion. Choose 'gini', 'entropy', or 'misclassification'.")

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X``/``y``."""
        # Compare against None explicitly: a truthiness test would treat
        # max_depth=0 as "no limit".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if np.unique(y).size == 1 or depth_exhausted or len(y) < self.min_samples_split:
            return self._majority_class(y)

        feature, threshold = self._best_split(X, y)
        if feature is None:
            return self._majority_class(y)

        left_idx = X[:, feature] <= threshold
        right_idx = ~left_idx

        return {
            'feature': feature,
            'threshold': threshold,
            'left': self._build_tree(X[left_idx], y[left_idx], depth + 1),
            'right': self._build_tree(X[right_idx], y[right_idx], depth + 1),
        }

    def fit(self, X, y):
        """Grow the tree from a 2-D feature matrix ``X`` and label vector ``y``.

        Both arguments may be any array-like (lists, NumPy arrays, pandas
        objects); they are converted with ``np.asarray``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, the data are empty, or the number of rows in
            ``X`` differs from the number of labels.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features).")
        if len(y) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty dataset.")
        if X.shape[0] != len(y):
            raise ValueError("X and y must contain the same number of samples.")
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x, node):
        """Walk from ``node`` down to a leaf for one sample and return its label."""
        while isinstance(node, dict):
            branch = 'left' if x[node['feature']] <= node['threshold'] else 'right'
            node = node[branch]
        return node

    def predict(self, X):
        """Return the predicted label for every row of ``X`` as a NumPy array.

        Raises
        ------
        ValueError
            If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise ValueError("DecisionTree has not been fitted; call fit() first.")
        return np.array([self._predict_sample(x, self.tree) for x in np.asarray(X)])


In [None]:
class RandomForest:
    """Bagging ensemble that trains each tree on a bootstrap sample and a random feature subset.

    Parameters
    ----------
    base_classifier : callable
        Zero-argument factory returning a fresh estimator that exposes
        ``fit(X, y)`` and ``predict(X)``.
    num_trees : int
        Number of estimators to train.
    min_features : int
        Lower bound on the number of features drawn for each tree. It is
        capped at the number of available features.
    random_state : int or None
        Seed for the bootstrap and feature draws. ``None`` (the default) uses
        NumPy's global generator, so ``np.random.seed`` still governs the
        result.
    """

    def __init__(self, base_classifier, num_trees=10, min_features=2, random_state=None):
        self.base_classifier = base_classifier
        self.num_trees = num_trees
        self.min_features = min_features
        self.random_state = random_state
        self.trees = []
        self.features = []

    def fit(self, X, y):
        """Train ``num_trees`` estimators; any previously fitted trees are discarded."""
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        self.trees = []
        self.features = []

        # The np.random module and RandomState expose the same choice/randint
        # API, so the unseeded path behaves exactly like plain np.random calls.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        # randint(low, high) raises when low >= high, so cap the lower bound.
        fewest = min(self.min_features, n_features)

        for _ in range(self.num_trees):
            sample_indices = rng.choice(n_samples, n_samples, replace=True)
            n_selected = rng.randint(fewest, n_features + 1)
            feature_indices = rng.choice(n_features, n_selected, replace=False)
            self.features.append(feature_indices)

            tree = self.base_classifier()
            tree.fit(X[sample_indices][:, feature_indices], y[sample_indices])
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority vote of all trees for every row of ``X``.

        Raises
        ------
        ValueError
            If the forest has not been fitted yet.
        """
        # Imported locally so this cell does not depend on a later cell's imports.
        from collections import Counter

        if not self.trees:
            raise ValueError("RandomForest has not been fitted; call fit() first.")
        X = np.asarray(X)
        # Shape (num_trees, n_samples): one row of predictions per tree, each
        # tree seeing only the feature columns it was trained on.
        predictions = np.array([tree.predict(X[:, features]) for tree, features in zip(self.trees, self.features)])
        return np.array([Counter(predictions[:, i]).most_common(1)[0][0] for i in range(X.shape[0])])

class AdaBoost:
    """Binary AdaBoost that re-weights the training set by weighted resampling.

    The weak learners are only required to expose ``fit(X, y)`` and
    ``predict(X)``; since they cannot receive sample weights, each round
    trains on a bootstrap sample drawn with probabilities equal to the current
    weights.

    Parameters
    ----------
    weak_learner : callable
        Zero-argument factory returning a fresh weak learner.
    num_learners : int
        Number of boosting rounds.
    learning_rate : float
        Multiplier applied to every learner's vote weight.
    random_state : int or None
        Seed for the resampling. ``None`` uses NumPy's global generator.

    Notes
    -----
    Any two distinct labels are accepted (e.g. 0/1 or -1/+1). ``predict``
    returns the original labels; a tied vote resolves to the larger label.
    """

    def __init__(self, weak_learner, num_learners=50, learning_rate=1.0, random_state=None):
        self.weak_learner = weak_learner
        self.num_learners = num_learners
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.models = []
        self.alphas = []
        self.classes_ = None

    def fit(self, X, y):
        """Run the boosting rounds; a previous fit is discarded.

        Raises
        ------
        ValueError
            If the data are empty or contain more than two classes.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(y)
        if n_samples == 0:
            raise ValueError("Cannot fit AdaBoost on an empty dataset.")
        classes = np.unique(y)
        if classes.size > 2:
            raise ValueError("AdaBoost supports at most two classes.")

        self.classes_ = classes
        self.models = []
        self.alphas = []

        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        w = np.ones(n_samples) / n_samples

        for _ in range(self.num_learners):
            # Feed the current weights to the learner by resampling, so that
            # hard examples appear more often in its training set.
            idx = rng.choice(n_samples, size=n_samples, replace=True, p=w)
            model = self.weak_learner()
            model.fit(X[idx], y[idx])

            # Weighted error is measured on the full training set.
            missed = np.asarray(model.predict(X)) != y
            error = np.sum(w * missed) / np.sum(w)
            if error >= 0.5:
                # No better than chance: discard and draw a new sample.
                continue

            alpha = self.learning_rate * np.log((1 - error) / (error + 1e-9))
            self.alphas.append(alpha)
            self.models.append(model)

            if error == 0:
                # A perfect learner leaves the weights unchanged; stop here.
                break

            # Up-weight only the misclassified samples. This matches
            # alpha = log((1 - e) / e), which has no 1/2 factor.
            w = w * np.exp(alpha * missed)
            w /= np.sum(w)

    def predict(self, X):
        """Return the alpha-weighted majority vote for every row of ``X``.

        Raises
        ------
        ValueError
            If the ensemble has not been fitted yet.
        """
        if self.classes_ is None:
            raise ValueError("AdaBoost has not been fitted; call fit() first.")
        X = np.asarray(X)
        negative, positive = self.classes_[0], self.classes_[-1]

        score = np.zeros(X.shape[0])
        for alpha, model in zip(self.alphas, self.models):
            votes = np.where(np.asarray(model.predict(X)) == positive, 1.0, -1.0)
            score += alpha * votes
        return np.where(score >= 0, positive, negative)

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class DecisionTree:
    """Classification tree grown greedily by maximising impurity reduction.

    Internal nodes are dicts with the keys 'feature', 'threshold', 'left' and
    'right'; leaves are bare class labels. A sample goes left when
    ``x[feature] <= threshold`` and right otherwise.

    Parameters
    ----------
    criterion : {'gini', 'entropy', 'misclassification'}
        Impurity measure used to score candidate splits.
    max_depth : int or None
        Maximum depth of the tree. ``None`` means unlimited; ``0`` yields a
        single leaf holding the majority class.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    min_samples_leaf : int
        Minimum number of samples each child of a split must receive.
    """

    def __init__(self, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    @staticmethod
    def _class_probabilities(y):
        """Return the relative frequency of every class present in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        return counts / counts.sum()

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in ``y`` (smallest label wins ties).

        Uses ``np.unique`` rather than ``np.bincount`` so negative labels
        (e.g. -1/+1) and non-integer labels are supported.
        """
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _gini(self, y):
        """Gini impurity: 1 - sum(p_k ** 2)."""
        p = self._class_probabilities(y)
        return 1 - np.sum(p ** 2)

    def _entropy(self, y):
        """Shannon entropy in bits: -sum(p_k * log2(p_k))."""
        # Every class returned by np.unique has a positive count, so p > 0 and
        # no epsilon is needed; a pure node therefore scores exactly 0.
        p = self._class_probabilities(y)
        return -np.sum(p * np.log2(p))

    def _misclassification(self, y):
        """Misclassification error: 1 - max(p_k)."""
        p = self._class_probabilities(y)
        return 1 - np.max(p)

    def _best_split(self, X, y):
        """Find the (feature, threshold) pair with the largest impurity gain.

        Every distinct value of every feature is tried as a threshold.
        Returns ``(None, None)`` when no split satisfies the leaf-size limit.
        """
        best_feature, best_threshold, best_gain = None, None, -np.inf
        parent_impurity = self._compute_impurity(y)
        n_total = len(y)
        # A child must always receive at least one sample; an empty child
        # would make the recursion in _build_tree degenerate.
        min_leaf = max(1, self.min_samples_leaf)

        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                left_idx = column <= threshold
                n_left = left_idx.sum()
                n_right = n_total - n_left
                if n_left < min_leaf or n_right < min_leaf:
                    continue

                left_impurity = self._compute_impurity(y[left_idx])
                right_impurity = self._compute_impurity(y[~left_idx])
                weighted_impurity = (n_left * left_impurity + n_right * right_impurity) / n_total
                gain = parent_impurity - weighted_impurity

                if gain > best_gain:
                    best_feature, best_threshold, best_gain = feature, threshold, gain

        return best_feature, best_threshold

    def _compute_impurity(self, y):
        """Dispatch to the impurity function selected by ``self.criterion``.

        Raises
        ------
        ValueError
            If ``self.criterion`` is not one of the supported names.
        """
        if self.criterion == 'gini':
            return self._gini(y)
        elif self.criterion == 'entropy':
            return self._entropy(y)
        elif self.criterion == 'misclassification':
            return self._misclassification(y)
        else:
            raise ValueError("Invalid criterion. Choose 'gini', 'entropy', or 'misclassification'.")

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X``/``y``."""
        # Compare against None explicitly: a truthiness test would treat
        # max_depth=0 as "no limit".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if np.unique(y).size == 1 or depth_exhausted or len(y) < self.min_samples_split:
            return self._majority_class(y)

        feature, threshold = self._best_split(X, y)
        if feature is None:
            return self._majority_class(y)

        left_idx = X[:, feature] <= threshold
        right_idx = ~left_idx

        return {
            'feature': feature,
            'threshold': threshold,
            'left': self._build_tree(X[left_idx], y[left_idx], depth + 1),
            'right': self._build_tree(X[right_idx], y[right_idx], depth + 1),
        }

    def fit(self, X, y):
        """Grow the tree from a 2-D feature matrix ``X`` and label vector ``y``.

        Both arguments may be any array-like (lists, NumPy arrays, pandas
        objects); they are converted with ``np.asarray``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, the data are empty, or the number of rows in
            ``X`` differs from the number of labels.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features).")
        if len(y) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty dataset.")
        if X.shape[0] != len(y):
            raise ValueError("X and y must contain the same number of samples.")
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x, node):
        """Walk from ``node`` down to a leaf for one sample and return its label."""
        while isinstance(node, dict):
            branch = 'left' if x[node['feature']] <= node['threshold'] else 'right'
            node = node[branch]
        return node

    def predict(self, X):
        """Return the predicted label for every row of ``X`` as a NumPy array.

        Raises
        ------
        ValueError
            If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise ValueError("DecisionTree has not been fitted; call fit() first.")
        return np.array([self._predict_sample(x, self.tree) for x in np.asarray(X)])

class RandomForest:
    """Bagging ensemble that trains each tree on a bootstrap sample and a random feature subset.

    Parameters
    ----------
    base_classifier : callable
        Zero-argument factory returning a fresh estimator that exposes
        ``fit(X, y)`` and ``predict(X)``.
    num_trees : int
        Number of estimators to train.
    min_features : int
        Lower bound on the number of features drawn for each tree. It is
        capped at the number of available features.
    random_state : int or None
        Seed for the bootstrap and feature draws. ``None`` (the default) uses
        NumPy's global generator, so ``np.random.seed`` still governs the
        result.
    """

    def __init__(self, base_classifier, num_trees=10, min_features=3, random_state=None):
        self.base_classifier = base_classifier
        self.num_trees = num_trees
        self.min_features = min_features
        self.random_state = random_state
        self.trees = []
        self.features = []

    def fit(self, X, y):
        """Train ``num_trees`` estimators; any previously fitted trees are discarded."""
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        self.trees = []
        self.features = []

        # The np.random module and RandomState expose the same choice/randint
        # API, so the unseeded path behaves exactly like plain np.random calls.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        # randint(low, high) raises when low >= high, so cap the lower bound.
        fewest = min(self.min_features, n_features)

        for _ in range(self.num_trees):
            sample_indices = rng.choice(n_samples, n_samples, replace=True)
            n_selected = rng.randint(fewest, n_features + 1)
            feature_indices = rng.choice(n_features, n_selected, replace=False)
            self.features.append(feature_indices)

            tree = self.base_classifier()
            tree.fit(X[sample_indices][:, feature_indices], y[sample_indices])
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority vote of all trees for every row of ``X``.

        Raises
        ------
        ValueError
            If the forest has not been fitted yet.
        """
        if not self.trees:
            raise ValueError("RandomForest has not been fitted; call fit() first.")
        X = np.asarray(X)
        # Shape (num_trees, n_samples): one row of predictions per tree, each
        # tree seeing only the feature columns it was trained on.
        predictions = np.array([tree.predict(X[:, features]) for tree, features in zip(self.trees, self.features)])
        return np.array([Counter(predictions[:, i]).most_common(1)[0][0] for i in range(X.shape[0])])

# Load and preprocess Titanic dataset
def load_titanic_data(url="https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
                      test_size=0.2, random_state=42):
    """Download the Titanic dataset, clean it, and return a train/test split.

    Preprocessing steps:
      * drop the PassengerId, Name, Ticket and Cabin columns;
      * fill missing Age with the median and missing Embarked with the mode;
      * encode Sex as 0 (male) / 1 (female);
      * one-hot encode Embarked, dropping the first level.

    Parameters
    ----------
    url : str
        Location of the CSV file (anything ``pd.read_csv`` accepts).
    test_size : float
        Fraction of rows held out for testing.
    random_state : int
        Seed passed to ``train_test_split``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The four arrays returned by ``train_test_split``.
    """
    df = pd.read_csv(url)
    df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col]: that chained form acts on a temporary copy, triggers a pandas
    # FutureWarning, and stops working in pandas 3.0.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)
    # Request a float matrix explicitly: recent pandas emits boolean dummy
    # columns, which would otherwise turn the mixed frame into an object array.
    X = df.drop(columns=['Survived']).to_numpy(dtype=float)
    y = df['Survived'].to_numpy()
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

# Seed NumPy's global generator: RandomForest draws its bootstrap samples and
# feature subsets from np.random, so without a seed the reported accuracy
# changes on every run.
SEED = 42
np.random.seed(SEED)

X_train, X_test, y_train, y_test = load_titanic_data()

# Train and evaluate models
dt = DecisionTree(criterion='gini', max_depth=5)
dt.fit(X_train, y_train)
dt_predictions = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)

rf = RandomForest(base_classifier=lambda: DecisionTree(criterion='gini', max_depth=5), num_trees=10, min_features=3)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Display results
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


Decision Tree Accuracy: 0.7989
Random Forest Accuracy: 0.7989
