# Decision tree implemented from scratch on the Titanic dataset, with the mathematical intuition behind it

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Example usage:
# Load the dataset

In [None]:
# Read the training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that exact file exists — consider a path relative to the notebook.
titanic_data = pd.read_csv(r"C:\Users\Ankit\Desktop\All folders\BrainyBeam tasks\train.csv")

In [None]:
# Remove the columns that are not used as model features.
titanic_data = titanic_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

# Fill missing Age / Fare values with the column median.
# The result is assigned back explicitly: calling fillna(..., inplace=True) on
# a column selection is a chained in-place operation, which pandas 2.x warns
# about and which leaves the DataFrame unchanged under Copy-on-Write.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
titanic_data['Fare'] = titanic_data['Fare'].fillna(titanic_data['Fare'].median())

# Encode Sex numerically (male -> 0, female -> 1); any other value becomes NaN.
titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})

# Prepare the features and target variable

In [None]:

# Target vector: the 'Survived' column as a NumPy array.
y = titanic_data['Survived'].values
# Feature matrix: every remaining column, also as a NumPy array.
X = titanic_data.drop(columns='Survived').values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [3]:
class Node:
    """A single node of the decision tree.

    A node created with ``label`` acts as a leaf carrying a prediction; a
    node created with ``feature_idx`` and ``threshold`` describes a split.
    The ``left`` and ``right`` children always start as ``None`` and are
    attached by the code that builds the tree.
    """

    def __init__(self, feature_idx=None, threshold=None, label=None):
        # Children are not constructor arguments; they are linked afterwards.
        self.left = self.right = None

        # Split description (both None when the node only carries a label).
        self.feature_idx, self.threshold = feature_idx, threshold

        # Prediction stored at the node (None when the node is a split).
        self.label = label


class DecisionTree:
    """Binary decision-tree classifier grown greedily by information gain.

    Every internal node tests ``x[feature_idx] <= threshold``: samples that
    satisfy the test go to the left child, all others to the right child.
    Leaves store the most frequent class among the training samples that
    reached them.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum number of splits along any root-to-leaf path. With ``None``
        the tree keeps growing until a node is pure or no candidate split
        has a positive information gain.

    Notes
    -----
    A leaf is recognised by ``node.label is not None``, so ``None`` cannot
    be used as a class label.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.root = None  # set by fit()

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix (ndarray, nested list, DataFrame, ...).
        y : array-like of shape (n_samples,)
            Class label of each row of ``X``.

        Raises
        ------
        ValueError
            If the training set is empty.
        """
        # Normalise to ndarrays so column slicing and boolean masks work for
        # any array-like input, not only for NumPy arrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set.")
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        """Recursively build and return the subtree for samples ``(X, y)``."""
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or self._is_pure(y):
            return Node(label=self._get_most_common_label(y))

        # The parent entropy is identical for every candidate split of this
        # node, so compute it once instead of once per threshold.
        parent_entropy = self._entropy(y)

        best_gain = 0.0
        best_feature_idx = None
        best_threshold = None

        for feature_idx in range(X.shape[1]):
            # np.unique returns the distinct values sorted, so the candidate
            # thresholds are the midpoints between neighbouring values.
            unique_values = np.unique(X[:, feature_idx])
            thresholds = (unique_values[:-1] + unique_values[1:]) / 2

            for threshold in thresholds:
                gain = self._information_gain(
                    X, y, feature_idx, threshold, parent_entropy
                )
                # Strict comparison: a split must actually reduce entropy,
                # and the first best candidate wins ties.
                if gain > best_gain:
                    best_gain = gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold

        # No split improves on the parent: stop and emit a majority leaf.
        if best_feature_idx is None:
            return Node(label=self._get_most_common_label(y))

        left_mask = X[:, best_feature_idx] <= best_threshold
        right_mask = ~left_mask

        node = Node(feature_idx=best_feature_idx, threshold=best_threshold)
        node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return node

    def _get_most_common_label(self, y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        unique_labels, counts = np.unique(y, return_counts=True)
        return unique_labels[np.argmax(counts)]

    def _is_pure(self, y):
        """Return True when ``y`` contains exactly one distinct label."""
        return len(np.unique(y)) == 1

    def _entropy(self, y):
        """Return the Shannon entropy of the labels in ``y``, in bits."""
        # Only labels that occur are counted, so no probability is ever 0
        # and log2 is always defined.
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def _information_gain(self, X, y, feature_idx, threshold, parent_entropy=None):
        """Return the entropy reduction from splitting on one threshold.

        The samples are partitioned by ``X[:, feature_idx] <= threshold`` and
        the gain is the parent entropy minus the size-weighted entropy of the
        two partitions.

        Parameters
        ----------
        X, y : ndarray
            Samples and labels of the node being split.
        feature_idx : int
            Column of ``X`` to test.
        threshold : float
            Split value for that column.
        parent_entropy : float, optional
            Entropy of ``y`` if the caller already has it; computed here
            when omitted.

        Returns
        -------
        float
            The information gain; ``0.0`` when ``y`` is empty.
        """
        total = len(y)
        if total == 0:
            # Nothing to split; also avoids dividing by zero below.
            return 0.0
        if parent_entropy is None:
            parent_entropy = self._entropy(y)

        left_mask = X[:, feature_idx] <= threshold
        left_y = y[left_mask]
        right_y = y[~left_mask]

        weighted_child_entropy = (
            (len(left_y) / total) * self._entropy(left_y)
            + (len(right_y) / total) * self._entropy(right_y)
        )
        return parent_entropy - weighted_child_entropy

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        ``X`` may be any array-like of shape (n_samples, n_features).
        """
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Follow the split tests from ``node`` down to a leaf for sample ``x``."""
        if node.label is not None:
            return node.label

        if x[node.feature_idx] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)



# Create and train the decision tree classifier

In [4]:
# Build the from-scratch DecisionTree defined earlier in this notebook,
# limited to depth 5, and fit it on the training split.
tree = DecisionTree(max_depth=5)
tree.fit(X_train, y_train)

# Predict a label for every row of the held-out test split.
y_pred = tree.predict(X_test)

# Compare the predictions with the true test labels and print the accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.770949720670391


In [86]:
# from sklearn.tree import plot_tree
# import matplotlib.pyplot as plt
# # Visualize the decision tree
# plt.figure(figsize=(12, 6))
# plot_tree(tree, filled=True, feature_names=X1.columns,class_names=[str(i) for i in y_train])
# plt.show()

In [80]:
X

array([[ 0.    ,  3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    ,  1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 1.    ,  3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [ 0.    ,  3.    , 28.    ,  1.    ,  2.    , 23.45  ],
       [ 1.    ,  1.    , 26.    ,  0.    ,  0.    , 30.    ],
       [ 0.    ,  3.    , 32.    ,  0.    ,  0.    ,  7.75  ]])

In [32]:
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,28.0,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0000,C148,C


In [13]:
titanic_data["Age"].sort_values()

803     0.42
755     0.67
644     0.75
469     0.75
831     0.83
       ...  
116    70.50
96     71.00
493    71.00
851    74.00
630    80.00
Name: Age, Length: 891, dtype: float64

In [47]:
df=titanic_data.sort_values('Age', ascending=True)

In [48]:
titanic_data["Age"].nunique()

88

In [49]:
titanic_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,0,22.0,1,0,7.2500
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.9250
3,1,1,1,35.0,1,0,53.1000
4,0,3,0,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000
887,1,1,1,19.0,0,0,30.0000
888,0,3,1,28.0,1,2,23.4500
889,1,1,0,26.0,0,0,30.0000


In [63]:
df=df.drop(df[df["Age"] < 1].index)

In [55]:
df.loc[df["Age"]<1]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
803,1,3,0,0.42,0,1,8.5167
755,1,2,0,0.67,1,1,14.5
644,1,3,1,0.75,2,1,19.2583
469,1,3,1,0.75,2,1,19.2583
831,1,2,0,0.83,1,1,18.75
78,1,2,0,0.83,0,2,29.0
305,1,1,0,0.92,1,2,151.55


In [64]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
164,0,3,0,1.0,4,1,39.6875
827,1,2,0,1.0,0,2,37.0042
172,1,3,1,1.0,1,1,11.1333
386,0,3,0,1.0,5,2,46.9000
381,1,3,1,1.0,0,2,15.7417
...,...,...,...,...,...,...,...
116,0,3,0,70.5,0,0,7.7500
96,0,1,0,71.0,0,0,34.6542
493,0,1,0,71.0,0,0,49.5042
851,0,3,0,74.0,0,0,7.7750


In [130]:
# Pairwise correlation matrix of df's columns.
# NOTE(review): the next cell looks up 'Total_Profit', which suggests df is
# expected to be the Sales.csv frame loaded in the In [80] cell listed further
# down, not the Titanic frame built above — confirm and run that cell first.
# NOTE(review): on pandas >= 2.0, corr() raises if df holds non-numeric columns;
# numeric_only=True may be needed depending on the CSV's columns — TODO confirm.
correlations = df.corr()

In [131]:
target_corr = correlations['Total_Profit'].abs().sort_values(ascending=False)

In [132]:
target_corr

Total_Profit     1.000000
Total_Revenue    0.897327
Unit_Cost        0.467214
Unnamed: 9            NaN
Unnamed: 10           NaN
Name: Total_Profit, dtype: float64

In [80]:
df=pd.read_csv("Sales.csv")

In [81]:
import matplotlib.pyplot as plt


In [101]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Toy dataset: two categorical features (Color, Size) and a categorical target (Habitat)
data = {
    'Color': ['Brown', 'Blue', 'Green', 'Blue', 'Brown', 'Green', 'Blue', 'Brown', 'Green'],
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small', 'Large', 'Medium', 'Small', 'Medium'],
    'Habitat': ['Forest', 'Ocean', 'Forest', 'Ocean', 'Desert', 'Forest', 'Ocean', 'Desert', 'Forest']
}

# Build the frame, then replace both feature columns with their integer category codes
df = pd.DataFrame(data)
df = df.assign(
    Color=pd.Categorical(df['Color']).codes,
    Size=pd.Categorical(df['Size']).codes,
)

# The target is the Habitat column; the features are everything else
y = df['Habitat']
X = df.drop(columns='Habitat')

# Hold out 20% of the rows for testing, using a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit scikit-learn's decision tree on the training rows
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Predict the held-out rows and print the accuracy against the true labels
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 1.0
