In [1]:
import numpy as np
import pandas as pd
# Load the dataset; the path is relative to the notebook's working directory.
data=pd.read_csv('classification.csv')

In [2]:
# Show the loaded DataFrame as the cell output.
# NOTE(review): this renders the whole frame; data.head() would be a lighter preview.
data


Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0
...,...,...,...
395,46,41000,1
396,51,23000,1
397,50,20000,1
398,36,33000,0


In [3]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Age              400 non-null    int64
 1   EstimatedSalary  400 non-null    int64
 2   Purchased        400 non-null    int64
dtypes: int64(3)
memory usage: 9.5 KB


In [4]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0
mean,37.655,69742.5,0.3575
std,10.482877,34096.960282,0.479864
min,18.0,15000.0,0.0
25%,29.75,43000.0,0.0
50%,37.0,70000.0,0.0
75%,46.0,88000.0,1.0
max,60.0,150000.0,1.0


In [5]:
# Count the missing values in each column.
data.isnull().sum()

Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [14]:
def calculate_gini_split(data, column, target, split_value):
    """Return the weighted Gini impurity of a binary threshold split.

    Rows with ``data[column] <= split_value`` form the left partition and rows
    with ``data[column] > split_value`` form the right partition. The Gini
    impurity of each partition is computed from the class frequencies of
    ``target`` and the two impurities are combined, each weighted by its
    partition's share of ``len(data)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the feature and the target column.
    column : str
        Name of the feature column that is thresholded.
    target : str
        Name of the class-label column.
    split_value : scalar
        Threshold compared against ``data[column]``.

    Returns
    -------
    float
        Weighted Gini impurity of the split; ``0.0`` when ``data`` has no rows
        (previously this case raised ``ZeroDivisionError``).
    """
    def gini_impurity(labels):
        # An empty partition has weight zero in the final sum, so any finite
        # value works here; returning early also avoids dividing by zero.
        size = len(labels)
        if size == 0:
            return 0.0
        # One pass over the labels instead of one filtered scan per class.
        # Dividing by the full partition size (value_counts skips NaN labels)
        # keeps the same proportions as counting matches class by class.
        proportions = labels.value_counts() / size
        return 1.0 - float((proportions ** 2).sum())

    total = len(data)
    if total == 0:
        return 0.0

    feature = data[column]
    left_labels = data.loc[feature <= split_value, target]
    right_labels = data.loc[feature > split_value, target]

    left_weight = len(left_labels) / total
    right_weight = len(right_labels) / total

    return left_weight * gini_impurity(left_labels) + right_weight * gini_impurity(right_labels)

# Evaluate one fixed threshold per feature and report the weighted Gini impurity
# of each split; each printed label names the feature the value belongs to.
Salary_gini = calculate_gini_split(data, column='EstimatedSalary', target='Purchased', split_value=70000)
Age_gini = calculate_gini_split(data, column='Age', target='Purchased', split_value=30)
print(f"Gini Index for Age: {Age_gini}")
print(f"Gini Index for EstimatedSalary: {Salary_gini}")

Gini Index for Age: 0.388655194987375
Gini Index for Age: 0.43159810957465433


# Question 2: entropy-based decision tree on categorical features

In [2]:
from collections import Counter
import numpy as np

class DecisionTree:
    """Binary decision tree for categorical features, grown by minimising entropy.

    Every internal node is a dict with ``'feature_index'``, ``'value'``,
    ``'left'`` and ``'right'``: a sample whose ``feature_index``-th entry equals
    ``value`` follows ``'left'``, any other sample follows ``'right'``.
    Every leaf is a dict with ``'prediction'`` (the most frequent training label
    that reached the leaf) and ``'probabilities'`` (label -> relative frequency
    among those training labels).
    """

    def __init__(self):
        # Nested dict describing the fitted tree; empty until fit() is called.
        self.tree = {}

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature rows; features are compared with ``==`` only.
        y : array-like of shape (n_samples,)
            Class label of each row.

        Raises
        ------
        ValueError
            If no samples are given or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("cannot fit a decision tree on zero samples")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        self.tree = self._build_tree(X, y)

    def _calculate_entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``; 0 if empty."""
        total = len(y)
        entropy = 0
        for count in Counter(y).values():
            probability = count / total
            entropy -= probability * np.log2(probability)
        return entropy

    def _split_data(self, X, y, feature_index, value):
        """Partition samples on ``X[:, feature_index] == value``.

        Returns ``(left_X, left_y, right_X, right_y)`` where the left part holds
        the matching samples and the right part holds all the others. Empty
        parts keep the column dimension of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        matches = X[:, feature_index] == value
        return X[matches], y[matches], X[~matches], y[~matches]

    def _make_leaf(self, y):
        """Build a leaf dict carrying the majority label and the label frequencies."""
        counts = Counter(np.asarray(y).tolist())
        if not counts:
            raise ValueError("cannot build a leaf from zero samples")
        total = sum(counts.values())
        majority_label = counts.most_common(1)[0][0]
        return {
            'prediction': majority_label,
            'probabilities': {label: count / total for label, count in counts.items()},
        }

    def _find_best_split(self, X, y):
        """Return ``(feature_index, value)`` of the split with the lowest weighted entropy.

        Only splits that put at least one sample on each side are considered.
        ``(-1, None)`` is returned when no such split exists, i.e. when all rows
        of ``X`` are identical or ``X`` has no columns.
        """
        best_entropy = float('inf')
        best_feature_index = -1
        best_value = None
        n_samples = len(y)

        for feature_index in range(X.shape[1]):
            # First-appearance order makes tie-breaking reproducible; iterating a
            # set of strings would depend on per-process hash randomisation.
            candidates = dict.fromkeys(X[:, feature_index].tolist())
            if len(candidates) < 2:
                continue  # a constant feature cannot separate anything
            for value in candidates:
                _, left_y, _, right_y = self._split_data(X, y, feature_index, value)
                if len(left_y) == 0 or len(right_y) == 0:
                    # Recursing on a one-sided split would never terminate.
                    continue
                total_entropy = (len(left_y) / n_samples) * self._calculate_entropy(left_y) + \
                                (len(right_y) / n_samples) * self._calculate_entropy(right_y)
                if total_entropy < best_entropy:
                    best_entropy = total_entropy
                    best_feature_index = feature_index
                    best_value = value

        return best_feature_index, best_value

    def _build_tree(self, X, y):
        """Recursively build the subtree for the samples ``X`` with labels ``y``."""
        leaf = self._make_leaf(y)
        if len(leaf['probabilities']) == 1:
            # Pure node: nothing left to separate.
            return leaf

        best_feature_index, best_value = self._find_best_split(X, y)
        if best_feature_index == -1:
            # Mixed labels but indistinguishable rows: stop with a majority
            # leaf whose probabilities reflect the actual label mix.
            return leaf

        left_X, left_y, right_X, right_y = self._split_data(X, y, best_feature_index, best_value)

        return {
            'feature_index': best_feature_index,
            'value': best_value,
            'left': self._build_tree(left_X, left_y),
            'right': self._build_tree(right_X, right_y)
        }

    def _predict_single(self, sample, tree):
        """Walk ``tree`` for one sample and return ``(prediction, probabilities)``."""
        node = tree
        while 'prediction' not in node:
            if sample[node['feature_index']] == node['value']:
                node = node['left']
            else:
                node = node['right']
        return node['prediction'], node['probabilities']

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns
        -------
        tuple(list, list)
            ``(predictions, probabilities)``: the predicted label per row and,
            per row, a dict mapping each label seen in the reached leaf to its
            relative frequency.
        """
        predictions = []
        probabilities = []
        for sample in X:
            prediction, probs = self._predict_single(sample, self.tree)
            predictions.append(prediction)
            # Copy so that callers cannot mutate the fitted tree through the result.
            probabilities.append(dict(probs))
        return predictions, probabilities


# Example usage: train on the 14-day "play outside?" weather table and
# classify four new days.
# Each training entry pairs a feature row (outlook, temperature, humidity,
# wind) with its label; the pairs are unzipped into the feature matrix and
# the label vector.
X_train, y_train = map(np.array, zip(*[
    (['Sunny', 'Hot', 'High', 'Weak'], 'No'),
    (['Sunny', 'Hot', 'High', 'Strong'], 'No'),
    (['Overcast', 'Hot', 'High', 'Weak'], 'Yes'),
    (['Rain', 'Mild', 'High', 'Weak'], 'Yes'),
    (['Rain', 'Cool', 'Normal', 'Weak'], 'Yes'),
    (['Rain', 'Cool', 'Normal', 'Strong'], 'No'),
    (['Overcast', 'Cool', 'Normal', 'Strong'], 'Yes'),
    (['Sunny', 'Mild', 'High', 'Weak'], 'No'),
    (['Sunny', 'Cool', 'Normal', 'Weak'], 'Yes'),
    (['Rain', 'Mild', 'Normal', 'Weak'], 'Yes'),
    (['Sunny', 'Mild', 'Normal', 'Strong'], 'Yes'),
    (['Overcast', 'Mild', 'High', 'Strong'], 'Yes'),
    (['Overcast', 'Hot', 'Normal', 'Weak'], 'Yes'),
    (['Rain', 'Mild', 'High', 'Strong'], 'No'),
]))

# Unlabelled days to classify.
X_test = np.array([
    ['Sunny', 'Hot', 'Normal', 'Weak'],
    ['Overcast', 'Cool', 'Normal', 'Strong'],
    ['Rain', 'Mild', 'High', 'Weak'],
    ['Sunny', 'Mild', 'High', 'Strong'],
])

tree = DecisionTree()
tree.fit(X_train, y_train)

# predict() returns the labels and, per sample, the label distribution of the leaf reached.
predictions, probabilities = tree.predict(X_test)
print("Predictions:", predictions)
print("Probabilities:", probabilities)


Predictions: ['Yes', 'Yes', 'Yes', 'No']
Probabilities: [{'Yes': 1.0}, {'Yes': 1.0}, {'Yes': 1.0}, {'No': 1.0}]
