In [1]:
import pandas as pd
import numpy as np

In [6]:
data = pd.read_csv('data.csv')

In [7]:
data

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Decision
0,Sunny,85,85,Weak,No
1,Sunny,80,90,Strong,No
2,Overcast,83,78,Weak,Yes
3,Rain,70,96,Weak,Yes
4,Rain,68,80,Weak,Yes
5,Rain,65,70,Strong,No
6,Overcast,64,65,Strong,Yes
7,Sunny,72,95,Weak,No
8,Sunny,69,70,Weak,Yes
9,Rain,75,80,Weak,Yes


In [10]:
import pandas as pd
import numpy as np

data = pd.read_csv('data.csv')

def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = -sum((count / sum(counts)) * np.log2(count / sum(counts)) for count in counts)
    return entropy

def gain_ratio(data, split_attribute, target_attribute):
    total_entropy = entropy(data[target_attribute])
    values, counts = np.unique(data[split_attribute], return_counts=True)
    weighted_entropy = sum((counts[i] / np.sum(counts)) * entropy(data.where(data[split_attribute] == values[i]).dropna()[target_attribute]) for i in range(len(values)))
    split_info = -sum((counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(values)))
    if split_info == 0:
        return 0
    gain = total_entropy - weighted_entropy
    return gain / split_info

def best_attribute(data, attributes, target_attribute):
    best_gain = -1
    best_attr = None
    for attr in attributes:
        gain = gain_ratio(data, attr, target_attribute)
        if gain > best_gain:
            best_gain = gain
            best_attr = attr
    return best_attr

def build_tree(data, attributes, target_attribute):
    if len(np.unique(data[target_attribute])) == 1:
        return data[target_attribute].iloc[0]
    if len(attributes) == 0:
        return data[target_attribute.mode()[0]]
    best_attr = best_attribute(data, attributes, target_attribute)
    tree = {best_attr: {}}
    unique_values = np.unique(data[best_attr])
    for value in unique_values:
        subset = data.where(data[best_attr] == value).dropna()
        remaining_attrs = [attr for attr in attributes if attr != best_attr]
        tree[best_attr][value] = build_tree(subset, remaining_attrs, target_attribute)
    return tree

def classify(tree, sample):
    if not isinstance(tree, dict):
        return tree
    attribute = next(iter(tree))
    attribute_value = sample[attribute]
    if attribute_value in tree[attribute]:
        return classify(tree[attribute][attribute_value], sample)
    else:
        return None

attributes = ['Outlook', 'Temperature', 'Humidity', 'Wind']
target_attribute = 'Decision'
decision_tree = build_tree(data, attributes, target_attribute)

print("Decision Tree:")
print(decision_tree)

new_sample = {'Outlook': 'Sunny', 'Temperature': 85, 'Humidity': 80, 'Wind': 'Weak'}
classification = classify(decision_tree, new_sample)

print(f"\nClassification for {new_sample}: {classification}")


Decision Tree:
{'Temperature': {64: 'Yes', 65: 'No', 68: 'Yes', 69: 'Yes', 70: 'Yes', 71: 'No', 72: {'Outlook': {'Overcast': 'Yes', 'Sunny': 'No'}}, 75: 'Yes', 80: 'No', 81: 'Yes', 83: 'Yes', 85: 'No'}}

Classification for {'Outlook': 'Sunny', 'Temperature': 85, 'Humidity': 80, 'Wind': 'Weak'}: No


In [15]:
import pandas as pd
import numpy as np

data = pd.read_csv('data.csv')

def gini_impurity(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    impurity = 1 - sum((count / sum(counts)) ** 2 for count in counts)
    return impurity

def best_split(data, attributes, target_attribute):
    best_gini = float('inf')
    best_attr = None
    best_split_value = None
    
    for attr in attributes:
        values = np.unique(data[attr])
        
        for value in values:
            left_subset = data[data[attr] <= value]
            right_subset = data[data[attr] > value]
            
            if len(left_subset) == 0 or len(right_subset) == 0:
                continue
            
            left_gini = gini_impurity(left_subset[target_attribute])
            right_gini = gini_impurity(right_subset[target_attribute])
            weighted_gini = (len(left_subset) / len(data)) * left_gini + (len(right_subset) / len(data)) * right_gini
            
            if weighted_gini < best_gini:
                best_gini = weighted_gini
                best_attr = attr
                best_split_value = value
                
    return best_attr, best_split_value

def build_tree(data, attributes, target_attribute, depth=0):
    if len(np.unique(data[target_attribute])) == 1:
        return data[target_attribute].iloc[0]
    
    if len(attributes) == 0 or depth == 3:
        return data[target_attribute].mode()[0]  # Corrected this line

    best_attr, best_split_value = best_split(data, attributes, target_attribute)
    
    if best_attr is None:
        return data[target_attribute].mode()[0]
    
    tree = {best_attr: {}}
    
    left_subset = data[data[best_attr] <= best_split_value]
    right_subset = data[data[best_attr] > best_split_value]
    
    remaining_attrs = [attr for attr in attributes if attr != best_attr]
    
    tree[best_attr]['<='] = build_tree(left_subset, remaining_attrs, target_attribute, depth + 1)
    tree[best_attr]['>'] = build_tree(right_subset, remaining_attrs, target_attribute, depth + 1)
    
    return tree

def classify(tree, sample):
    if not isinstance(tree, dict):
        return tree
    
    attribute = next(iter(tree))
    attribute_value = sample[attribute]
    
    if attribute_value <= tree[attribute]['<=']:
        return classify(tree[attribute]['<='], sample)
    else:
        return classify(tree[attribute]['>'], sample)

attributes = ['Outlook', 'Temperature', 'Humidity', 'Wind']
target_attribute = 'Decision'
decision_tree = build_tree(data, attributes, target_attribute)

print("Decision Tree:")
print(decision_tree)

new_sample = {'Outlook': 'Sunny', 'Temperature': 85, 'Humidity': 80, 'Wind': 'Weak'}
classification = classify(decision_tree, new_sample)

print(f"Classification for {new_sample}: {classification}")


Decision Tree:
{'Outlook': {'<=': 'Yes', '>': {'Temperature': {'<=': {'Wind': {'<=': 'No', '>': 'Yes'}}, '>': 'No'}}}}
Classification for {'Outlook': 'Sunny', 'Temperature': 85, 'Humidity': 80, 'Wind': 'Weak'}: Yes


In [20]:
import pandas as pd
import numpy as np

# Create dataset
data = pd.DataFrame({
    'Income': ['Low', 'Low', 'Medium', 'Medium', 'High', 'High'],
    'Credit': ['Good', 'Bad', 'Good', 'Bad', 'Good', 'Bad'],
    'Loan Approved': ['Yes', 'No', 'Yes', 'Yes', 'Yes', 'No']
})

def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = -sum((count / sum(counts)) * np.log2(count / sum(counts)) for count in counts)
    return entropy

def gain_ratio(data, split_attribute, target_attribute):
    total_entropy = entropy(data[target_attribute])
    values, counts = np.unique(data[split_attribute], return_counts=True)
    weighted_entropy = sum((counts[i] / np.sum(counts)) * entropy(data[data[split_attribute] == values[i]][target_attribute]) for i in range(len(values)))
    split_info = -sum((counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(values)))
    if split_info == 0:
        return 0
    gain = total_entropy - weighted_entropy
    return gain / split_info

def best_attribute(data, attributes, target_attribute):
    best_gain = -1
    best_attr = None
    for attr in attributes:
        gain = gain_ratio(data, attr, target_attribute)
        if gain > best_gain:
            best_gain = gain
            best_attr = attr
    return best_attr

def build_tree(data, attributes, target_attribute):
    if len(np.unique(data[target_attribute])) == 1:
        return data[target_attribute].iloc[0]
    if len(attributes) == 0:
        return data[target_attribute.mode()[0]]
    
    best_attr = best_attribute(data, attributes, target_attribute)
    tree = {best_attr: {}}
    unique_values = np.unique(data[best_attr])
    
    for value in unique_values:
        subset = data[data[best_attr] == value]
        remaining_attrs = [attr for attr in attributes if attr != best_attr]
        tree[best_attr][value] = build_tree(subset, remaining_attrs, target_attribute)
        
    return tree

def classify(tree, sample):
    if not isinstance(tree, dict):
        return tree
    attribute = next(iter(tree))
    attribute_value = sample[attribute]
    return classify(tree[attribute][attribute_value], sample)

attributes = ['Income', 'Credit']
target_attribute = 'Loan Approved'
decision_tree_c45 = build_tree(data, attributes, target_attribute)

print("C4.5 Decision Tree:")
print(decision_tree_c45)

new_sample = {'Income': 'Medium', 'Credit': 'Bad'}
classification_c45 = classify(decision_tree_c45, new_sample)

print(f"C4.5 Classification for {new_sample}: {classification_c45}")


C4.5 Decision Tree:
{'Credit': {'Bad': {'Income': {'High': 'No', 'Low': 'No', 'Medium': 'Yes'}}, 'Good': 'Yes'}}
C4.5 Classification for {'Income': 'Medium', 'Credit': 'Bad'}: Yes


In [21]:
def gini_impurity(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    impurity = 1 - sum((count / sum(counts)) ** 2 for count in counts)
    return impurity

def best_split(data, attributes, target_attribute):
    best_gini = float('inf')
    best_attr = None
    best_split_value = None
    
    for attr in attributes:
        values = np.unique(data[attr])
        
        for value in values:
            left_subset = data[data[attr] <= value]
            right_subset = data[data[attr] > value]
            
            if len(left_subset) == 0 or len(right_subset) == 0:
                continue
            
            left_gini = gini_impurity(left_subset[target_attribute])
            right_gini = gini_impurity(right_subset[target_attribute])
            weighted_gini = (len(left_subset) / len(data)) * left_gini + (len(right_subset) / len(data)) * right_gini
            
            if weighted_gini < best_gini:
                best_gini = weighted_gini
                best_attr = attr
                best_split_value = value
                
    return best_attr, best_split_value

def build_tree_cart(data, attributes, target_attribute):
    if len(np.unique(data[target_attribute])) == 1:
        return data[target_attribute].iloc[0]
    
    if len(attributes) == 0:
        return data[target_attribute].mode()[0]
    
    best_attr, best_split_value = best_split(data, attributes, target_attribute)
    
    if best_attr is None:
        return data[target_attribute].mode()[0]
    
    tree = {best_attr: {}}
    
    left_subset = data[data[best_attr] <= best_split_value]
    right_subset = data[data[best_attr] > best_split_value]
    
    remaining_attrs = [attr for attr in attributes if attr != best_attr]
    
    tree[best_attr]['<='] = build_tree_cart(left_subset, remaining_attrs, target_attribute)
    tree[best_attr]['>'] = build_tree_cart(right_subset, remaining_attrs, target_attribute)
    
    return tree

decision_tree_cart = build_tree_cart(data, attributes, target_attribute)

print("CART Decision Tree:")
print(decision_tree_cart)

classification_cart = classify(decision_tree_cart, new_sample)

print(f"CART Classification for {new_sample}: {classification_cart}")


CART Decision Tree:
{'Credit': {'<=': {'Income': {'<=': 'No', '>': 'Yes'}}, '>': 'Yes'}}


KeyError: 'Bad'

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Prepare data
X = data[['Income', 'Credit']]
y = data['Loan Approved']

# Convert categorical variables to numerical
X = pd.get_dummies(X, drop_first=True)

# Create and train the C4.5 Decision Tree
c45_model = DecisionTreeClassifier(criterion='entropy')
c45_model.fit(X, y)

# Predictions using C4.5
c45_prediction = c45_model.predict(pd.get_dummies(pd.DataFrame(new_sample, index=[0]), drop_first=True))

print(f"C4.5 Prediction for {new_sample}: {c45_prediction[0]}")

# Create and train the CART Decision Tree
cart_model = DecisionTreeClassifier(criterion='gini')
cart_model.fit(X, y)

# Predictions using CART
cart_prediction = cart_model.predict(pd.get_dummies(pd.DataFrame(new_sample, index=[0]), drop_first=True))

print(f"CART Prediction for {new_sample}: {cart_prediction[0]}")


KeyError: "None of [Index(['Income', 'Credit'], dtype='object')] are in the [columns]"