In [2]:
import os
print(os.getcwd())

C:\Users\Arya Sahane\OneDrive\Desktop\CD


In [3]:
import pandas as pd

# Data to be written to the CSV
data = [
    ["sunny", "hot", "high", "weak", "no"],
    ["sunny", "hot", "high", "strong", "no"],
    ["overcast", "hot", "high", "weak", "yes"],
    ["rain", "mild", "high", "weak", "yes"],
    ["rain", "cool", "normal", "weak", "yes"],
    ["rain", "cool", "normal", "strong", "no"],
    ["overcast", "cool", "normal", "strong", "yes"],
    ["sunny", "mild", "high", "weak", "no"],
    ["sunny", "cool", "normal", "weak", "yes"],
    ["rain", "mild", "normal", "weak", "yes"],
    ["sunny", "mild", "normal", "strong", "yes"],
    ["overcast", "mild", "high", "strong", "yes"],
    ["overcast", "hot", "normal", "weak", "yes"],
    ["rain", "mild", "high", "strong", "no"]
]

# Define the column names
columns = ["Outlook", "Temperature", "Humidity", "Wind", "PlayTennis"]

# Create a DataFrame
df = pd.DataFrame(data, columns=columns)

# Save the DataFrame to a CSV file
df.to_csv("dataset.csv", index=False)

print("CSV file 'dataset.csv' created successfully.")

CSV file 'dataset.csv' created successfully.


In [4]:
import pandas as pd

def unique_values(rows, col):
    return set([row[col] for row in rows])

def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

def split_dataset(rows, col, value):
    left, right = [], []
    for row in rows:
        if row[col] == value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def gini(rows):
    counts = class_counts(rows)
    impurity = 1
    for label in counts:
        prob_of_label = counts[label] / float(len(rows))
        impurity -= prob_of_label**2
    return impurity

def information_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

def find_best_split(rows):
    best_gain = 0
    best_split = None
    current_uncertainty = gini(rows)
    num_features = len(rows[0]) - 1
    for col in range(num_features):
        values = unique_values(rows, col)
        for val in values:
            left, right = split_dataset(rows, col, val)
            if len(left) == 0 or len(right) == 0:
                continue
            gain = information_gain(left, right, current_uncertainty)
            if gain > best_gain:
                best_gain, best_split = gain, (col, val)
    return best_gain, best_split

class DecisionNode:
    def __init__(self, feature, value, true_branch, false_branch):
        self.feature = feature
        self.value = value
        self.true_branch = true_branch
        self.false_branch = false_branch

def build_tree(rows):
    gain, split = find_best_split(rows)
    if gain == 0:
        return DecisionNode(None, rows[0][-1], None, None)
    col, val = split
    left, right = split_dataset(rows, col, val)
    true_branch = build_tree(left)
    false_branch = build_tree(right)
    return DecisionNode((col, val), None, true_branch, false_branch)

def print_tree(node, indent=""):
    if node is None:
        return
    if node.feature is not None:
        print(indent + "|__ " + header[node.feature[0]] + " = " + str(node.feature[1]))
        print_tree(node.true_branch, indent + "| ")
        print_tree(node.false_branch, indent + "| ")
    else:
        print(indent + "|__ Predict: " + node.value)

def main():
    df = pd.read_csv('dataset.csv')  # Load the dataset from CSV file
    data = df.values.tolist()
    global header
    header = df.columns.tolist()[:-1] # Exclude the last column (target) from header
    decision_tree = build_tree(data)
    print("Decision Tree:")
    print_tree(decision_tree)

if __name__ == "__main__":
    main()


Decision Tree:
|__ Outlook = overcast
| |__ Predict: yes
| |__ Humidity = normal
| | |__ Wind = strong
| | | |__ Outlook = sunny
| | | | |__ Predict: yes
| | | | |__ Predict: no
| | | |__ Predict: yes
| | |__ Outlook = sunny
| | | |__ Predict: no
| | | |__ Wind = strong
| | | | |__ Predict: no
| | | | |__ Predict: yes


In [6]:
import pandas as pd

def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    return counts

def gini(rows):
    counts = class_counts(rows)
    impurity = 1.0
    total = len(rows)
    for label in counts:
        prob_of_label = counts[label] / total
        impurity -= prob_of_label**2
    return impurity

def split_dataset(rows, col, value):
    left = [row for row in rows if row[col] == value]
    right = [row for row in rows if row[col] != value]
    return left, right

def information_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

def find_best_split(rows):
    best_gain = 0
    best_split = None
    current_uncertainty = gini(rows)
    num_features = len(rows[0]) - 1  # Exclude target column
    for col in range(num_features):
        values = set(row[col] for row in rows)
        for val in values:
            left, right = split_dataset(rows, col, val)
            if not left or not right:
                continue
            gain = information_gain(left, right, current_uncertainty)
            if gain > best_gain:
                best_gain, best_split = gain, (col, val)
    return best_split

class DecisionNode:
    def __init__(self, feature, true_branch, false_branch, prediction=None):
        self.feature = feature
        self.true_branch = true_branch
        self.false_branch = false_branch
        self.prediction = prediction

def build_tree(rows):
    split = find_best_split(rows)
    if not split:
        return DecisionNode(None, None, None, rows[0][-1])
    left, right = split_dataset(rows, split[0], split[1])
    return DecisionNode(split, build_tree(left), build_tree(right))

def print_tree(node, indent=""):
    if node.prediction:
        print(indent + "Predict:", node.prediction)
    else:
        col, val = node.feature
        print(indent + f"|__ {header[col]} = {val}")
        print_tree(node.true_branch, indent + "| ")
        print_tree(node.false_branch, indent + "| ")

def main():
    df = pd.read_csv('dataset.csv')
    global header
    header = df.columns.tolist()[:-1]
    tree = build_tree(df.values.tolist())
    print_tree(tree)

if __name__ == "__main__":
    main()


|__ Outlook = overcast
| Predict: yes
| |__ Humidity = normal
| | |__ Wind = strong
| | | |__ Outlook = sunny
| | | | Predict: yes
| | | | Predict: no
| | | Predict: yes
| | |__ Outlook = sunny
| | | Predict: no
| | | |__ Wind = strong
| | | | Predict: no
| | | | Predict: yes


In [9]:
import pandas as pd

# Function to get unique values for a column in the dataset
def unique_values(rows, col):
    return set([row[col] for row in rows])

# Function to count the occurrences of each class in the dataset
def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[-1]  # The target value is assumed to be the last column
        counts[label] = counts.get(label, 0) + 1
    return counts

# Function to split the dataset into two groups based on a feature and its value
def split_dataset(rows, col, value):
    left, right = [], []
    for row in rows:
        if row[col] == value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Function to calculate the Gini Impurity for a set of rows
def gini(rows):
    counts = class_counts(rows)
    impurity = 1.0
    for label in counts:
        prob_of_label = counts[label] / float(len(rows))
        impurity -= prob_of_label ** 2
    return impurity

# Function to calculate the information gain from a split
def information_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

# Function to find the best feature and value to split on
def find_best_split(rows):
    best_gain = 0
    best_split = None
    current_uncertainty = gini(rows)
    num_features = len(rows[0]) - 1  # Number of features, excluding the target
    for col in range(num_features):
        values = unique_values(rows, col)  # Get unique values for this column
        for val in values:
            left, right = split_dataset(rows, col, val)
            if not left or not right:
                continue  # Skip if there's no effective split
            gain = information_gain(left, right, current_uncertainty)
            if gain > best_gain:
                best_gain, best_split = gain, (col, val)
    return best_split

# Class to represent a decision node or leaf node in the tree
class DecisionNode:
    def __init__(self, feature, true_branch, false_branch, prediction=None):
        self.feature = feature
        self.true_branch = true_branch
        self.false_branch = false_branch
        self.prediction = prediction

# Recursive function to build the decision tree
def build_tree(rows):
    split = find_best_split(rows)
    if not split:
        return DecisionNode(None, None, None, rows[0][-1])  # Leaf node
    col, val = split
    left, right = split_dataset(rows, col, val)
    true_branch = build_tree(left)
    false_branch = build_tree(right)
    return DecisionNode(split, true_branch, false_branch)

# Function to print the decision tree
def print_tree(node, indent=""):
    if node.prediction:
        print(indent + "Predict:", node.prediction)
    else:
        col, val = node.feature
        print(indent + f"|__ {header[col]} = {val}")
        print_tree(node.true_branch, indent + "|   ")
        print_tree(node.false_branch, indent + "|   ")

# Main function to load the dataset and build/print the tree
def main():
    df = pd.read_csv('dataset.csv')  # Load the dataset from CSV file
    data = df.values.tolist()  # Convert the DataFrame to a list of lists
    global header
    header = df.columns.tolist()[:-1]  # Column headers, excluding the target
    decision_tree = build_tree(data)  # Build the tree
    print("Decision Tree:")
    print_tree(decision_tree)  # Print the tree

if __name__ == "__main__":
    main()


Decision Tree:
|__ Outlook = overcast
|   Predict: yes
|   |__ Humidity = normal
|   |   |__ Wind = strong
|   |   |   |__ Outlook = sunny
|   |   |   |   Predict: yes
|   |   |   |   Predict: no
|   |   |   Predict: yes
|   |   |__ Outlook = sunny
|   |   |   Predict: no
|   |   |   |__ Wind = strong
|   |   |   |   Predict: no
|   |   |   |   Predict: yes


In [10]:
import pandas as pd

def unique_values(rows, col):
    return set(row[col] for row in rows)

def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[-1]  # The label is assumed to be the last column
        counts[label] = counts.get(label, 0) + 1
    return counts

def gini(rows):
    counts = class_counts(rows)
    impurity = 1.0
    for label in counts:
        prob_of_label = counts[label] / float(len(rows))
        impurity -= prob_of_label ** 2
    return impurity

def split_dataset(rows, col, value):
    left, right = [], []
    for row in rows:
        if row[col] == value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def information_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

def find_best_split(rows):
    best_gain = 0
    best_split = None
    current_uncertainty = gini(rows)
    num_features = len(rows[0]) - 1  # Exclude the label (target)
    for col in range(num_features):  # Loop through columns (features)
        values = unique_values(rows, col)
        for val in values:
            left, right = split_dataset(rows, col, val)
            if not left or not right:
                continue
            gain = information_gain(left, right, current_uncertainty)
            if gain > best_gain:
                best_gain, best_split = gain, (col, val)
    return best_split

class DecisionNode:
    def __init__(self, feature, true_branch, false_branch, prediction=None):
        self.feature = feature
        self.true_branch = true_branch
        self.false_branch = false_branch
        self.prediction = prediction

def build_tree(rows):
    split = find_best_split(rows)
    if not split:
        return DecisionNode(None, None, None, rows[0][-1])
    col, val = split
    left, right = split_dataset(rows, col, val)
    true_branch = build_tree(left)
    false_branch = build_tree(right)
    return DecisionNode(split, true_branch, false_branch)

def print_tree(node, indent=""):
    if node.prediction:
        print(indent + "Predict:", node.prediction)
    else:
        col, val = node.feature
        print(indent + f"|__ {header[col]} = {val}")
        print_tree(node.true_branch, indent + "|   ")
        print_tree(node.false_branch, indent + "|   ")

def main():
    df = pd.read_csv('dataset.csv')  # Load the dataset from CSV
    data = df.values.tolist()  # Convert to list of lists
    global header
    header = df.columns.tolist()[:-1]  # Exclude label/target
    tree = build_tree(data)
    print_tree(tree)

if __name__ == "__main__":
    main()


|__ Outlook = overcast
|   Predict: yes
|   |__ Humidity = normal
|   |   |__ Wind = strong
|   |   |   |__ Outlook = sunny
|   |   |   |   Predict: yes
|   |   |   |   Predict: no
|   |   |   Predict: yes
|   |   |__ Outlook = sunny
|   |   |   Predict: no
|   |   |   |__ Wind = strong
|   |   |   |   Predict: no
|   |   |   |   Predict: yes
