In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load data from CSV file
df = pd.read_csv("diabetes.csv")
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=42)

# Train the DecisionTreeClassifier
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of DecisionTreeClassifier: {accuracy * 100:.2f}%")

# Example usage for prediction
new_data = np.array([[6, 148, 72, 35, 0, 33.6, 0.627, 50]])
prediction = clf.predict(new_data)
print(f"Prediction: {'Diabetic' if prediction[0] == 1 else 'Non-Diabetic'}")


Accuracy of DecisionTreeClassifier: 74.68%
Prediction: Diabetic




In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Entropy calculation
def entropy(y):
    unique, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

# Information gain calculation
def information_gain(X, y, feature):
    entropy_parent = entropy(y)
    unique_values = np.unique(X[:, feature])
    weighted_entropy_children = 0
    for value in unique_values:
        child_indices = np.where(X[:, feature] == value)[0]
        child_entropy = entropy(y[child_indices])
        weighted_entropy_children += (len(child_indices) / len(y)) * child_entropy
    information_gain = entropy_parent - weighted_entropy_children
    return information_gain

# Find the best feature to split on
def find_best_split(X, y):
    best_information_gain = -1
    best_feature = None
    for feature in range(X.shape[1]):
        current_information_gain = information_gain(X, y, feature)
        if current_information_gain > best_information_gain:
            best_information_gain = current_information_gain
            best_feature = feature
    return best_feature

# Build the decision tree
def build_tree(X, y):
    if len(np.unique(y)) == 1:
        return {'leaf': True, 'class': y[0]}
    if X.shape[1] == 0:
        return {'leaf': True, 'class': np.bincount(y).argmax()}
    best_feature = find_best_split(X, y)
    unique_values = np.unique(X[:, best_feature])
    tree = {'feature': best_feature, 'children': {}}
    for value in unique_values:
        child_indices = np.where(X[:, best_feature] == value)[0]
        child_X = X[child_indices]
        child_y = y[child_indices]
        tree['children'][value] = build_tree(child_X, child_y)
    return tree

# Predict a single sample
# Predict a single sample
def predict_single(tree, sample):
    if 'leaf' in tree and tree['leaf']:
        return tree['class']
    else:
        feature = tree['feature']
        value = sample[feature]
        if value in tree['children']:
            return predict_single(tree['children'][value], sample)
        else:
            # If value not found in training data, return majority class of current node
            child_classes = [node['class'] for node in tree['children'].values() if 'class' in node]
            return max(child_classes, key=child_classes.count)


# Predict multiple samples
def predict(tree, X):
    predictions = []
    for sample in X:
        predictions.append(predict_single(tree, sample))
    return predictions

# Calculate accuracy
def calculate_accuracy(tree, X_test, y_test):
    predictions = predict(tree, X_test)
    correct_predictions = np.sum(predictions == y_test)
    accuracy = correct_predictions / len(y_test)
    return accuracy

# Load data from CSV file
df = pd.read_csv("diabetes.csv")
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build decision tree
tree = build_tree(X_train, y_train)

# Calculate accuracy
accuracy = calculate_accuracy(tree, X_test, y_test)
print(f"Accuracy of custom Decision Tree: {accuracy * 100:.2f}%")

# Example usage
new_data = np.array([[6, 148, 72, 35, 0, 33.6, 0.627, 50]])
prediction = predict(tree, new_data)
print(f"Prediction: {'Diabetic' if prediction[0] == 1 else 'Non-Diabetic'}")

Accuracy of custom Decision Tree: 58.44%
Prediction: Diabetic


In [89]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [90]:
def entropy(y):
    unique,counts=np.unique(y,return_counts=True)
    prob=counts/len(y)
    entropy=-np.sum(prob*np.log2(prob))
    return entropy

In [91]:
def info(x,y,feature):
    ento_p=entropy(y)
    unique_vals=np.unique(x[:,feature])
    wt=0
    for value in unique_vals:
        child_indices=np.where(x[:,feature]==value)[0]
        ento_c=entropy(y[child_indices])
        wt+=(len(child_indices)/len(y))*ento_c
    i_g=ento_p-wt
    return i_g

In [92]:
def best_split(x,y):
    best_ig=-1
    best_feature =None
    for feature in range(x.shape[1]):
        current_ig=info(x,y,feature)
        if current_ig>best_ig:
            best_ig=current_ig
            best_feature=feature
    return best_feature

In [93]:
def btree(x,y):
    if len(np.unique(y))==1:
        return {'leaf':True ,'class':y[0]}
    if x.shape[1]==0:
        return {'leaf' : True,'class':np.bincount(y).argmax()}
    b_feature=best_split(x,y)
    unique_values=np.unique(x[:,b_feature])
    tree={'feature':b_feature,'children':{}}
    for value in unique_values:
        indice=np.where(x[:,b_feature]==value)[0]
        x_child=x[indice]
        y_child=y[indice]
        tree['children'][value]=btree(x_child,y_child)
    return tree

In [94]:
def pre(tree, sample):
    if 'leaf' in tree and tree['leaf']:
        return tree['class']
    else:
        feature = tree['feature']
        value = sample[feature]
        if value in tree['children']:
            return pre(tree['children'][value], sample)
        else:
            # If value not found in training data, return majority class of current node
            child_classes = [node['class'] for node in tree['children'].values() if 'class' in node]
            return max(child_classes, key=child_classes.count)

In [95]:
def predict(tree,x):
    predictions=[]
    for sample in x:
        predictions.append(pre(tree,sample))
    return predictions

In [96]:
def accuracy(tree,x_test,y_test):
    predictions=predict(tree,x_test)
    corr=np.sum(predictions==y_test)
    accuracy=corr/len(y_test)
    print(accuracy)

In [97]:
data=pd.read_csv('diabetes.csv')
x=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

In [98]:
# Split dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Build decision tree
tree = btree(x_train, y_train)

# Calculate accuracy
accuracy = accuracy(tree, x_test, y_test)
print(f"Accuracy of custom Decision Tree: {accuracy:.2f}%")

# Example usage
a= np.array([[6, 148, 72, 35, 0, 33.6, 0.627, 50]])
prediction = predict(tree, a)
print(f"Prediction: {'Diabetic' if prediction[0] == 1 else 'Non-Diabetic'}")

0.5844155844155844


TypeError: unsupported format string passed to NoneType.__format__