In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math,copy

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
train_df=pd.read_csv("/content/drive/MyDrive/binary_classification_train.csv", header=0)

x_tr=train_df.iloc[0:28800,1:21]
y_tr=train_df.iloc[0:28800,21]

x_cv=train_df.iloc[28800:38400,1:21]
y_cv=train_df.iloc[28800:38400,21]

x_tst=train_df.iloc[38400:48001,1:21]
y_tst=train_df.iloc[38400:48001,21]

x_tr1=x_tr.values
y_tr1=y_tr.values

x_cv1=x_cv.values
y_cv1=y_cv.values

x_tst1=x_tst.values
y_tst1=y_tst.values

print(x_tr1.shape)
print(y_tr1.shape)
print(x_cv1.shape)
print(y_cv1.shape)
print(x_tst1.shape)
print(y_tst1.shape)

(28800, 20)
(28800,)
(9600, 20)
(9600,)
(9600, 20)
(9600,)


In [21]:
def zscore_normalize_features(X):
    """
    computes  X, zcore normalized by column

    Args:
      X (ndarray (m,n))     : input data, m examples, n features

    Returns:
      X_norm (ndarray (m,n)): input normalized by column
      mu (ndarray (n,))     : mean of each feature
      sigma (ndarray (n,))  : standard deviation of each feature
    """
    # find the mean of each column/feature
    mu     = np.mean(X, axis=0)                 # mu will have shape (n,)
    # find the standard deviation of each column/feature
    sigma  = np.std(X, axis=0)                  # sigma will have shape (n,)
    # element-wise, subtract mu for that column from each example, divide by std for that column
    X_norm = (X - mu) / sigma

    return (X_norm, mu, sigma)

x_tr_n, x_mu1, x_sig1 = zscore_normalize_features(x_tr1)
x_cv_n, x_mu2, x_sig2 = zscore_normalize_features(x_cv1)
x_tst_n, x_mu3, x_sig3 = zscore_normalize_features(x_tst1)

In [22]:
# Function to calculate Entropy
def entropy(y):
    """
    Calculate the entropy of a label array.
    """
    # Get the frequency of each class
    class_probs = np.bincount(y) / len(y)

    # Calculate entropy
    return -np.sum(class_probs * np.log2(class_probs + 1e-10))  # Adding small value to avoid log(0)

In [23]:
# Function to calculate the Information Gain from a split
def information_gain(X_column, y, threshold):
    """
    Calculate the Information Gain for a given feature and threshold.
    """
    # Left and right split based on threshold
    left_mask = X_column <= threshold
    right_mask = ~left_mask

    left_y = y[left_mask]
    right_y = y[right_mask]

    # Calculate entropy of the left and right splits
    left_entropy = entropy(left_y)
    right_entropy = entropy(right_y)

    # Weighted average of the entropies for the left and right splits
    left_weight = len(left_y) / len(y)
    right_weight = len(right_y) / len(y)

    return entropy(y) - (left_weight * left_entropy + right_weight * right_entropy)

In [24]:
# Function to find the best split (feature and threshold) based on information gain
def best_split(X, y):
    """
    Finds the best feature and threshold to split on.
    """
    m, n = X.shape
    best_feature = None
    best_threshold = None
    best_info_gain = -float('inf')

    # Iterate over all features
    for feature_index in range(n):
        feature_values = X[:, feature_index]

        # Get all unique values in the feature for thresholding
        possible_thresholds = np.unique(feature_values)

        # Iterate over possible thresholds
        for threshold in possible_thresholds:
            info_gain = information_gain(feature_values, y, threshold)

            # If this threshold gives better information gain, save it
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_feature = feature_index
                best_threshold = threshold

    return best_feature, best_threshold

In [25]:
# Recursive function to build the decision tree
def build_tree(X, y, depth=0, max_depth=None):
    """
    Recursively builds a decision tree.
    """
    # If only one class is left or max_depth is reached, return the leaf node
    if len(np.unique(y)) == 1:
        return {'label': y[0]}
    if max_depth is not None and depth >= max_depth:
        majority_class = np.bincount(y).argmax()
        return {'label': majority_class}

    # Find the best split
    feature, threshold = best_split(X, y)

    if feature is None:  # No valid split found (pure leaf)
        majority_class = np.bincount(y).argmax()
        return {'label': majority_class}

    # Split the data based on the best feature and threshold
    left_mask = X[:, feature] <= threshold
    right_mask = ~left_mask

    left_tree = build_tree(X[left_mask], y[left_mask], depth + 1, max_depth)
    right_tree = build_tree(X[right_mask], y[right_mask], depth + 1, max_depth)

    return {
        'feature': feature,
        'threshold': threshold,
        'left': left_tree,
        'right': right_tree
    }

In [26]:
# Function to make predictions on a single sample
def predict_tree(tree, X):
    """
    Make a prediction using the decision tree.
    """
    if 'label' in tree:
        return tree['label']

    feature_value = X[tree['feature']]
    if feature_value <= tree['threshold']:
        return predict_tree(tree['left'], X)
    else:
        return predict_tree(tree['right'], X)

In [27]:
# Function to predict on the entire dataset
def predict(model, X):
    """
    Predict labels for a dataset using the trained decision tree.
    """
    return np.array([predict_tree(model, x) for x in X])

In [28]:
# Function to compute accuracy
def accuracy(y_true, y_pred):
    """
    Calculate the accuracy of the model.
    """
    return np.sum(y_true == y_pred) / len(y_true)


In [29]:
# Train a decision tree classifier (with no max depth)
tree = build_tree(x_tr_n, y_tr1, max_depth=10)

In [30]:

# Make predictions
y_pred2 = predict(tree, x_cv_n)

# Calculate accuracy
acc = accuracy(y_cv1, y_pred2)
print(f"Accuracy of CV dataset: {acc * 100:.2f}%")

# Print out a few predictions
print("Predictions (first 10 samples):", y_pred2[:10])

Accuracy of CV dataset: 93.65%
Predictions (first 10 samples): [1 1 0 0 1 0 0 0 1 0]


In [31]:
# Make predictions
y_pred1 = predict(tree, x_tr_n)

# Calculate accuracy
acc = accuracy(y_tr1, y_pred1)
print(f"Accuracy of Train dataset: {acc * 100:.2f}%")

# Print out a few predictions
print("Predictions (first 10 samples):", y_pred1[:10])

Accuracy of Train dataset: 95.31%
Predictions (first 10 samples): [0 1 0 0 0 0 0 0 0 0]


In [32]:
# Make predictions
y_pred3 = predict(tree, x_tst_n)

# Calculate accuracy
acc = accuracy(y_tst1, y_pred3)
print(f"Accuracy of Test dataset: {acc * 100:.2f}%")

# Print out a few predictions
print("Predictions (first 10 samples):", y_pred3[:10])

Accuracy of Test dataset: 93.27%
Predictions (first 10 samples): [0 0 0 0 0 1 1 1 0 0]


In [33]:
#F1 Score Calc
def f_1_score(x,y):
  '''
  Calculate the F1 Score of the model
  '''
  a=0
  b=0
  for i in range(len(x)):
    if x[i]==1 and x[i]==y[i]:
      a+=1
    elif x[i]==1 and x[i]!=y[i]:
      b+=1
    else:
      pass
  pr=a/sum(x)
  re=a/sum(y)
  f_1=2*pr*re/(pr+re)
  return f_1



In [34]:
print("F1 Score on Train dataset: ",f_1_score(y_pred1,y_tr1))
print("F1 Score on CV dataset: ",f_1_score(y_pred2,y_cv1))
print("F1 Score on Test dataset: ",f_1_score(y_pred3,y_tst1))

F1 Score on Train dataset:  0.9243857390720323
F1 Score on CV dataset:  0.8954046639231824
F1 Score on Test dataset:  0.8890034364261167


In [38]:
test_df=pd.read_csv("/content/drive/MyDrive/binary_classification_test.csv", header=0)
x_test=test_df.iloc[:,1:21].values
print(x_test.shape)

x, x_mu4, x_sig4 = zscore_normalize_features(x_test)

(12000, 20)


In [39]:
# Make predictions in test set
y_pred4 = predict(tree, x)

print("Predictions (first 10 samples):", y_pred4[:10])

Predictions (first 10 samples): [0 0 0 0 0 0 1 0 0 0]


In [44]:
y_pred_df=pd.DataFrame(y_pred4)
type(y_pred_df)

In [45]:
y_pred_df.to_csv("Decision_tree_Binary_test.csv", index=False)

In [47]:
! cat Decision_tree_Binary_test.csv

0
0
0
0
0
0
0
1
0
0
0
0
1
0
1
0
0
1
0
0
0
1
1
1
0
0
1
0
1
0
0
1
0
0
0
1
0
0
1
0
0
0
0
1
1
1
0
0
0
0
0
0
0
0
1
1
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
1
0
0
0
1
1
0
0
1
0
1
0
0
0
0
1
0
1
1
0
1
1
1
1
0
0
1
0
0
1
0
0
0
0
0
1
1
1
1
0
0
1
0
0
0
0
0
0
0
0
1
0
1
1
1
0
1
0
0
1
1
1
0
0
0
1
0
0
0
0
0
1
0
0
0
0
1
1
0
0
1
1
0
1
0
0
1
0
0
1
1
0
1
0
0
1
1
1
1
1
0
0
0
0
0
0
1
0
0
0
1
1
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
0
1
1
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
1
1
1
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
0
1
1
0
0
0
0
1
1
0
0
1
1
0
0
0
1
0
0
0
0
1
0
1
0
1
0
1
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
1
1
0
0
0
1
1
0
1
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
1
0
1
0
0
0
0
0
0
1
0
1
0
0
0
1
0
0
1
0
0
1
0
0
0
0
1
0
0
1
0
0
0
0
0
1
0
0
0
0
0
1
1
0
1
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
1
0
0
1
0
1
1
0
0
1
0
0
0
0
0
0
0
0
1
1
0
1
0
0
1
0
0
0
0
0
0
0
0
1
1
1
0
0
0
0
0
1
0
1
1
0
1
0
0
1
0
1
1
0
1
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
