In [16]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

# --------------------------
# Custom Decision Tree
# --------------------------
class TreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, *, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None

class DecisionTreeClassifierCustom:
    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        print("[INFO] Training Decision Tree...")
        self.root = self._build_tree(X, y)
        print("[INFO] Training completed!")

    def _build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))

        if depth >= self.max_depth or num_labels == 1 or num_samples < self.min_samples_split:
            return TreeNode(value=self._most_common_label(y))

        best_feat, best_thresh = self._best_split(X, y, num_features)

        if best_feat is None:
            return TreeNode(value=self._most_common_label(y))

        left_indices = X[:, best_feat] <= best_thresh
        right_indices = X[:, best_feat] > best_thresh
        left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        return TreeNode(feature_index=best_feat, threshold=best_thresh, left=left, right=right)

    def _best_split(self, X, y, num_features):
        best_gain = -1
        split_index, split_thresh = None, None

        for feature_index in range(num_features):
            X_column = X[:, feature_index]
            thresholds = np.unique(X_column)

            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_index = feature_index
                    split_thresh = threshold

        return split_index, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        parent_entropy = self._entropy(y)
        left_indices = X_column <= split_thresh
        right_indices = X_column > split_thresh

        if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
            return 0

        n = len(y)
        n_l, n_r = len(y[left_indices]), len(y[right_indices])
        e_l, e_r = self._entropy(y[left_indices]), self._entropy(y[right_indices])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        return parent_entropy - child_entropy

    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def print_tree(self, node=None, spacing=""):
        if node is None:
            node = self.root
        if node.is_leaf_node():
            print(spacing + f"Predict: {node.value}")
            return
        print(spacing + f"[X{node.feature_index} <= {node.threshold}]")
        print(spacing + "--> True:")
        self.print_tree(node.left, spacing + "  ")
        print(spacing + "--> False:")
        self.print_tree(node.right, spacing + "  ")

# --------------------------
# Load, Train, Evaluate & Save
# --------------------------
def main():
    print("[INFO] Loading dataset...")
    df = pd.read_csv("train.csv", encoding="ISO-8859-1")
    df = df[df["sentiment"].isin(["positive", "negative"])]
    df["label"] = df["sentiment"].map({"positive": 1, "negative": 0})
    df = df[["text", "label"]].dropna()

    # TF-IDF vectorizer (using scikit-learn)
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(df["text"]).toarray()  # Convert to dense array
    y = df["label"].values

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the Decision Tree
    model = DecisionTreeClassifierCustom(max_depth=10)
    model.fit(X_train, y_train)

    # Predictions and accuracy
    y_pred = model.predict(X_test)
    acc = np.mean(y_test == y_pred)
    print(f"\nAccuracy on Test Set: {acc:.4f}")

    print("\nTop Levels of Decision Tree:")
    model.print_tree()

    # Save model to pickle
    with open("custom_sentiment_tree_model.pkl", "wb") as f:
        pickle.dump(model, f)
    print("\n[INFO] Model saved to 'custom_sentiment_tree_model.pkl'.")

if __name__ == "__main__":
    main()


[INFO] Loading dataset...
[INFO] Training Decision Tree...
[INFO] Training completed!

Accuracy on Test Set: 0.7171

Top Levels of Decision Tree:
[X375 <= 0.0]
--> True:
  [X511 <= 0.13073671162182754]
  --> True:
    [X348 <= 0.1406260890825209]
    --> True:
      [X823 <= 0.0]
      --> True:
        [X356 <= 0.18050647268898878]
        --> True:
          [X702 <= 0.0]
          --> True:
            [X592 <= 0.0]
            --> True:
              [X63 <= 0.0]
              --> True:
                [X822 <= 0.0]
                --> True:
                  [X408 <= 0.0]
                  --> True:
                    Predict: 0
                  --> False:
                    Predict: 1
                --> False:
                  [X603 <= 0.14162404997833528]
                  --> True:
                    Predict: 1
                  --> False:
                    Predict: 1
              --> False:
                [X729 <= 0.0]
                --> True:
                  [X55

In [18]:
    # Save model to pickle
    with open("custom_sentiment_tree_model.pkl", "wb") as f:
        pickle.dump(model, f)
    print("\n[INFO] Model saved to 'custom_sentiment_tree_model.pkl'.")

    # --------------------------
    # Test on Custom Input
    # --------------------------
    test_texts = [
        "I absolutely loved the product, it was amazing!",
        "This is the worst thing I've ever bought.",
        "I don't know what to feel about this."
    ]

    test_vectors = vectorizer.transform(test_texts)

    test_preds = model.predict(test_vectors)

    print("\nCustom Input Predictions:")
    for text, pred in zip(test_texts, test_preds):
        sentiment = "positive" if pred == 1 else "negative"
        print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")



[INFO] Model saved to 'custom_sentiment_tree_model.pkl'.

Custom Input Predictions:
Text: I absolutely loved the product, it was amazing!
Predicted Sentiment: positive

Text: This is the worst thing I've ever bought.
Predicted Sentiment: positive

Text: I don't know what to feel about this.
Predicted Sentiment: positive

