In [25]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

class TreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, *, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None

class DecisionTreeClassifierCustom:
    def __init__(self, max_depth=15, min_samples_split=10, max_thresholds=10):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_thresholds = max_thresholds
        self.root = None

    def fit(self, X, y):
        print("[INFO] Training Decision Tree...")
        self.root = self._build_tree(X, y)
        print("[INFO] Training completed!")

    def _build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))

        if depth >= self.max_depth or num_labels == 1 or num_samples < self.min_samples_split:
            return TreeNode(value=self._most_common_label(y))

        best_feat, best_thresh = self._best_split(X, y)

        if best_feat is None:
            return TreeNode(value=self._most_common_label(y))

        left_indices = X[:, best_feat] <= best_thresh
        right_indices = X[:, best_feat] > best_thresh
        left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        return TreeNode(feature_index=best_feat, threshold=best_thresh, left=left, right=right)

    def _best_split(self, X, y):
        best_gain = -1
        best_index, best_thresh = None, None

        for feature_index in range(X.shape[1]):
            X_column = X[:, feature_index]
            unique_values = np.unique(X_column)

            if len(unique_values) > self.max_thresholds:
                thresholds = np.linspace(min(unique_values), max(unique_values), self.max_thresholds)
            else:
                thresholds = unique_values

            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_index = feature_index
                    best_thresh = threshold

        return best_index, best_thresh

    def _information_gain(self, y, feature_column, threshold):
        parent_entropy = self._entropy(y)
        left_mask = feature_column <= threshold
        right_mask = feature_column > threshold

        if len(y[left_mask]) == 0 or len(y[right_mask]) == 0:
            return 0

        n = len(y)
        n_l, n_r = len(y[left_mask]), len(y[right_mask])
        e_l = self._entropy(y[left_mask])
        e_r = self._entropy(y[right_mask])
        weighted_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        return parent_entropy - weighted_entropy

    def _entropy(self, y):
        counts = np.bincount(y)
        probabilities = counts / len(y)
        return -np.sum([p * np.log2(p) for p in probabilities if p > 0])

    def _most_common_label(self, y):
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

# --------------------------
# Load, Train, Evaluate & Save
# --------------------------
def main():
    print("[INFO] Loading dataset...")
    df = pd.read_csv("train.csv", encoding="ISO-8859-1")
    df = df[df["sentiment"].isin(["positive", "negative"])]
    df["label"] = df["sentiment"].map({"positive": 1, "negative": 0})
    df = df[["text", "label"]].dropna()

    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(df["text"]).toarray()
    y = df["label"].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = DecisionTreeClassifierCustom(max_depth=15, min_samples_split=10)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = np.mean(y_test == y_pred)
    print(f"\nAccuracy on Test Set: {accuracy:.4f}")

    test_texts = [
        "I absolutely loved the product, it was amazing!",
        "I am sad and disappointed.",
        "Not sure what I feel about this.",
        "Worst thing ever!",
        "Such a beautiful day!"
    ]
    test_vectors = vectorizer.transform(test_texts).toarray()
    test_preds = model.predict(test_vectors)

    print("\nCustom Input Predictions:")
    for text, pred in zip(test_texts, test_preds):
        sentiment = "positive" if pred == 1 else "negative"
        print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")

    if model.root is not None:
        with open("custom_sentiment_tree_model.pkl", "wb") as f:
            pickle.dump(model, f)
        with open("custom_tfidf_vectorizer.pkl", "wb") as f:
            pickle.dump(vectorizer, f)
        print("[INFO] Model and vectorizer saved successfully!")
    else:
        print("[ERROR] Model training failed. Root node is None. Pickle not saved.")

if __name__ == "__main__":
    main()


[INFO] Loading dataset...
[INFO] Training Decision Tree...
[INFO] Training completed!

Accuracy on Test Set: 0.7363

Custom Input Predictions:
Text: I absolutely loved the product, it was amazing!
Predicted Sentiment: positive

Text: I am sad and disappointed.
Predicted Sentiment: negative

Text: Not sure what I feel about this.
Predicted Sentiment: negative

Text: Worst thing ever!
Predicted Sentiment: negative

Text: Such a beautiful day!
Predicted Sentiment: negative

[INFO] Model and vectorizer saved successfully!


In [26]:
import pickle
import numpy as np

# ----------------------------
# Step 1: Load model and vectorizer
# ----------------------------

try:
    with open("custom_sentiment_tree_model.pkl", "rb") as f:
        model = pickle.load(f)

    with open("custom_tfidf_vectorizer.pkl", "rb") as f:
        vectorizer = pickle.load(f)

    print("[INFO] Model and vectorizer loaded successfully!")
except Exception as e:
    print(f"[ERROR] Loading failed: {e}")
    exit()

# ----------------------------
# Step 2: Get custom input and predict
# ----------------------------

while True:
    custom_input = input("Enter a sentence for sentiment analysis (or type 'exit'): ")
    if custom_input.lower() == 'exit':
        break

    X_custom = vectorizer.transform([custom_input]).toarray()
    prediction = model.predict(X_custom)[0]
    sentiment = "Positive" if prediction == 1 else "Negative"
    print(f"Predicted Sentiment: {sentiment}\n")


[INFO] Model and vectorizer loaded successfully!
Enter a sentence for sentiment analysis (or type 'exit'): vaibhav is happy
Predicted Sentiment: Positive

Enter a sentence for sentiment analysis (or type 'exit'): vaibhav is sad
Predicted Sentiment: Negative

Enter a sentence for sentiment analysis (or type 'exit'): he is fucking happy
Predicted Sentiment: Positive

Enter a sentence for sentiment analysis (or type 'exit'): he is fuckin sad
Predicted Sentiment: Negative

Enter a sentence for sentiment analysis (or type 'exit'): exit
