In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Load the dataset
df = pd.read_csv("supervised_transactions.csv")

# Encode category as a numerical value (using label encoding)
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# Map 'green' to 0 and 'red' to 1 for the status (binary classification)
df['status_encoded'] = df['status'].map({'green': 0, 'red': 1})

# Features (X) and Labels (y)
X = df[['category_encoded', 'amount']]
y = df['status_encoded']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data preprocessing completed.")


Data preprocessing completed.


In [22]:
# Initialize logistic regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Predict the results on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 58.77%


In [23]:
# Calculate the threshold for each category
thresholds = {}

# For each category
for category_id in df['category_encoded'].unique():
    # Get all transactions for this category
    category_data = df[df['category_encoded'] == category_id]
    
    # Get the logistic regression coefficients
    coef = model.coef_[0]  # Coefficients for the features
    
    # The decision boundary is where the probability is 0.5 (logistic regression's threshold)
    # We solve for amount: coef[0] * category_encoded + coef[1] * amount = 0
    # Since category_encoded is constant for each category, we isolate amount
    amount_threshold = -(coef[0] * category_id) / coef[1]
    
    # Store the threshold for this category
    thresholds[label_encoder.inverse_transform([category_id])[0]] = amount_threshold

# Display the thresholds for each category
print("Thresholds for each category:")
for category, threshold in thresholds.items():
    print(f"Category: {category}, Threshold: {threshold:.2f}")


Thresholds for each category:
Category: Education, Threshold: -0.00
Category: Food, Threshold: -0.00
Category: Health, Threshold: -0.00
Category: Miscellaneous, Threshold: -0.00
Category: Utilities, Threshold: -0.00
Category: Entertainment, Threshold: -0.00
Category: Groceries, Threshold: -0.00
Category: Transportation, Threshold: -0.00
Category: Charity, Threshold: -0.00
Category: Rent, Threshold: -0.00
