In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

#Load Data 
df = pd.read_csv('preprocessed_dataset.csv')

# Calculate Tax Burden (Deductions as a % of Income)
df['tax_burden'] = df['total_deductions'] / (df['total_income'] + 1)

# Define Health Score (Inverse of stress: High Stress = Low Health)
df['financial_health_score'] = 100 - df['stress_score']

# 2. Select User-Requested Features
features = ['savings_ratio', 'debt_to_income_ratio_x', 'expense_ratio', 'tax_burden']
X = df[features]
y = df['financial_health_score']

# 3. Split and Train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 4. Predictions and Accuracy Estimate
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error: {mae:.2f} points")
print(f"R-squared (Accuracy): {r2:.2%}")

Mean Absolute Error: 3.84 points
R-squared (Accuracy): 84.40%


In [11]:
#Gradient boosting model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Initialize and train the model
gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
gbr.fit(X_train, y_train)

# Predict and Evaluate
preds = gbr.predict(X_test)
print(f"Gradient Boosting MAE: {mean_absolute_error(y_test, preds):.2f}")
print(f"Gradient Boosting R2 Score: {r2_score(y_test, preds):.2%}")

Gradient Boosting MAE: 3.91
Gradient Boosting R2 Score: 84.08%


In [12]:
#Neural network model
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
# Scale features (Required for Neural Networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the model
mlp = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_train)

# Predict and Evaluate
preds = mlp.predict(X_test_scaled)
print(f"Neural Network MAE: {mean_absolute_error(y_test, preds):.2f}")
print(f"Neural Network R2 Score: {r2_score(y_test, preds):.2%}")

Neural Network MAE: 4.23
Neural Network R2 Score: 81.08%




In [13]:
#Stacking Ensemble 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, r2_score

# Define base models
estimators = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('gbr', GradientBoostingRegressor(n_estimators=100, random_state=42))
]

# Initialize Stacking Regressor with a Meta-Learner
stacking_model = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
stacking_model.fit(X_train, y_train)

# Predict and Evaluate
preds = stacking_model.predict(X_test)
print(f"Stacking Ensemble MAE: {mean_absolute_error(y_test, preds):.2f}")
print(f"Stacking Ensemble R2 Score: {r2_score(y_test, preds):.2%}")

Stacking Ensemble MAE: 3.84
Stacking Ensemble R2 Score: 84.70%


In [29]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 1. Prepare Full Dataset (Excluding the target itself)
X = df.drop(columns=['stress_score', 'stress_level', 'financial_health_score'])
y = df['financial_health_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. High-Performance Gradient Boosting
# Using a lower learning rate and more estimators for maximum precision
model = GradientBoostingRegressor(
    n_estimators=1000, 
    learning_rate=0.01, 
    max_depth=4, 
    subsample=0.8, 
    random_state=42
)

model.fit(X_train, y_train)

# 3. Final Result
score = r2_score(y_test, model.predict(X_test))
print(f"Final Accuracy: {score:.2%}") # Result: ~99.47%
print(f"Mean Absolute Error (MAE): {mae:.4f}")

Final Accuracy: 99.44%
Mean Absolute Error (MAE): 3.8413
