In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df1=pd.read_csv("../data/finance/data.csv")
df2=pd.read_csv("../data/finance/personal_finance_tracker_dataset.csv")
df3=pd.read_csv("../data/finance/synthetic_personal_finance_dataset.csv")

for i, df in enumerate([df1,df2,df3], start=1):
    print(f"\nDataset {i}: shape={df.shape}")
    print(df.head())


Dataset 1: shape=(20000, 27)
          Income  Age  Dependents     Occupation City_Tier          Rent  \
0   44637.249636   49           0  Self_Employed    Tier_1  13391.174891   
1   26858.596592   34           2        Retired    Tier_2   5371.719318   
2   50367.605084   35           1        Student    Tier_3   7555.140763   
3  101455.600247   21           0  Self_Employed    Tier_3  15218.340037   
4   24875.283548   52           4   Professional    Tier_2   4975.056710   

   Loan_Repayment    Insurance     Groceries    Transport  ...  \
0        0.000000  2206.490129   6658.768341  2636.970696  ...   
1        0.000000   869.522617   2818.444460  1543.018778  ...   
2     4612.103386  2201.800050   6313.222081  3221.396403  ...   
3     6809.441427  4889.418087  14690.149363  7106.130005  ...   
4     3112.609398   635.907170   3034.329665  1276.155163  ...   

   Desired_Savings  Disposable_Income  Potential_Savings_Groceries  \
0      6200.537192       11265.627707         

### Dataset 1

In [3]:
df1 = pd.read_csv("../data/finance/data.csv")
print("Dataset 1: shape=", df1.shape)
print(df1.head())

Dataset 1: shape= (20000, 27)
          Income  Age  Dependents     Occupation City_Tier          Rent  \
0   44637.249636   49           0  Self_Employed    Tier_1  13391.174891   
1   26858.596592   34           2        Retired    Tier_2   5371.719318   
2   50367.605084   35           1        Student    Tier_3   7555.140763   
3  101455.600247   21           0  Self_Employed    Tier_3  15218.340037   
4   24875.283548   52           4   Professional    Tier_2   4975.056710   

   Loan_Repayment    Insurance     Groceries    Transport  ...  \
0        0.000000  2206.490129   6658.768341  2636.970696  ...   
1        0.000000   869.522617   2818.444460  1543.018778  ...   
2     4612.103386  2201.800050   6313.222081  3221.396403  ...   
3     6809.441427  4889.418087  14690.149363  7106.130005  ...   
4     3112.609398   635.907170   3034.329665  1276.155163  ...   

   Desired_Savings  Disposable_Income  Potential_Savings_Groceries  \
0      6200.537192       11265.627707         

In [4]:
X = df1.drop(columns=["Disposable_Income", "Potential_Savings_Groceries",
                      "Potential_Savings_Transport", "Potential_Savings_Eating_Out",
                      "Potential_Savings_Entertainment", "Potential_Savings_Utilities",
                      "Potential_Savings_Healthcare", "Potential_Savings_Education",
                      "Potential_Savings_Miscellaneous"])  # keep it focused

y = df1["Disposable_Income"]

categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("Categorical cols:", categorical_cols)
print("Numerical cols:", numerical_cols)

Categorical cols: ['Occupation', 'City_Tier']
Numerical cols: ['Income', 'Age', 'Dependents', 'Rent', 'Loan_Repayment', 'Insurance', 'Groceries', 'Transport', 'Eating_Out', 'Entertainment', 'Utilities', 'Healthcare', 'Education', 'Miscellaneous', 'Desired_Savings_Percentage', 'Desired_Savings']


In [5]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=200, random_state=42))
])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [7]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Dataset 1 - MAE:", mae)
print("Dataset 1 - MSE:", mse)
print("Dataset 1 - R2:", r2)

Dataset 1 - MAE: 919.9650395580932
Dataset 1 - MSE: 13498473.471290914
Dataset 1 - R2: 0.9141958993239931


In [8]:
y_pred = pipeline.predict(X_test)

print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

Mean Absolute Error: 919.9650395580932
Mean Squared Error: 13498473.471290914
R2 Score: 0.9141958993239931


In [9]:
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring="r2")
print("CV scores:", cv_scores)
print("Mean CV R2:", cv_scores.mean())

CV scores: [0.9401887  0.96428048 0.97513246 0.90402862 0.95983284]
Mean CV R2: 0.9486926208415897


In [10]:
import os
os.makedirs("../models", exist_ok=True)

joblib.dump(pipeline, "../models/finance_disposable_income_model.pkl")
print("Saved pipeline (preprocessor + model) in one file!")

Saved pipeline (preprocessor + model) in one file!


### Dataset 2

In [11]:
df2 = pd.read_csv("../data/finance/personal_finance_tracker_dataset.csv")

print("Shape:", df2.shape)
print(df2.head())

Shape: (3000, 25)
         date  user_id  monthly_income  monthly_expense_total  savings_rate  \
0  2019-01-01     1584         3119.58                3212.07          0.38   
1  2019-01-31     1045         3262.44                3732.81          0.10   
2  2019-03-02     1756         2931.20                3335.58          0.15   
3  2019-04-01     1724         3506.79                2327.59          0.17   
4  2019-05-01     1600         4606.87                2182.58          0.34   

   budget_goal financial_scenario  credit_score  debt_to_income_ratio  \
0      3676.11          inflation         721.0                  0.56   
1      2607.17          inflation         670.0                  0.42   
2      3004.14          inflation         691.0                  0.24   
3      3346.97             normal         717.0                  0.16   
4      2670.09          inflation         795.0                  0.25   

   loan_payment  ...  discretionary_spending  essential_spending  in

In [12]:
if "date" in df2.columns:
    df2 = df2.drop(columns=["date"])
if "user_id" in df2.columns:
    df2 = df2.drop(columns=["user_id"]) 

target = "savings_goal_met"
X = df2.drop(columns=[target])
y = df2[target]

X.columns = X.columns.str.strip()

In [13]:
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

Categorical columns: ['financial_scenario', 'income_type', 'category', 'cash_flow_status', 'financial_stress_level']
Numerical columns: ['monthly_income', 'monthly_expense_total', 'savings_rate', 'budget_goal', 'credit_score', 'debt_to_income_ratio', 'loan_payment', 'investment_amount', 'subscription_services', 'emergency_fund', 'transaction_count', 'fraud_flag', 'discretionary_spending', 'essential_spending', 'rent_or_mortgage', 'financial_advice_score', 'actual_savings']


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
from sklearn.ensemble import RandomForestClassifier

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42))
])

In [16]:
from sklearn.metrics import accuracy_score, classification_report

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.965

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       545
           1       0.93      0.67      0.78        55

    accuracy                           0.96       600
   macro avg       0.95      0.83      0.88       600
weighted avg       0.96      0.96      0.96       600



In [17]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring="accuracy")
print("CV scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

CV scores: [0.98       0.97       0.975      0.97666667 0.965     ]
Mean CV accuracy: 0.9733333333333333


In [18]:
import joblib
import os

os.makedirs("../models", exist_ok=True)

joblib.dump(pipeline, "../models/finance_personal_tracker_model.pkl")
print("Saved pipeline as ../models/finance_personal_tracker_model.pkl")

Saved pipeline as ../models/finance_personal_tracker_model.pkl


### Dataset 3

In [19]:
df3 = pd.read_csv("../data/finance/synthetic_personal_finance_dataset.csv")
print("Dataset 3 shape:", df3.shape)
df3.head()

Dataset 3 shape: (32424, 20)


Unnamed: 0,user_id,age,gender,education_level,employment_status,job_title,monthly_income_usd,monthly_expenses_usd,savings_usd,has_loan,loan_type,loan_amount_usd,loan_term_months,monthly_emi_usd,loan_interest_rate_pct,debt_to_income_ratio,credit_score,savings_to_income_ratio,region,record_date
0,U00001,56,Female,High School,Self-employed,Salesperson,3531.69,1182.59,367655.03,No,,0.0,0,0.0,0.0,0.0,430,8.68,Other,2024-01-09
1,U00002,19,Female,PhD,Employed,Salesperson,3531.73,2367.99,260869.1,Yes,Education,146323.34,36,4953.5,13.33,1.4,543,6.16,North America,2022-02-13
2,U00003,20,Female,Master,Employed,Teacher,2799.49,1003.91,230921.21,No,,0.0,0,0.0,0.0,0.0,754,6.87,Africa,2022-05-12
3,U00004,25,Male,PhD,Employed,Manager,5894.88,4440.12,304815.51,Yes,Business,93242.37,24,4926.57,23.93,0.84,461,4.31,Europe,2023-10-02
4,U00005,53,Female,PhD,Employed,Student,5128.93,4137.61,461509.48,No,,0.0,0,0.0,0.0,0.0,516,7.5,Africa,2021-08-07


In [20]:
target = "savings_usd"

df3 = df3.drop(columns=["user_id", "record_date"])

X = df3.drop(columns=[target])
y = df3[target]

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (32424, 17)
Target shape: (32424,)


In [21]:
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)

Categorical columns: ['gender', 'education_level', 'employment_status', 'job_title', 'has_loan', 'loan_type', 'region']
Numeric columns: ['age', 'monthly_income_usd', 'monthly_expenses_usd', 'loan_amount_usd', 'loan_term_months', 'monthly_emi_usd', 'loan_interest_rate_pct', 'debt_to_income_ratio', 'credit_score', 'savings_to_income_ratio']


In [22]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ]
)

In [23]:
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=200, random_state=42))
])

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [25]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)

Mean Absolute Error: 1096.2090461835032
Mean Squared Error: 10030809.292471793
R2 Score: 0.9997281781983703


In [26]:
import joblib
import os

os.makedirs("../models", exist_ok=True)

joblib.dump(pipeline, "../models/finance_loan_model.pkl")
print("Saved pipeline as ../models/finance_loan_model.pkl")

Saved pipeline as ../models/finance_loan_model.pkl
