# Smart Financial Forecasting System

## 1. Project Goal
## To build an end-to-end AI/ML system that analyzes historical financial data and accurately forecasts future income, expenses, and cash flow to   support better financial decision-making.



## 2. Load Dataset

In [1]:
import pandas as pd
import numpy as np

df = pd.read_excel(
    r"D:\1.Projects\Python\SmartFinancialForecastingSystem\Data\India_Household_Spending_raw_file.xlsx"
)

#df.head
#df.shape
#df.info()
#df.describe(include="all")





## 3. Data Cleaning

In [2]:
# Separate columns
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(exclude=np.number).columns

# Fill missing numerical values (MEDIAN)
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill missing categorical values (MODE)
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Check duplicates
df.duplicated().sum()

# Convert categorical columns to category dtype
df[cat_cols] = df[cat_cols].astype("category")

# Expense columns 
expense_cols = [
    'Rent', 'Loan_Repayment', 'Insurance', 'Groceries', 'Transport',
    'Eating_Out', 'Entertainment', 'Utilities', 'Healthcare',
    'Education', 'Miscellaneous'
]
for col in expense_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.clip(df[col], lower, upper)
# Total monthly expense
df["Total_Expense"] = df[expense_cols].sum(axis=1)

# Actual savings
df["Actual_Savings"] = df["Income"] - df["Total_Expense"]

# Savings ratio
df["Savings_Ratio"] = df["Actual_Savings"] / df["Income"]


## 4. Check Data Validity

In [3]:
# Negative checks
(df["Actual_Savings"] < 0).sum()

# Expense > Income
(df["Total_Expense"] > df["Income"]).sum()

df.to_csv(
    r"D:\1.Projects\Python\SmartFinancialForecastingSystem\Data\processed.csv",
    index=False
)




## 5. Feature Engineering

In [4]:
df = pd.read_csv(r"D:\1.Projects\Python\SmartFinancialForecastingSystem\Data\processed.csv")
print("Dataset Loaded:", df.shape)

# STEP 1: Define Expense Columns
expense_cols = [
    'Rent', 'Loan_Repayment', 'Insurance', 'Groceries', 'Transport',
    'Eating_Out', 'Entertainment', 'Utilities', 'Healthcare',
    'Education', 'Miscellaneous'
]

# STEP 2: Core Financial Features

# Total Expense
df["Total_Expense"] = df[expense_cols].sum(axis=1)

# Actual Savings
df["Actual_Savings"] = df["Income"] - df["Total_Expense"]

# Actual Savings Percentage
df["Actual_Savings_Percentage"] = (df["Actual_Savings"] / df["Income"]) * 100

# Savings Gap
df["Savings_Gap"] = df["Desired_Savings"] - df["Actual_Savings"]

# STEP 3: Expense-to-Income Ratios

for col in expense_cols:
    df[f"{col}_Ratio"] = df[col] / df["Income"]

# STEP 4: Lifestyle & Behavior Features

essential = [
    'Rent', 'Loan_Repayment', 'Groceries',
    'Utilities', 'Healthcare', 'Education'
]

non_essential = [
    'Eating_Out', 'Entertainment', 'Miscellaneous'
]

df["Essential_Expense"] = df[essential].sum(axis=1)
df["Non_Essential_Expense"] = df[non_essential].sum(axis=1)

# Lifestyle Score
df["Lifestyle_Score"] = df["Non_Essential_Expense"] / df["Total_Expense"]

# STEP 5: Financial Risk Indicators

# Expense Burden
df["Expense_Burden"] = df["Total_Expense"] / df["Income"]

# High Risk Flag
df["High_Risk_Flag"] = np.where(
    (df["Expense_Burden"] > 0.85) | (df["Actual_Savings"] < 0),
    1, 0
)

# STEP 6: Potential Savings Utilization

potential_cols = [col for col in df.columns if "Potential_Savings" in col]

df["Total_Potential_Savings"] = df[potential_cols].sum(axis=1)

df["Savings_Efficiency"] = df["Actual_Savings"] / (
    df["Actual_Savings"] + df["Total_Potential_Savings"]
)

# STEP 7: Age & Income Bucketing

df["Income_Group"] = pd.cut(
    df["Income"],
    bins=[0, 25000, 50000, 100000, np.inf],
    labels=["Low", "Middle", "Upper-Middle", "High"]
)

df["Age_Group"] = pd.cut(
    df["Age"],
    bins=[18, 25, 35, 50, np.inf],
    labels=["Youth", "Young Adult", "Mid Age", "Senior"]
)

# STEP 8: Encode Categorical Features

df_encoded = pd.get_dummies(
    df,
    columns=["Occupation", "City_Tier", "Income_Group", "Age_Group"],
    drop_first=True
)

print("Feature Engineering Completed")
print("Final Dataset Shape:", df_encoded.shape)

# SAVE FEATURE ENGINEERED DATA

df_encoded.to_csv(
    r"D:\1.Projects\Python\SmartFinancialForecastingSystem\Data\feature_engineered_financial_data.csv",
    index=False
)

print("Saved: feature_engineered_financial_data.csv")


Dataset Loaded: (20000, 30)
Feature Engineering Completed
Final Dataset Shape: (20000, 59)
Saved: feature_engineered_financial_data.csv


# 6. Model Selection




## I trained a Random Forest regression model to predict customer savings based on spending behavior, income ratios, and lifestyle indicators, achieving strong predictive performance and interpretable feature importance also I optimized my Random Forest model using hyperparameter tuning.


In [5]:

# STEP 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# STEP 2: Load Feature-Engineered Data

df = pd.read_csv(r"D:\1.Projects\Python\SmartFinancialForecastingSystem\Data\feature_engineered_financial_data.csv")
print("Dataset Shape:", df.shape)

# STEP 3: Define Target and Features

target = "Actual_Savings"

# Remove target and non-useful columns from features
X = df.drop(columns=[target])
y = df[target]
 
# STEP 4: Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training Samples:", X_train.shape[0])
print("Testing Samples:", X_test.shape[0])

# STEP 5: Initialize Random Forest Model

rf_model = RandomForestRegressor(
    n_estimators=200,        # number of trees
    max_depth=None,         # let trees grow fully
    random_state=42,
    n_jobs=-1               # use all CPU cores
)


# STEP 6: Train the Model

rf_model.fit(X_train, y_train)
print("Random Forest Model Trained")

# STEP 7: Make Predictions

y_pred = rf_model.predict(X_test)

# STEP 8: Evaluate Model Performance

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("R² Score:", r2)



# STEP 9: Feature Importance
#

feature_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

print(feature_importance.head(10))


Dataset Shape: (20000, 59)
Training Samples: 16000
Testing Samples: 4000
Random Forest Model Trained
Mean Absolute Error (MAE): 216.85382857385122
R² Score: 0.9490667760805755
                      Feature  Importance
0                      Income    0.329912
33            Transport_Ratio    0.147739
32            Groceries_Ratio    0.138270
28                Savings_Gap    0.116600
37           Healthcare_Ratio    0.082249
16          Disposable_Income    0.039706
43             Expense_Burden    0.023571
36            Utilities_Ratio    0.018894
27  Actual_Savings_Percentage    0.017611
26              Savings_Ratio    0.014733


# HyperParamter Tuning

In [None]:
# STEP 1: Imports
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# STEP 2: Load Data

df = pd.read_csv(r"D:\1.Projects\Python\SmartFinancialForecastingSystem\Data\feature_engineered_financial_data.csv")

target = "Actual_Savings"
X = df.drop(columns=[target])
y = df[target]


# STEP 3: Train-Test Split


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# STEP 4: Define Parameter Grid

param_grid = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}



# STEP 5: Randomized Search

rf = RandomForestRegressor(random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    scoring="neg_mean_absolute_error",
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

# STEP 6: Best Model

best_rf = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# STEP 7: Evaluate Tuned Model

y_pred = best_rf.predict(X_test)

mae_tuned = mean_absolute_error(y_test, y_pred)
r2_tuned = r2_score(y_test, y_pred)

print("Tuned MAE:", mae_tuned)
print("Tuned R²:", r2_tuned)
print("Hello")