## **Financial Synthetic Data Generator**
**problem statement :** Building a Financial Synthetic Data Generation that Generate Banking Data of Users based on the Data We have, using the pattern of the data.
##### **Project Methodology**

- This Project using the Open Source Data of Company Products information with their DEMOG and BUREAU data.
- Using Python, that load data and then pre-processed and saved in CSV File.
- Loading that same CSV file to insert into Vector DB using Embedding Model from Hugging Face.
- Building RAG QA Chain using Langchain and building the RAG architecture using Zypher 7B LLM (Open Source).
- Checking the Response Generated Data is good to use.

In [None]:
!pip install faker langchain langchain_community  chromadb

In [3]:
from faker import Faker
import random
from datetime import date
from dateutil.relativedelta import relativedelta
import json
from langchain.vectorstores import Chroma

## Build the Data for Your Customers


In [10]:
faker = Faker()   # initialize Faker for generating synthetic data
#define date ranges for inquiries
six_months =  date.today()   - relativedelta(months=6)   # Calculate the date 6 months ago
three_months =  date.today()   - relativedelta(months=3)   # Calculate the date 3 months ago

# Generate Synthetic Data function

def generate_synthetic_data(num_records) :
  synthetic_data = []
  for _ in  range(num_records)  :
      age = random.randint(20,70)
      gender =  random.choice(["male" , "female"])
      marital_status = random.choice(["single" , "married" , "divorced" ,  "Widowed"])
      income_level =  random.choice(["Low" , "Medium" , "High"])
      education =  random.choice(["High School" , "Bachelor" , "Master" , "PhD"])
      occupation =  faker.job()
      residential_status =  random.choice(["Own House" , "Rents" , "Living With Parants"])
      dependents =  random.randint(0,5)
      debt_to_income =  round(random.uniform(0.1, 0.5), 2)
      credit_bureau  =  random.randint(760 , 850)
      last_3months_inquiries = generate_inquiries(three_months)
      last_6months_inquiries = generate_inquiries(six_months)
      # Construct the record
      record = {
            'Customer ID': str(faker.uuid4()),
            'Age': age,
            'Gender': gender,
            'Marital Status': marital_status,
            'Income Level': income_level,
            'Education': education,
            'Occupation': occupation,
            'Residential Status': residential_status,
            'Dependents': dependents,
            'Debt-to-Income': debt_to_income,
            'Credit Bureau': credit_bureau,
            'last_3months_personal_loan_inq': any(i['product_name'] == 'Personal Loan' for i in last_3months_inquiries),
            'last_3months_credit_card_inq': any(i['product_name'] == 'Credit Card' for i in last_3months_inquiries),
            'last_3months_mortgage_inq': any(i['product_name'] == 'Mortgage' for i in last_3months_inquiries),
            'last_6months_personal_loan_inq': any(i['product_name'] == 'Personal Loan' for i in last_6months_inquiries),
            'last_6months_credit_card_inq': any(i['product_name'] == 'Credit Card' for i in last_6months_inquiries),
            'last_6months_mortgage_inq': any(i['product_name'] == 'Mortgage' for i in last_6months_inquiries)
        }
      synthetic_data.append(record)
  return synthetic_data

# Function to generate bureau product inquiries
def generate_inquiries(last_months):
    inquiries = []
    today = faker.date_this_month()
    for _ in range(random.randint(1, 5)):
        inquiry_date = faker.date_between(start_date=last_months, end_date=today)
        product_type = random.choice(['Personal Loan', 'Credit Card', 'Mortgage'])
        inquiries.append({'product_name': product_type, 'date': inquiry_date})
    return inquiries if inquiries else []

# Generate synthetic data
synthetic_data = generate_synthetic_data(50)
synthetic_data[0]

{'Customer ID': '9269bcdc-b92c-48fe-9496-7872794e94b4',
 'Age': 39,
 'Gender': 'female',
 'Marital Status': 'married',
 'Income Level': 'Medium',
 'Education': 'Master',
 'Occupation': 'Passenger transport manager',
 'Residential Status': 'Own House',
 'Dependents': 3,
 'Debt-to-Income': 0.15,
 'Credit Bureau': 775,
 'last_3months_personal_loan_inq': False,
 'last_3months_credit_card_inq': True,
 'last_3months_mortgage_inq': False,
 'last_6months_personal_loan_inq': True,
 'last_6months_credit_card_inq': False,
 'last_6months_mortgage_inq': False}

In [11]:
import pandas as pd
data_df = pd.DataFrame(synthetic_data)
data_df['content'] = [f"Based on the following customer data: {data}, suggest suitable banking lending products." for data in synthetic_data]
data_df.head()


Unnamed: 0,Customer ID,Age,Gender,Marital Status,Income Level,Education,Occupation,Residential Status,Dependents,Debt-to-Income,Credit Bureau,last_3months_personal_loan_inq,last_3months_credit_card_inq,last_3months_mortgage_inq,last_6months_personal_loan_inq,last_6months_credit_card_inq,last_6months_mortgage_inq,content
0,9269bcdc-b92c-48fe-9496-7872794e94b4,39,female,married,Medium,Master,Passenger transport manager,Own House,3,0.15,775,False,True,False,True,False,False,Based on the following customer data: {'Custom...
1,edafd988-288d-4ea8-a4cc-2841881c3e38,49,male,married,High,PhD,Chief Operating Officer,Own House,0,0.15,768,False,False,True,True,True,True,Based on the following customer data: {'Custom...
2,7589e5f4-4788-4782-8c02-e2e207e47458,26,female,married,Medium,Bachelor,Music therapist,Living With Parants,3,0.46,825,False,True,True,False,True,True,Based on the following customer data: {'Custom...
3,ff6ab69e-ed8e-4909-9b1b-9f50e94571fd,50,female,single,High,High School,Civil Service fast streamer,Rents,1,0.38,813,True,True,False,True,True,False,Based on the following customer data: {'Custom...
4,34c3ab3c-f574-4a81-95aa-313c7f04840c,34,male,married,Medium,High School,Building surveyor,Own House,0,0.11,807,False,False,True,True,False,True,Based on the following customer data: {'Custom...


In [12]:
from langchain.docstore.document import Document
documents = []

for  _ , row in data_df.iterrows() :
  documents.append(Document(page_content=row['content'],metadata={ "class" :row["Age"]}))

# **Load Hugging Face Embeddings**

In [None]:
!pip install sentence_transformers

In [15]:
from langchain_community.embeddings import HuggingFaceEmbeddings
hg_embeddings = HuggingFaceEmbeddings()

  hg_embeddings = HuggingFaceEmbeddings()
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Load the Model

In [None]:
!pip install google-generativeai

In [33]:
import google.generativeai as genai
from google.colab import userdata
api = userdata.get('gemini_api')
genai.configure(api_key=api)
llm = genai.GenerativeModel("gemini-1.5-flash")

In [35]:
llm.generate_content("hello").text

'Hello! How can I help you today? \n'

## Store the Data in VectorDB (ChromaDB)


In [19]:
from langchain.vectorstores import Chroma

persist_directory = '/content/'
import chromadb
langchain_chroma = Chroma.from_documents(
    documents=documents,
    collection_name="recommendation_engine",
    embedding=hg_embeddings,
    persist_directory=persist_directory
)

In [21]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub
from IPython.display import display, Markdown
import os
import warnings
warnings.filterwarnings('ignore')

In [22]:
template = """
Based on the following customer data, that I Provide, suggest one suitable banking lending products."
Customer Information: {question}
Context: {context}
Answer:
"""
PROMPT = PromptTemplate(input_variables=["context", "query"], template=template)


In [37]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('hugginface_key')

In [38]:
llm = HuggingFaceHub(repo_id="HuggingFaceH4/zephyr-7b-beta", model_kwargs={"temperature":0.5, "max_length":512})

In [39]:
retriever = langchain_chroma.as_retriever(search_kwargs={"k": 1})

qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=retriever, chain_type_kwargs={"prompt": PROMPT}
)


In [None]:
# Convert data to prompt format for training RAG model
prompt_data = [
    {
        'prompt': f"Generate synthetic data for a customer with the following attributes: {json.dumps(record)}",
        'metadata': record
    }
    for record in synthetic_data
]


# Example prompt for generating synthetic data
prompt_text = "Generate synthetic data for a customer with the following attributes: {'Age': 35, 'Gender': 'Male', 'Marital Status': 'Married', 'Income Level': 'High', 'Education': 'College', 'Occupation': 'Engineer', 'Residential Status': 'Owns house', 'Dependents': 2, 'Debt-to-Income': 0.3, 'Credit Bureau': 'Experian', 'last_3months_personal_loan_inq': False, 'last_3months_credit_card_inq': True, 'last_3months_mortgage_inq': False, 'last_6months_personal_loan_inq': True, 'last_6months_credit_card_inq': False, 'last_6months_mortgage_inq': True}"
synthetic_result = generate_synthetic_data_with_rag(prompt_text)
print("Generated Synthetic Data:")
print(synthetic_result)