In [1]:
import pandas as pd
import numpy as np

In [2]:
# Simulation size: one row per client, plus anonymous filler feature columns.
num_clients = 200000
num_features = 35

# Upper bound applied to the simulated 6-month balance (the lower bound is 0).
BAL_AFTER_6MON_MAX = 11000000


def simulate_bal_after_6mon(columns, noise):
    """Return the simulated balance six months out, clipped to a sane range.

    The result is a fixed linear combination of the driver columns plus
    ``noise``, clipped to ``[0, BAL_AFTER_6MON_MAX]``.

    Parameters
    ----------
    columns : mapping of str -> array-like
        Must provide "bal_now", "inflow", "outflow", "credit_score",
        "annual_income", "mtg_balance", "credit_card_balance",
        "splc_balance" and "days_since_last_log", all of the same length.
    noise : array-like or scalar
        Additive noise term, broadcastable against the columns.

    Returns
    -------
    The clipped projection, one value per client.
    """
    projected = (
        columns["bal_now"]
        + 0.1 * columns["inflow"]  # Balance may increase with higher inflows
        - 0.05 * columns["outflow"]  # Decrease with higher outflows
        + 0.3 * columns["credit_score"]  # Credit score could impact balance growth
        + 0.02 * columns["annual_income"]  # Higher income may correlate with higher balances
        - 0.2 * columns["mtg_balance"]  # Higher mortgage balance may decrease balance growth
        - 0.1 * columns["credit_card_balance"]  # Higher credit card balance might reduce growth
        + 0.05 * columns["splc_balance"]  # Secured PLC balance slightly increases future balance
        # Longer inactivity slightly DEcreases the balance, so this term is
        # subtracted (it was previously added, contradicting its own comment).
        - 0.01 * columns["days_since_last_log"]
        + noise
    )
    return np.clip(projected, 0, BAL_AFTER_6MON_MAX)


# Generate data for each column.
# Note: np.random.randint's upper bound is exclusive, hence the "+1" style limits.
np.random.seed(42)  # For reproducibility
data = {
    "bal_now": np.random.randint(0, 10000001, num_clients),  # Current balance: 0-10M
    "age": np.random.randint(18, 91, num_clients),  # Age range: 18-90
    "tenure": np.random.randint(0, 51, num_clients),  # Tenure range: 0-50 years
    "credit_score": np.random.randint(400, 851, num_clients),  # Credit score: 400-850
    "annual_income": np.random.randint(20000, 300001, num_clients),  # Income: $20k-$300k
    "inflow": np.random.randint(10, 20001, num_clients),  # Monthly inflow (income or deposits)
    "outflow": np.random.randint(10, 15001, num_clients),  # Monthly outflow (expenses)
    "mtg_balance": np.random.randint(0, 1000001, num_clients),  # Mortgage balance
    "credit_card_balance": np.random.randint(0, 50001, num_clients),  # Credit card balance
    "loan_balance": np.random.randint(0, 500001, num_clients),  # Other loan balance
    "mobile_banking_ind": np.random.choice([0, 1], num_clients),  # Mobile banking indicator
    "premium_ind": np.random.choice([0, 1], num_clients),  # Premium user indicator
    # 0/1 indicator despite the "_id" suffix (1 if a new customer); the key is
    # kept as-is so the exported column name does not change.
    "newcomer_id": np.random.choice([0, 1], num_clients),
    "days_since_last_log": np.random.randint(0, 366, num_clients),  # Days since last login: 0-365
    "splc_balance": np.random.randint(0, 1000001, num_clients),  # Secured plc
}

# Append `num_features` anonymous filler columns, fea1 .. fea{num_features},
# after the 15 named columns above.
for i in range(1, num_features + 1):
    data[f"fea{i}"] = np.random.randint(0, 100001, num_clients)

# The noise is drawn only after every feature column so that the sequence of
# RNG draws (and therefore every generated column) stays the same as before.
balance_noise = np.random.normal(0, 1000, num_clients)

# Target: balance after 6 months, tied to the drivers for a realistic simulation.
data["bal_after_6mon"] = simulate_bal_after_6mon(data, balance_noise)

# Create the DataFrame
df = pd.DataFrame(data)

# Display a preview of the data
df.head()



Unnamed: 0,bal_now,age,tenure,credit_score,annual_income,inflow,outflow,mtg_balance,credit_card_balance,loan_balance,...,fea27,fea28,fea29,fea30,fea31,fea32,fea33,fea34,fea35,bal_after_6mon
0,6423388,27,24,837,191218,19428,1685,758311,15470,68750,...,48203,40848,17735,92914,7025,38486,43083,14451,68160,6320044.0
1,6550634,22,43,585,239710,5914,7998,813964,7092,458988,...,13815,4393,90459,8199,21066,29089,46831,4332,24124,6418258.0
2,4304572,47,25,616,127094,9698,8166,825144,33306,168257,...,22362,12895,48617,64084,20536,61102,10177,88745,53813,4173464.0
3,2234489,65,2,497,79674,17023,2248,74580,4987,64865,...,18514,79026,51272,6730,71238,74501,57498,45620,55350,2262496.0
4,9958614,38,34,623,132976,810,4576,7523,1726,252167,...,55122,98179,49028,55157,51188,52524,75274,18425,83985,9996288.0


In [3]:
# Persist the simulated client table as CSV, without writing the row index.
# NOTE(review): the target name has no ".csv" extension — confirm that
# downstream readers expect exactly 'df_v2' before renaming it.
output_path = 'df_v2'
df.to_csv(output_path, index=False)