In [None]:
from lib import fabricate, analyze
import numpy as np
import pandas as pd
from scipy.linalg import block_diag

In [None]:
# occupation_categories = ('Salaried/Employed (Private/Public Sector)', 'Agriculturalists (Farmers)', 'Laborers/Skilled Workers', 'Students',  'Homemakers', 'Government Employees (Civil Services, Defense, Police)', 'Self-employed Professionals (Doctors, Lawyers, Chartered Accountants)', 'Retired/Pensioners', 'Business Owners/Entrepreneurs', 'Freelancers/Consultants', 'Unemployed/Jobseekers')
# occupation_stats = (12.6, 23, 11.4, 19.2, 14.7, 3.2, 3.4, 6, 2.6, 2, 1.9)
# education_categories = ('Primary or less', 'Secondary', 'Tertiary or more')
# education_stats = (69.4, 16.4, 14.2)
# vpn_stats = (8, 4, 0, 70)
# sim_card_changes_stats = (1.2, 0.8, 0, 60)
# trading_account_stats = (1.27, 0.9, 0, 10)
# recharge_frequency_stats = (1.426, 0.57, 0, 20)
# partener_categories = ('Never Married', 'Currently Married', 'Widowed', 'Divorced')
# partener_stats = (0.39, 0.475, 0.13, 0.005)
# dependents_stats = (2.25, 1.7, 0, 10)
# streaming_services_number_stats = (2.5, 1, 0, 10)
# streaming_services_delay_stats = (1.2, 1.4, 0, 6)
# family_defaulters_stats = (1.81, 1.62, 0, 10)
# smart_cards_stats = (2.6, 0.9, 0, 6)
# medical_bills_stats = (9000, 5000, 0, 50000)
# default_gst_filing_stats = (0.39,0.65,0,3)
# antivirus_subscriptions_stats = (0.81, 0.53, 0, 10)
# betting_categories = ('Yes', 'No')
# betting_stats = (20.69, 79.31)
# truecaller_categories = ('Red', 'Blue', 'Golden')
# truecaller_stats = (0.2, 0.75, 0.05)
# shopping_behavior_stats = (0.35, 0.7, 0, 25)
# utility_bills_stats = (9500, 6500, 2500, 35000)
# registered_vehicles_stats = (0.3, 0.5, 0, 10)
# registered_vehicles_challans_stats = (0.12, 0.5, 0, 100)
# certificates_stats = (0.4, 0.6, 0, 10)
# licenses_stats = (0.65, 0.72, 0, 7)
# insurances_stats = (0.75, 1.2, 0, 7)
# fd_dissolved_stats = (0.28, 0.65, 0, 10)
# international_transactions_stats = (0.45, 13.31, 0, 1000)
# public_memberships_stats = (0.035, 0.2, 0, 7)
# reviews_categories = ('1 Star', '2 Star', '3 Star', '4 Star', '5 Star')
# reviews_stats = (4, 6, 5, 45, 40)
# pet_bills_stats = (26000, 30000, 0, 250000)
# accounts_stats = (2.05, 1.5, 0, 8)
# social_sentiments_categories = ('Strongly Negative', 'Negative', 'Neutral', 'Positive', 'Strongly Positive')
# social_sentiments_stats = (20, 32, 32, 12, 4)
# upi_failures_stats = (25, 15, 0, 200)

In [None]:
# Build the base dataset through the project's `fabricate` helper.
# NOTE(review): in the cells visible here, `data` is only referenced by
# commented-out code further down - confirm whether this call is still needed.
data = fabricate.fabricate_base_data()

In [None]:

from numpy.random import default_rng

rng = default_rng(seed=42)  # fixed seed: the normal draws below repeat on every run

# Step 1: Feature specification.
# One row per numeric feature: (column name, mean, std, min, max, kind).
# Row order is significant - it has to follow the block order of the
# correlation matrix assembled in Step 3.
_feature_specs = [
    ("Recharge Frequency (per month)", 10, 5, 0, 30, "continuous"),
    ("Utility Bills (per month)", 2000, 1000, 200, 5000, "continuous"),
    ("International Transactions", 3, 2, 0, 10, "continuous"),
    ("UPI Failures", 2, 2, 0, 12, "continuous"),
    ("Registered Vehicles", 1, 1, 0, 5, "continuous"),
    ("Registered Vehicles Challan (per year)", 2, 1, 0, 10, "continuous"),
    ("Insurances", 2, 1, 0, 5, "continuous"),
    ("Trading Accounts", 2, 1, 0, 10, "continuous"),
    ("Licenses", 3, 2, 0, 10, "continuous"),
    ("FDs Dissolved", 1, 1, 0, 5, "continuous"),
    ("Accounts", 3, 2, 0, 10, "continuous"),
    ("Public Memberships", 4, 3, 0, 12, "continuous"),
    ("Default in GST filing (per quarter)", 1, 1, 0, 4, "continuous"),
    ("VPN Usage (hours per week)", 5, 3, 0, 20, "continuous"),
    ("SIM Card Changes", 1, 1, 0, 5, "continuous"),
    ("Streaming Services", 4, 2, 0, 10, "continuous"),
    ("Payment Delay of Streaming Services (days/year)", 15, 10, 0, 60, "continuous"),
    ("Smart Cards", 2, 2, 0, 8, "continuous"),
    ("Antivirus Subscriptions", 2, 1, 0, 5, "continuous"),
    ("Shopping Behavior (returns/month)", 3, 2, 0, 10, "continuous"),
    ("Medical Bills (per year)", 5000, 2000, 500, 15000, "continuous"),
    ("Certificates", 3, 2, 0, 10, "continuous"),
    ("Pet Bills (per year)", 2000, 1500, 200, 10000, "continuous"),
    ("Dependents", 2, 1, 0, 6, "continuous"),
]

# Both public lists are derived from the single table above so that the
# statistics and the column names cannot drift out of step.
stats_list = [spec[1:] for spec in _feature_specs]
numeric_columns = [spec[0] for spec in _feature_specs]

# Step 2: Target correlations, one symmetric matrix per feature group.
# Features in different groups get zero target correlation (see Step 3).

# Columns 0-3: payment & spending behaviour.
corr_payment_spending = np.array([
    [1.0, 0.45, 0.35, 0.30],
    [0.45, 1.0, 0.40, 0.33],
    [0.35, 0.40, 1.0, 0.38],
    [0.30, 0.33, 0.38, 1.0],
])

# Columns 4-6: asset ownership.
corr_asset_ownership = np.array([
    [1.0, 0.50, 0.40],
    [0.50, 1.0, 0.45],
    [0.40, 0.45, 1.0],
])

# Columns 7-10: financial accounts.
corr_fin_accounts = np.array([
    [1.0, 0.42, 0.30, 0.35],
    [0.42, 1.0, 0.38, 0.33],
    [0.30, 0.38, 1.0, 0.40],
    [0.35, 0.33, 0.40, 1.0],
])

# Columns 11-12: compliance & membership.
corr_compliance_membership = np.array([
    [1.0, 0.28],
    [0.28, 1.0],
])

# Columns 13-18: digital & subscription behaviour.
corr_digital = np.array([
    [1.0, 0.45, 0.35, 0.30, 0.25, 0.20],
    [0.45, 1.0, 0.32, 0.28, 0.26, 0.24],
    [0.35, 0.32, 1.0, 0.40, 0.30, 0.22],
    [0.30, 0.28, 0.40, 1.0, 0.33, 0.26],
    [0.25, 0.26, 0.30, 0.33, 1.0, 0.28],
    [0.20, 0.24, 0.22, 0.26, 0.28, 1.0],
])

# Columns 19-22: e-commerce & consumer patterns.
corr_ecommerce = np.array([
    [1.0, 0.38, 0.35, 0.30],
    [0.38, 1.0, 0.36, 0.32],
    [0.35, 0.36, 1.0, 0.33],
    [0.30, 0.32, 0.33, 1.0],
])

# Column 23: a single feature with no target correlation to any other.
corr_other = np.array([[1.0]])

# Step 3: Place the group matrices along the diagonal of one 24x24 matrix.
_correlation_blocks = (
    corr_payment_spending,
    corr_asset_ownership,
    corr_fin_accounts,
    corr_compliance_membership,
    corr_digital,
    corr_ecommerce,
    corr_other,
)
correlation_matrix = block_diag(*_correlation_blocks)

# Step 4: Draw independent standard normals and mix them with the Cholesky
# factor so that the columns carry the target correlation structure.
n = 10000
L = np.linalg.cholesky(correlation_matrix)
uncorrelated = rng.normal(size=(n, len(correlation_matrix)))
correlated = np.matmul(uncorrelated, L.T)


# Step 5: Map every column onto its requested mean/std and bounds.
def _rescale_and_clip(raw, mean, std, lower, upper):
    """Standardize `raw`, rescale it to `mean`/`std`, then clip to the bounds.

    Clipping is applied after rescaling, so a column whose bounds cut into
    the distribution ends up with a mean/std (and correlations) that differ
    somewhat from the requested targets.
    """
    z_scores = (raw - np.mean(raw)) / np.std(raw)
    return np.clip(z_scores * std + mean, lower, upper)


scaled_data = [
    _rescale_and_clip(correlated[:, idx], mean, std, lower, upper)
    for idx, (mean, std, lower, upper, _kind) in enumerate(stats_list)
]

df = pd.DataFrame(np.column_stack(scaled_data), columns=numeric_columns)

# Summary 1: per-column descriptive statistics (one row per feature).
stats = df[numeric_columns].describe().transpose()
print("\n📊 Descriptive Statistics:")
print(stats)

# Summary 2: correlation matrix actually realised by the generated data.
corr = df[numeric_columns].corr()
print("\n🔗 Correlation Matrix:")
print(corr.round(2))

# Step 6: Categorical columns, plus the stand-alone 'Family Defaulter' column.
# Each feature is described by a tuple of labels and a matching tuple of
# weights. Some weight tuples are percentages summing to 100, others are
# fractions summing to 1.
# NOTE(review): presumably `create_categorical_distribution` normalises the
# weights - confirm against the helper in `lib.fabricate`.
occupation_categories = (
    'Salaried/Employed (Private/Public Sector)',
    'Agriculturalists (Farmers)',
    'Laborers/Skilled Workers',
    'Students',
    'Homemakers',
    'Government Employees (Civil Services, Defense, Police)',
    'Self-employed Professionals (Doctors, Lawyers, Chartered Accountants)',
    'Retired/Pensioners',
    'Business Owners/Entrepreneurs',
    'Freelancers/Consultants',
    'Unemployed/Jobseekers',
)
occupation_stats = (12.6, 23, 11.4, 19.2, 14.7, 3.2, 3.4, 6, 2.6, 2, 1.9)

education_categories = ('Primary or less', 'Secondary', 'Tertiary or more')
education_stats = (69.4, 16.4, 14.2)

partner_categories = ('Never Married', 'Currently Married', 'Widowed', 'Divorced')
partner_stats = (0.39, 0.475, 0.13, 0.005)

truecaller_categories = ('Red', 'Blue', 'Golden')
truecaller_stats = (0.2, 0.75, 0.05)

reviews_categories = ('1 Star', '2 Star', '3 Star', '4 Star', '5 Star')
reviews_stats = (4, 6, 5, 45, 40)

social_sentiments_categories = (
    'Strongly Negative', 'Negative', 'Neutral', 'Positive', 'Strongly Positive',
)
social_sentiments_stats = (20, 32, 32, 12, 4)

betting_categories = ('Yes', 'No')
betting_stats = (0.1, 0.9)

# (mean, std, min, max, kind) - same layout as the numeric feature stats.
family_defaulters_stats = (1, 1, 0, 5, "continuous")

# Target column -> (labels, weights). Insertion order fixes the order in
# which the helper is called and the columns are appended to `df`.
_categorical_specs = {
    'Occupation': (occupation_categories, occupation_stats),
    'Education': (education_categories, education_stats),
    'Partner': (partner_categories, partner_stats),
    'Betting Apps': (betting_categories, betting_stats),
    'TrueCaller Flag': (truecaller_categories, truecaller_stats),
    'Reviews received': (reviews_categories, reviews_stats),
    'Sentiment on Social Media': (social_sentiments_categories, social_sentiments_stats),
}
for _column, (_labels, _weights) in _categorical_specs.items():
    df[_column] = fabricate.create_categorical_distribution(
        _labels, _weights, nan_probability=0.01
    )

# Unlike the categorical columns, this one is requested without missing values.
df['Family Defaulter'] = fabricate.create_truncated_norm_distribution(
    family_defaulters_stats, precision=0, nan_probability=0
)


In [None]:
# Assemble the final frame from the columns generated on `df`.
data2 = df[numeric_columns]  # correlated numeric features
categorical_part = df[['Occupation', 'Education', 'Partner',
                       'Betting Apps', 'TrueCaller Flag',
                       'Reviews received', 'Sentiment on Social Media']]
# 'Family Defaulter' is generated on `df` together with the categorical
# columns but was never carried into `final_df`, so it silently vanished
# before the EDA step. It is kept as its own part because it is numeric
# yet not one of the correlated `numeric_columns`.
family_part = df[['Family Defaulter']]

# Reset indexes so the parts align row-by-row before concatenating
data2 = data2.reset_index(drop=True)
categorical_part = categorical_part.reset_index(drop=True)
family_part = family_part.reset_index(drop=True)

# Combine numeric + categorical + family-defaulter columns side by side
final_df = pd.concat([data2, categorical_part, family_part], axis=1)

In [None]:
# Bare expression with no assignment - presumably here so the notebook
# renders the combined frame as this cell's output.
final_df

In [None]:
# Sanity check: the number of numeric columns (first line) should equal each
# dimension of the target correlation matrix (second line).
print(len(numeric_columns), correlation_matrix.shape, sep="\n")

In [None]:
# Mean absolute error between the realised and the target correlations,
# taken over off-diagonal entries only (the diagonal is 1 in both).
corr_gen = df[numeric_columns].corr().values
expected = correlation_matrix.copy()
mask = np.logical_not(np.eye(expected.shape[0], dtype=bool))
_abs_gap = np.abs(corr_gen - expected)
mae = np.mean(_abs_gap[mask])
print("MAE (off-diagonals):", mae)


In [None]:
 #Check correlation from generated df
corr_generated = df[numeric_columns].corr()

# Lift pandas' display limits so the full matrix prints without truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
print(corr_generated.round(2))

# Per-group check: (heading, first column index, one-past-last column index).
# Each range selects one diagonal block of the realised correlation matrix.
_block_checks = [
    ("Payment & Spending Behavior Correlation:", 0, 4),
    ("\nAsset Ownership Correlation:", 4, 7),
    ("\nFinancial Accounts Correlation:", 7, 11),
    ("\nCompliance & Membership Correlation:", 11, 13),
    ("\nDigital & Subscription Behavior Correlation:", 13, 19),
    ("\nE-commerce & Consumer Patterns Correlation:", 19, 23),
    ("\nOthers Correlation:", 23, 24),
]
for _heading, _start, _stop in _block_checks:
    print(_heading)
    print(corr_generated.iloc[_start:_stop, _start:_stop].round(2))

In [None]:
# data['Occupation'] = fabricate.create_categorical_distribution(occupation_categories, occupation_stats, nan_probability=0.01)
# data['Education'] = fabricate.create_categorical_distribution(education_categories, education_stats, nan_probability=0.01)
# data['VPN Usage (hours per week)'] = fabricate.create_truncated_norm_distribution(vpn_stats, nan_probability=0.01)
# data['SIM Card Changes'] = fabricate.create_truncated_norm_distribution(sim_card_changes_stats, precision=0, nan_probability=0.01)
# data['Trading Accounts'] = fabricate.create_truncated_norm_distribution(trading_account_stats, precision=0, nan_probability=0.01)
# data['Recharge Frequency'] = fabricate.create_truncated_norm_distribution(recharge_frequency_stats, precision=0,nan_probability=0.01)
# data['Partner'] = fabricate.create_categorical_distribution(partener_categories, partener_stats, nan_probability=0.01)
# data['Dependents'] = fabricate.create_truncated_norm_distribution(dependents_stats, precision=0, nan_probability=0.01)
# data['Streaming Services'] = fabricate.create_truncated_norm_distribution(streaming_services_number_stats, precision=0, nan_probability=0.01)
# data['Payment Delay of Streaming Services (per year)'] = fabricate.create_truncated_norm_distribution(streaming_services_delay_stats, precision=0, nan_probability=0.01)
# data['Family Defaulters'] = fabricate.create_truncated_norm_distribution(family_defaulters_stats, precision=0, nan_probability=0.01)
# data['Smart Cards'] = fabricate.create_truncated_norm_distribution(smart_cards_stats, precision=0, nan_probability=0.01)
# data['Medical Bills'] = fabricate.create_truncated_norm_distribution(medical_bills_stats, precision=1, nan_probability=0.01)
# data['Default in GST filing (per quarter)'] = fabricate.create_truncated_norm_distribution(default_gst_filing_stats, precision=0, nan_probability=0.01)
# data['Antivirus Subscriptions'] = fabricate.create_truncated_norm_distribution(antivirus_subscriptions_stats, precision=0, nan_probability=0.01)
# data['Betting Apps'] = fabricate.create_categorical_distribution(betting_categories, betting_stats, nan_probability=0.01)
# data['TrueCaller Flag'] = fabricate.create_categorical_distribution(truecaller_categories, truecaller_stats, nan_probability=0.01)
# data['Shopping Behavior (returns per month)'] = fabricate.create_truncated_norm_distribution(shopping_behavior_stats, precision=0, nan_probability=0.01)
# data['Utility Bills (per month)'] = fabricate.create_truncated_norm_distribution(utility_bills_stats, precision=0, nan_probability=0.01)
# data['Registered Vehicles'] = fabricate.create_truncated_norm_distribution(registered_vehicles_stats, precision=0, nan_probability=0.01)
# data['Registered Vehicles Challan (per year)'] = fabricate.create_truncated_norm_distribution(registered_vehicles_challans_stats, precision=0, nan_probability=0.01)
# data['Certificates'] = fabricate.create_truncated_norm_distribution(certificates_stats, precision=0, nan_probability=0.01)
# data['Licenses'] = fabricate.create_truncated_norm_distribution(licenses_stats, precision=0, nan_probability=0.01)
# data['Insurances'] = fabricate.create_truncated_norm_distribution(insurances_stats, precision=0, nan_probability=0.01)
# data['FDs Dissolved'] = fabricate.create_truncated_norm_distribution(fd_dissolved_stats, precision=0, nan_probability=0.01)
# data['International Transactions'] = fabricate.create_truncated_norm_distribution(international_transactions_stats, precision=0, nan_probability=0.01)
# data['Public Memberships'] = fabricate.create_truncated_norm_distribution(public_memberships_stats, precision=0, nan_probability=0.01)
# data['Reviews received'] = fabricate.create_categorical_distribution(reviews_categories, reviews_stats, nan_probability=0.01)
# data['Pet Bills (per year)'] = fabricate.create_truncated_norm_distribution(pet_bills_stats, precision=1, nan_probability=0.01)
# data['Accounts'] = fabricate.create_truncated_norm_distribution(accounts_stats, precision=0, nan_probability=0.01)
# data['Sentiment on Social Media'] = fabricate.create_categorical_distribution(social_sentiments_categories, social_sentiments_stats, nan_probability=0.01)
# data['UPI Failures'] = fabricate.create_truncated_norm_distribution(upi_failures_stats, precision=0, nan_probability=0.01)

In [None]:
# Hand the combined frame to the project's EDA helper from `lib.analyze`.
analyze.perform_eda(final_df)
