In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime
from pathlib import Path

In [3]:
# Setup
# Three independent random sources are used in this notebook: Faker (company
# names, UUIDs, dates), NumPy's global RNG and the standard-library `random`
# module. Each keeps its own state, so each one must be seeded separately for
# a fresh-kernel run to repeat the same random draws.
SEED = 42
fake = Faker()
Faker.seed(SEED)  # class-level seed shared by all Faker instances
np.random.seed(SEED)
random.seed(SEED)

In [5]:
# Constants
# Dataset dimensions: number of business profiles, number of engagement
# events, and number of monthly score rows generated per business.
NUM_BUSINESSES = 500
NUM_EVENTS = 50_000
NUM_MONTHS = 12
# First month of the monthly credit-score series.
START_DATE = datetime(year=2024, month=8, day=1)

# Value pools that the generators below pick from at random.
sectors = [
    'Retail',
    'Manufacturing',
    'Services',
    'Technology',
    'Healthcare',
]
regions = [
    'London',
    'Midlands',
    'Scotland',
    'North West',
    'South East',
]
risk_categories = ['Low', 'Medium', 'High']
event_types = [
    'login',
    'viewed_report',
    'downloaded_tool',
    'updated_profile',
    'requested_funding',
]
features = [
    'credit_score_report',
    'funding_recommendation',
    'cash_flow_forecast',
    'growth_dashboard',
]

In [7]:
# Create output folder
# Every CSV produced by this notebook is written under this directory, which
# is relative to the current working directory. Re-running the cell is safe:
# an already-existing directory is left as-is.
output_dir = Path("generated_data")
output_dir.mkdir(parents=False, exist_ok=True)

In [9]:
# --- 1. SME Profiles ---
# Build one synthetic profile per business (NUM_BUSINESSES rows) and save the
# table to sme_credit_profiles.csv inside output_dir.
#
# Each field is drawn independently of the others; in particular
# 'default_risk_category' is a weighted random pick and is not derived from
# 'credit_score'.
sme_data = []
for i in range(NUM_BUSINESSES):
    sme_data.append({
        # Sequential, zero-padded identifier: SME_0001, SME_0002, ...
        'business_id': f"SME_{i+1:04d}",
        'business_name': fake.company(),
        'sector': random.choice(sectors),
        'region': random.choice(regions),
        # np.random.randint excludes its upper bound, so this yields 1..199.
        'employees': np.random.randint(1, 200),
        'annual_revenue (£)': round(np.random.uniform(100000, 5000000), 2),
        # Upper bound is 851 so that 850 itself can be drawn: the monthly
        # score simulation clamps scores to the inclusive range [300, 850],
        # and the initial score should cover that same range.
        'credit_score': np.random.randint(300, 851),
        # Relative to the day the notebook is run, so dates shift between runs.
        'created_at': fake.date_between(start_date='-3y', end_date='today'),
        # Weights correspond position-wise to risk_categories.
        'default_risk_category': random.choices(risk_categories, weights=[0.6, 0.3, 0.1])[0],
        'funding_needs': round(np.random.uniform(5000, 100000), 2),
        # Beta(2, 5) sample: a value in [0, 1] skewed towards the low end.
        'loan_history_score': round(np.random.beta(2, 5), 2)
    })

sme_df = pd.DataFrame(sme_data)
sme_df.to_csv(output_dir / "sme_credit_profiles.csv", index=False)

In [11]:
# --- 2. Engagement Logs ---
# Generate NUM_EVENTS synthetic product-engagement events and save them to
# product_engagement_log.csv inside output_dir. Each event is attributed to a
# business picked uniformly at random, with replacement, from sme_df.
#
# All business IDs are drawn in a single vectorised call. Calling
# sme_df.sample(1) inside the loop would build a fresh one-row DataFrame on
# every one of the NUM_EVENTS iterations just to read its business_id.
#
# NOTE(review): timestamps are drawn from the last 12 months regardless of the
# business's 'created_at' (which can be as recent as today), so an event may
# predate its business's profile date — confirm whether that matters downstream.
event_business_ids = np.random.choice(sme_df['business_id'].to_numpy(), size=NUM_EVENTS)

event_data = []
for business_id in event_business_ids:
    event_data.append({
        'event_id': fake.uuid4(),
        'business_id': business_id,
        'event_type': random.choice(event_types),
        'feature_name': random.choice(features),
        # Relative to the moment the notebook is run.
        'timestamp': fake.date_time_between(start_date='-12m', end_date='now'),
        # 30..1799 seconds (np.random.randint excludes the upper bound).
        'session_duration_sec': np.random.randint(30, 1800),
        # Roughly 5% of events are flagged as conversions.
        'is_conversion_event': random.random() < 0.05
    })

event_df = pd.DataFrame(event_data)
event_df.to_csv(output_dir / "product_engagement_log.csv", index=False)

In [13]:
# --- 3. Monthly Credit Scores ---
# For every business, walk its credit score forward NUM_MONTHS steps starting
# from the profile's 'credit_score'. Each step adds a random integer in
# [-30, 30] and clamps the result to [300, 850]. The rows are saved to
# monthly_credit_scores.csv inside output_dir.

# The month labels do not depend on the business, so build them once.
month_labels = [
    (START_DATE + pd.DateOffset(months=offset)).strftime('%Y-%m')
    for offset in range(NUM_MONTHS)
]

score_data = []
for _, profile in sme_df.iterrows():
    running_score = profile['credit_score']
    for label in month_labels:
        delta = np.random.randint(-30, 31)
        running_score = max(300, min(850, running_score + delta))

        # Funding is granted in roughly 20% of months; the amount is only
        # drawn when it is granted, otherwise it is recorded as 0.0.
        if random.random() < 0.2:
            granted = round(np.random.uniform(0, 30000), 2)
        else:
            granted = 0.0

        # Roughly 70% of months are marked as having used the product.
        used_product = random.random() < 0.7

        score_data.append({
            'business_id': profile['business_id'],
            'month': label,
            'credit_score': running_score,
            'funding_granted (£)': granted,
            'product_used': used_product
        })

score_df = pd.DataFrame(score_data)
score_df.to_csv(output_dir / "monthly_credit_scores.csv", index=False)

print("✅ Simulated data generated and saved to 'generated_data' folder.")

✅ Simulated data generated and saved to 'generated_data' folder.
