In [4]:
import pandas as pd
import numpy as np
import random
import uuid 

# County names forming the region pool for the synthetic borrowers.
# NOTE(review): this list holds 46 names and "Uasin Gishu" is not among them,
# while the generator below uses "Eldoret" as an urban-only label -- confirm
# that substitution is intentional.
KENYAN_COUNTIES = [
    "Baringo", "Bomet", "Bungoma", "Busia",
    "Elgeyo-Marakwet", "Embu", "Garissa", "Homa Bay",
    "Isiolo", "Kajiado", "Kakamega", "Kericho",
    "Kiambu", "Kilifi", "Kirinyaga", "Kisii",
    "Kisumu", "Kitui", "Kwale", "Laikipia",
    "Lamu", "Machakos", "Makueni", "Mandera",
    "Marsabit", "Meru", "Migori", "Mombasa",
    "Murang'a", "Nairobi City", "Nakuru", "Nandi",
    "Narok", "Nyamira", "Nyandarua", "Nyeri",
    "Samburu", "Siaya", "Taita-Taveta", "Tana River",
    "Tharaka-Nithi", "Trans Nzoia", "Turkana", "Vihiga",
    "Wajir", "West Pokot",
]

# Column order of the generated frame. Passing it to the DataFrame constructor
# keeps the schema intact even when no rows are generated.
_LOAN_DATA_COLUMNS = [
    'user_id', 'age', 'income', 'employment_status', 'education_level', 'region',
    'number_of_active_loans', 'apps_installed', 'loan_frequency_last_30_days',
    'repayment_ratio_overall', 'credit_limit_utilization', 'device_or_ID_shared',
    'loan_amount', 'interest_rate', 'loan_grade', 'loan_term_days',
    'debt_to_income_ratio', 'delinquencies_last_2yrs', 'public_records',
    'revolving_utilization', 'total_credit_lines', 'is_default',
]

# Additive shift applied to the default probability for each loan grade.
_GRADE_DEFAULT_IMPACT = {'A': -0.15, 'B': -0.10, 'C': -0.05, 'D': 0.05, 'E': 0.10, 'F': 0.15, 'G': 0.20}

# Region labels treated as urban; the first two get extra sampling weight.
_MAIN_URBAN_REGIONS = ["Nairobi City", "Mombasa"]
_URBAN_REGIONS = ["Nairobi City", "Mombasa", "Kisumu", "Nakuru", "Eldoret"]


def _clamp(value, low, high):
    """Limit ``value`` to the closed interval [low, high]."""
    return max(low, min(high, value))


def _draw_archetype_features(rng, borrower_type, income, education_level):
    """Draw the raw (pre-noise) stacking and credit features for one borrower.

    The draws inside each archetype happen in a fixed order (dict displays are
    evaluated top to bottom), so a given random state always yields the same
    record.
    """
    if borrower_type == 'Normal':
        features = {
            'number_of_active_loans': rng.randint(0, 2),
            'apps_installed': rng.randint(1, 3),
            'loan_frequency_last_30_days': rng.randint(0, 3),
            'repayment_ratio_overall': round(rng.uniform(0.7, 1.0), 2),
            'credit_limit_utilization': round(rng.uniform(0.05, 0.4), 2),
            'device_or_ID_shared': False,
            'loan_amount': round(rng.uniform(1000, 50000), 2),  # Up to KSh 50,000 common
            'interest_rate': round(rng.uniform(10.0, 25.0), 2),  # Typical mobile loan rates
            'loan_term_days': rng.choice([7, 14, 21, 28, 30, 60, 90]),
            'debt_to_income_ratio': round(rng.uniform(0.1, 0.35), 2),
            'delinquencies_last_2yrs': rng.randint(0, 1),
            'public_records': 0,
            'revolving_utilization': round(rng.uniform(0.0, 0.3), 2),
            'total_credit_lines': rng.randint(2, 12),
        }
        # Grade from income/education. 'PhD' is part of the middle tier as well,
        # so a PhD holder is never graded below a Bachelors/Masters holder with
        # the same income.
        if income > 40000 and education_level in ('Masters', 'PhD'):
            grade_pool = ['A', 'B']
        elif income > 20000 and education_level in ('Bachelors', 'Masters', 'PhD'):
            grade_pool = ['B', 'C']
        else:
            grade_pool = ['C', 'D']
        features['loan_grade'] = rng.choice(grade_pool)
        return features

    if borrower_type == 'Moderate Stacker':
        return {
            'number_of_active_loans': rng.randint(2, 4),
            'apps_installed': rng.randint(3, 7),
            'loan_frequency_last_30_days': rng.randint(3, 8),
            'repayment_ratio_overall': round(rng.uniform(0.4, 0.75), 2),
            'credit_limit_utilization': round(rng.uniform(0.4, 0.7), 2),
            'device_or_ID_shared': rng.random() < 0.5,  # 50% chance
            'loan_amount': round(rng.uniform(5000, 25000), 2),
            'interest_rate': round(rng.uniform(15.0, 30.0), 2),
            'loan_term_days': rng.choice([7, 14, 21, 28, 30, 60]),
            'debt_to_income_ratio': round(rng.uniform(0.2, 0.45), 2),
            'delinquencies_last_2yrs': rng.randint(0, 3),
            'public_records': rng.choices([0, 1], weights=[0.6, 0.4], k=1)[0],
            'revolving_utilization': round(rng.uniform(0.3, 0.6), 2),
            'total_credit_lines': rng.randint(4, 15),
            'loan_grade': rng.choice(['D', 'E']),  # Moderate stackers get worse grades
        }

    if borrower_type == 'Aggressive Stacker':
        return {
            'number_of_active_loans': rng.randint(4, 10),
            'apps_installed': rng.randint(7, 15),
            'loan_frequency_last_30_days': rng.randint(8, 30),
            'repayment_ratio_overall': round(rng.uniform(0.1, 0.45), 2),
            'credit_limit_utilization': round(rng.uniform(0.7, 0.95), 2),
            'device_or_ID_shared': rng.random() < 0.8,  # 80% chance
            'loan_amount': round(rng.uniform(10000, 30000), 2),
            'interest_rate': round(rng.uniform(20.0, 40.0), 2),  # Higher interest for higher risk
            'loan_term_days': rng.choice([7, 14, 21, 28, 30]),  # Shorter terms
            'debt_to_income_ratio': round(rng.uniform(0.35, 0.7), 2),
            'delinquencies_last_2yrs': rng.randint(2, 5),
            'public_records': rng.choices([0, 1, 2], weights=[0.4, 0.3, 0.3], k=1)[0],
            'revolving_utilization': round(rng.uniform(0.6, 0.9), 2),
            'total_credit_lines': rng.randint(8, 25),
            'loan_grade': rng.choice(['F', 'G']),  # Aggressive stackers get the worst grades
        }

    if borrower_type == 'Low Risk':
        return {
            'number_of_active_loans': rng.randint(0, 1),
            'apps_installed': rng.randint(1, 2),
            'loan_frequency_last_30_days': rng.randint(0, 1),
            'repayment_ratio_overall': round(rng.uniform(0.9, 1.0), 2),
            'credit_limit_utilization': round(rng.uniform(0.0, 0.15), 2),
            'device_or_ID_shared': False,
            'loan_amount': round(rng.uniform(5000, 100000), 2),  # Larger loans for low risk
            'interest_rate': round(rng.uniform(3.0, 15.0), 2),  # Lower interest rates
            'loan_term_days': rng.choice([30, 60, 90]),  # Longer terms
            'debt_to_income_ratio': round(rng.uniform(0.01, 0.2), 2),
            'delinquencies_last_2yrs': 0,
            'public_records': 0,
            'revolving_utilization': round(rng.uniform(0.0, 0.1), 2),
            'total_credit_lines': rng.randint(5, 20),
            'loan_grade': rng.choice(['A', 'B']),  # Low risk borrowers get the best grades
        }

    raise ValueError(f"Unknown borrower archetype: {borrower_type!r}")


def _add_feature_noise(rng, features):
    """Perturb the drawn features in place and clamp them to their valid ranges."""
    features['number_of_active_loans'] = max(0, features['number_of_active_loans'] + rng.randint(-1, 1))
    features['apps_installed'] = max(1, features['apps_installed'] + rng.randint(-1, 1))
    features['loan_frequency_last_30_days'] = max(0, features['loan_frequency_last_30_days'] + rng.randint(-1, 2))
    features['repayment_ratio_overall'] = _clamp(
        features['repayment_ratio_overall'] + rng.uniform(-0.05, 0.05), 0.0, 1.0)
    features['credit_limit_utilization'] = _clamp(
        features['credit_limit_utilization'] + rng.uniform(-0.05, 0.05), 0.0, 1.0)

    # NOTE(review): no archetype draws an amount above 100000, so the wide
    # +/-50000 jitter only applies when the drawn amount is exactly 100000.0.
    # Confirm whether a lower threshold was intended.
    amount = features['loan_amount']
    if amount < 100000:
        jitter = rng.uniform(-1000, 1000)
    else:
        jitter = rng.uniform(-50000, 50000)
    features['loan_amount'] = max(500, amount + jitter)

    features['interest_rate'] = _clamp(features['interest_rate'] + rng.uniform(-1.0, 1.0), 0.05, 45.0)
    features['debt_to_income_ratio'] = _clamp(
        features['debt_to_income_ratio'] + rng.uniform(-0.02, 0.02), 0.01, 0.7)  # Cap DTI
    features['revolving_utilization'] = _clamp(
        features['revolving_utilization'] + rng.uniform(-0.03, 0.03), 0.0, 1.0)
    features['total_credit_lines'] = max(1, features['total_credit_lines'] + rng.randint(-2, 2))


def _default_probability(features, age, income, employment_status, region, region_type):
    """Combine the additive risk rules into a default probability in [0.01, 0.99]."""
    prob = 0.0

    # Impact from loan stacking features
    if features['number_of_active_loans'] > 4:
        prob += 0.15
    if features['apps_installed'] > 6:
        prob += 0.1
    if features['loan_frequency_last_30_days'] > 2:
        prob += 0.2
    if features['repayment_ratio_overall'] < 0.5:
        prob += 0.25
    if features['credit_limit_utilization'] > 0.7:
        prob += 0.15
    if features['device_or_ID_shared']:
        prob += 0.1

    # Impact from demographic/socio-economic features
    if income < 20000:
        prob += 0.1
    if employment_status in ('Unemployed', 'Hourly', 'Casual'):
        prob += 0.15
    if age < 25:
        prob += 0.03

    # Impact from credit features
    prob += _GRADE_DEFAULT_IMPACT.get(features['loan_grade'], 0)
    if features['interest_rate'] > 18.0:
        prob += 0.1
    if features['debt_to_income_ratio'] > 0.4:
        prob += 0.15
    if features['delinquencies_last_2yrs'] > 1:
        prob += 0.1
    if features['public_records'] > 0:
        prob += 0.1
    if features['revolving_utilization'] > 0.8:
        prob += 0.15
    if features['total_credit_lines'] < 3:
        prob += 0.05

    # Regional impact: rural borrowers carry more risk, the two main cities slightly less.
    if region_type == 'rural':
        prob += 0.07
    if region in _MAIN_URBAN_REGIONS:
        prob -= 0.03

    return _clamp(prob, 0.01, 0.99)


def generate_complex_synthetic_loan_data(num_samples=10000, seed=None, counties=None):
    """
    Generates a complex synthetic dataset to simulate loan stacking and default risk.

    Each row is one borrower: demographics and a region are drawn first, then a
    borrower archetype ('Normal', 'Moderate Stacker', 'Aggressive Stacker',
    'Low Risk') determines the ranges of the stacking and credit features.
    Noise is added to those features, and a rule-based default probability
    decides the ``is_default`` label.

    Args:
        num_samples (int): The number of data points to generate. Values <= 0
            yield an empty frame that still has every column.
        seed (int | None): When given, all draws (including ``user_id``) come
            from a private ``random.Random(seed)``, so the same seed returns an
            identical frame. When ``None`` (default), the global ``random``
            module state is used and ``user_id`` comes from ``uuid.uuid4()``.
        counties (list[str] | None): Pool of region names. Entries that are not
            in the built-in urban list form the rural pool. Defaults to the
            module-level ``KENYAN_COUNTIES``.

    Returns:
        pandas.DataFrame: A DataFrame containing the synthetic loan data, with
        the columns listed in ``_LOAN_DATA_COLUMNS``.

    Raises:
        ValueError: If ``counties`` contains no non-urban region to sample
            rural borrowers from.
    """
    # The module itself exposes the same sampling functions as a Random
    # instance, so the unseeded path keeps using (and advancing) global state.
    rng = random if seed is None else random.Random(seed)
    if counties is None:
        counties = KENYAN_COUNTIES

    other_urban_regions = [r for r in _URBAN_REGIONS if r not in _MAIN_URBAN_REGIONS]
    rural_regions = [r for r in counties if r not in _URBAN_REGIONS]
    if not rural_regions:
        raise ValueError("counties must contain at least one non-urban region")

    # Overall urban vs rural weights, and the Nairobi/Mombasa share within urban.
    region_type_weights = [0.65, 0.35]
    nairobi_mombasa_weight = 0.6

    records = []
    for _ in range(num_samples):
        age = rng.randint(18, 65)
        region_type = rng.choices(['urban', 'rural'], weights=region_type_weights, k=1)[0]

        # Urban borrowers draw from a higher income range than rural ones.
        if region_type == 'urban':
            income = round(rng.uniform(5000, 100000), 2)
        else:
            income = round(rng.uniform(1000, 40000), 2)

        employment_status = rng.choice(
            ['Employed', 'Self-Employed', 'Unemployed', 'Student', 'Hourly', 'Casual'])
        education_level = rng.choices(
            ['High School', 'Bachelors', 'Masters', 'PhD'], weights=[0.5, 0.2, 0.2, 0.1], k=1)[0]

        if region_type == 'urban':
            if rng.random() < nairobi_mombasa_weight:
                region = rng.choice(_MAIN_URBAN_REGIONS)
            else:
                region = rng.choice(other_urban_regions)
        else:
            region = rng.choice(rural_regions)

        borrower_type = rng.choices(
            ['Normal', 'Moderate Stacker', 'Aggressive Stacker', 'Low Risk'],
            weights=[0.40, 0.25, 0.20, 0.15],
            k=1
        )[0]

        features = _draw_archetype_features(rng, borrower_type, income, education_level)
        _add_feature_noise(rng, features)
        default_prob = _default_probability(
            features, age, income, employment_status, region, region_type)
        is_default = rng.random() < default_prob

        if seed is None:
            user_id = str(uuid.uuid4())
        else:
            # Build the UUID from the seeded stream so seeded runs are fully repeatable.
            user_id = str(uuid.UUID(int=rng.getrandbits(128), version=4))

        record = {
            'user_id': user_id,
            'age': age,
            'income': income,
            'employment_status': employment_status,
            'education_level': education_level,
            'region': region,
            'is_default': is_default,
        }
        record.update(features)
        records.append(record)

    return pd.DataFrame(records, columns=_LOAN_DATA_COLUMNS)

# Generate the dataset with a larger number of samples.
# The generator draws from the global `random` module, so seed it first: the
# cell then produces the same borrowers on every Restart & Run All. Only the
# `user_id` values, which come from uuid4, still differ between runs.
SEED = 42
random.seed(SEED)
synthetic_df = generate_complex_synthetic_loan_data(num_samples=10000)

# Display the first few rows and basic info
print("Generated Complex Synthetic Dataset Head:")
print(synthetic_df.head())
print("\nDataset Info:")
synthetic_df.info()
print("\nDefault Distribution:")
print(synthetic_df['is_default'].value_counts(normalize=True))
print("\nDescriptive Statistics for Numerical Features:")
print(synthetic_df.describe())
print("\nValue Counts for Categorical Features:")
for categorical_column in ['employment_status', 'education_level', 'device_or_ID_shared',
                           'loan_grade', 'loan_term_days']:
    print(synthetic_df[categorical_column].value_counts())


Generated Complex Synthetic Dataset Head:
                                user_id  age    income employment_status  \
0  780a55b7-eb0a-4d47-a075-d0c18bc56dfd   40  48035.58           Student   
1  53f77b8c-8a7c-4ba2-9e8c-60d2fee3edaa   37  52103.69           Student   
2  c7c828e5-9470-4c9b-8726-94a5268d3ee7   24  95887.47        Unemployed   
3  fd915640-0b72-4594-ac04-1ba58e30b0a3   25  35520.98        Unemployed   
4  a40ca0e5-6593-4b20-87da-54133794c02f   26  16558.55           Student   

  education_level        region  number_of_active_loans  apps_installed  \
0         Masters       Eldoret                       7               9   
1     High School  Nairobi City                       1               1   
2     High School       Eldoret                       8              14   
3     High School        Migori                       7              13   
4     High School       Mombasa                       0               2   

   loan_frequency_last_30_days  repayment_ratio_ov

In [10]:
# Save the dataset to a CSV file.
# The path lives in a single variable so the file that is written and the file
# named in the confirmation message cannot drift apart (the cell previously
# wrote 'm1bile_loan_data.csv' while reporting 'mobile_loan_data.csv').
output_csv_path = 'mobile_loan_data.csv'
try:
    synthetic_df.to_csv(output_csv_path, index=False)
    print(f"\nDataset successfully saved to '{output_csv_path}'")
except OSError as e:
    # Only file-system failures (missing directory, permissions, full disk) are
    # reported and tolerated; any other error is left to surface.
    print(f"\nError saving dataset to CSV: {e}")


Dataset successfully saved to 'mobile_loan_data.csv'


In [9]:
# Number of distinct region labels in the generated data. The generator can emit
# the 46 names in KENYAN_COUNTIES plus the urban-only label "Eldoret", which
# accounts for a count of 47.
synthetic_df.loc[:, 'region'].nunique()

47