In [1]:
!pip install faker

Collecting faker
  Downloading Faker-29.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-29.0.0-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------- ----------------- 1.0/1.8 MB 5.6 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 4.6 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-29.0.0


In [1]:
import pandas as pd
import numpy as np
import random

# Set the random seed for reproducibility
np.random.seed(42)

# Define parameters for the dataset
num_records = 250000

# Expanded sample values for each category
companies = [
    'EQUIFAX, INC.', 'Experian Information Solutions Inc.', 'TransUnion Intermediate Holdings, Inc.',
    'Wells Fargo & Company', 'Citibank, N.A.', 'Adler Wallach & Associates, Inc.',
    'Portfolio Recovery Associates', 'American Express Company', 'Key Bank', 
    'Discover Financial Services', 'Chase Bank', 'Synchrony Financial',
    'PNC Bank', 'Capital One', 'Bank of America', 'US Bank',
    'HSBC Bank USA', 'Barclays', 'Navy Federal Credit Union',
    'Fifth Third Bank', 'Regions Bank', 'TD Bank', 'SunTrust Bank'
]

responses = [
    None, 'Company has responded to the consumer', 
    'Company believes it acted appropriately', 'No response provided', 
    'Closed with explanation', 'Closed with monetary relief', 
    'In progress', 'Closed with non-monetary relief', 
    'Investigation ongoing', 'Additional documentation requested',
    'Customer service attempted to contact', 'Response under review'
]

issues = [
    'Problem with a credit reporting company\'s investigation',
    'Incorrect information on your report',
    'Getting a credit card',
    'Debt collection issues',
    'Billing disputes',
    'Problems with account management',
    'Payment processing issues',
    'Fraudulent account activity',
    'Unauthorized charges',
    'Service not received',
    'Loan application denied', 'Identity theft issues', 'Credit card fraud'
]

products = [
    'Credit reporting', 'Credit card', 'Debt collection', 
    'Bank account', 'Mortgage', 'Student loan', 'Personal loan',
    'Auto loan', 'Insurance', 'Business loan',
    'Home equity loan', 'Retail store card', 'Payment plan'
]

states = [
    'GA', 'CA', 'NY', 'TX', 'FL', 'IL', 'PA', 'OH', 'MI', 'WA', 
    'NC', 'NJ', 'MA', 'VA', 'AZ', 'MD', 'CO', 'MN', 'OR', 'IN'
]

sub_issues = [
    'Their investigation did not fix an error',
    'Information belongs to someone else',
    'Application denied',
    'Card charged for something not purchased',
    'Improper reporting', 
    'Delay in processing', 'Multiple inquiries on credit report', None
]

sub_products = [
    'General-purpose credit card', 'Store credit card', 
    'Savings account', 'Other bank product/service', 
    'Home equity line of credit', 'Auto loan', 
    'Personal line of credit', None
]

submitted_via = ['Web', 'Postal mail', 'Phone', 'Email', None]

tags = [
    None, 'Fraud', 'Late payment', 'Dispute', 'Identity theft', 
    'Unresolved complaint', 'Debt settlement', 'Credit monitoring',
    'Unpaid balance', 'Contract violation', 'Service quality issue'
]

# Function to randomly assign null values
def random_nulls(array, null_percentage):
    mask = np.random.rand(len(array)) < null_percentage
    return np.where(mask, None, array)

# Generate the dataset
data = {
    'company': np.random.choice(companies, num_records),
    'company_public_response': random_nulls(np.random.choice(responses, num_records), 0.75),  # 75% null
    'company_response': random_nulls(np.random.choice(responses, num_records), 0.75),  # 75% null
    'complaint_id': [f'ID{str(i).zfill(6)}' for i in range(num_records)],
    'complaint_what_happened': np.random.choice(['Yes', 'No'], num_records, p=[0.9, 0.1]),
    'consumer_consent_provided': np.random.choice(['Consent provided', 'Consent not provided'], num_records),
    'consumer_disputed': np.random.choice(['Yes', 'No'], num_records, p=[0.2, 0.8]),
    'date_received': pd.to_datetime(np.random.choice(pd.date_range('2022-01-01', '2024-12-31'), num_records)),
    'date_sent_to_company': pd.to_datetime(np.random.choice(pd.date_range('2022-01-01', '2024-12-31'), num_records)),
    'issue': np.random.choice(issues, num_records),
    'product': np.random.choice(products, num_records),
    'state': np.random.choice(states, num_records),
    'sub_issue': random_nulls(np.random.choice(sub_issues, num_records), 0.75),  # 75% null
    'sub_product': random_nulls(np.random.choice(sub_products, num_records), 0.75),  # 75% null
    'submitted_via': np.random.choice(submitted_via, num_records, p=[0.5, 0.2, 0.2, 0.05, 0.05]),  # 50% Web
    'tags': random_nulls(np.random.choice(tags, num_records), 0.75),  # 75% null
    'timely': np.random.choice(['Yes', 'No'], num_records, p=[0.8, 0.2]),
    'zip_code': random_nulls([str(random.randint(10000, 99999)) for _ in range(num_records)], 0.75)  # 75% null
}

# Create DataFrame
df = pd.DataFrame(data)

# Convert all columns to object type
df = df.astype(object)

# Save to Parquet
df.to_parquet('input.parquet', index=False)


df.to_csv('input.csv', index=False)

In [2]:
df.head()

Unnamed: 0,company,company_public_response,company_response,complaint_id,complaint_what_happened,consumer_consent_provided,consumer_disputed,date_received,date_sent_to_company,issue,product,state,sub_issue,sub_product,submitted_via,tags,timely,zip_code
0,Portfolio Recovery Associates,,,ID000000,Yes,Consent not provided,Yes,2023-08-03 00:00:00,2023-01-26 00:00:00,Billing disputes,Retail store card,TX,,Store credit card,Postal mail,,Yes,
1,Fifth Third Bank,,Company believes it acted appropriately,ID000001,Yes,Consent not provided,No,2024-01-09 00:00:00,2024-06-13 00:00:00,Payment processing issues,Auto loan,PA,Improper reporting,,Phone,Unresolved complaint,Yes,
2,Bank of America,,,ID000002,Yes,Consent not provided,No,2023-06-14 00:00:00,2022-05-13 00:00:00,Unauthorized charges,Insurance,CO,,,Web,,Yes,
3,Chase Bank,,,ID000003,Yes,Consent not provided,No,2022-02-13 00:00:00,2024-08-31 00:00:00,Payment processing issues,Personal loan,WA,,Other bank product/service,Web,,Yes,12468.0
4,American Express Company,,,ID000004,Yes,Consent not provided,Yes,2023-10-03 00:00:00,2022-02-26 00:00:00,Service not received,Auto loan,TX,Application denied,General-purpose credit card,Web,,No,
