In [5]:
!pip install faker



In [6]:
from faker import Faker
import pandas as pd
import numpy as np

# Build a synthetic banking dataset: one row per customer/account/transaction,
# then inject missing values and duplicate rows so the result looks like
# real-world "dirty" data.

# Initialize Faker for generating realistic names
fake = Faker()

# Fixed seed so that re-running the notebook regenerates the same dataset
SEED = 42
np.random.seed(SEED)  # drives the np.random.* calls and DataFrame.sample() below
Faker.seed(SEED)      # drives the names returned by fake.name()

# Number of rows to generate
num_rows = 1000

# Sequential 1-based row numbers shared by the three ID columns
row_numbers = range(1, num_rows + 1)

# Generate the data dictionary
data = {
    # Primary Key - Unique identifier for each customer
    'CustomerID': [f'CUST{i:05d}' for i in row_numbers],

    # AccountID - Unique identifier for each account
    'AccountID': [f'ACC{i:05d}' for i in row_numbers],

    # TransactionID - Unique identifier for each transaction
    'TransactionID': [f'TRANS{i:05d}' for i in row_numbers],

    # Customer Name - Nominal data
    'CustomerName': [fake.name() for _ in range(num_rows)],

    # Credit Score - Ordinal data
    'CreditScore': np.random.choice(['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'], num_rows),

    # Satisfaction Score - Interval data (integers 1..10).
    # Stored as float from the start: NaN is written into this column below,
    # and assigning NaN into an int64 column relies on implicit upcasting,
    # which recent pandas versions deprecate.
    'SatisfactionScore': np.random.randint(1, 11, num_rows).astype(float),

    # Account Balance - Ratio data
    'AccountBalance': np.round(np.random.uniform(0, 100000, num_rows), 2),

    # Transaction Frequency - Ordinal data
    'TransactionFrequency': np.random.choice(['Rarely', 'Occasionally', 'Frequently', 'Very Frequently'], num_rows),

    # Account Type - Nominal data
    'AccountType': np.random.choice(['Savings', 'Checking', 'Investment', 'Loan'], num_rows),

    # Last Transaction Date - Interval (date) data, drawn from the daily range
    # 2023-01-01 .. 2024-01-01 (both endpoints included)
    'LastTransactionDate': pd.to_datetime(
        np.random.choice(pd.date_range("2023-01-01", "2024-01-01"), num_rows)
    )
}

# Convert dictionary to DataFrame
df_banking = pd.DataFrame(data)

# Adding missing values in AccountBalance (5% of rows) and SatisfactionScore
# (3% of rows) for realism; the two row samples are drawn independently
df_banking.loc[df_banking.sample(frac=0.05).index, 'AccountBalance'] = np.nan
df_banking.loc[df_banking.sample(frac=0.03).index, 'SatisfactionScore'] = np.nan

# Adding some duplicates to the dataset: 1% of the rows, sampled with
# replacement, are appended at the end and the index is renumbered
duplicates = df_banking.sample(frac=0.01, replace=True)
df_banking = pd.concat([df_banking, duplicates], ignore_index=True)


In [7]:
# Show the generated frame (the notebook elides the middle rows when rendering)
df_banking

Unnamed: 0,CustomerID,AccountID,TransactionID,CustomerName,CreditScore,SatisfactionScore,AccountBalance,TransactionFrequency,AccountType,LastTransactionDate
0,CUST00001,ACC00001,TRANS00001,Ronnie Hamilton,Excellent,7.0,,Frequently,Loan,2023-08-05
1,CUST00002,ACC00002,TRANS00002,Mr. James Alvarez MD,Very Good,5.0,51973.63,Frequently,Checking,2023-01-31
2,CUST00003,ACC00003,TRANS00003,Stacey Mccullough,Poor,2.0,15148.39,Frequently,Savings,2023-03-16
3,CUST00004,ACC00004,TRANS00004,Andrew Gill,Excellent,5.0,43605.63,Occasionally,Savings,2023-10-27
4,CUST00005,ACC00005,TRANS00005,Alan Jackson,Very Good,2.0,87815.83,Very Frequently,Loan,2023-12-07
...,...,...,...,...,...,...,...,...,...,...
1005,CUST00182,ACC00182,TRANS00182,Steven Gomez,Fair,4.0,90956.16,Rarely,Investment,2023-08-01
1006,CUST00584,ACC00584,TRANS00584,Sherri Watson,Good,10.0,58312.67,Rarely,Checking,2023-11-24
1007,CUST00683,ACC00683,TRANS00683,Jennifer Christian,Good,9.0,79246.62,Very Frequently,Checking,2023-04-05
1008,CUST00552,ACC00552,TRANS00552,Erika Romero,Good,8.0,39708.06,Frequently,Investment,2023-01-28


In [8]:
# Preview the first five rows of the generated dataset
df_banking.head(n=5)

Unnamed: 0,CustomerID,AccountID,TransactionID,CustomerName,CreditScore,SatisfactionScore,AccountBalance,TransactionFrequency,AccountType,LastTransactionDate
0,CUST00001,ACC00001,TRANS00001,Ronnie Hamilton,Excellent,7.0,,Frequently,Loan,2023-08-05
1,CUST00002,ACC00002,TRANS00002,Mr. James Alvarez MD,Very Good,5.0,51973.63,Frequently,Checking,2023-01-31
2,CUST00003,ACC00003,TRANS00003,Stacey Mccullough,Poor,2.0,15148.39,Frequently,Savings,2023-03-16
3,CUST00004,ACC00004,TRANS00004,Andrew Gill,Excellent,5.0,43605.63,Occasionally,Savings,2023-10-27
4,CUST00005,ACC00005,TRANS00005,Alan Jackson,Very Good,2.0,87815.83,Very Frequently,Loan,2023-12-07


In [11]:
# Export the full synthetic dataset (index column omitted). The path uses
# forward slashes only, so it no longer mixes separators and follows the same
# convention as the other export cells in this notebook.
df_banking.to_csv("C:/Users/HP/Desktop/banking_data.csv", index=False)


In [12]:
import os
import pandas as pd

# Destination of the customer extract (CSV)
file_path = "C:/Users/HP/Desktop/Customer.csv"

# Customer-level fields to keep, in output column order
customer_fields = ['CustomerID', 'CustomerName', 'CreditScore', 'SatisfactionScore']
selected_columns = df_banking.loc[:, customer_fields]

# Write the extract without the DataFrame index
selected_columns.to_csv(path_or_buf=file_path, index=False)

print(f"File saved to: {file_path}")


File saved to: C:/Users/HP/Desktop/Customer.csv


In [13]:
import os
import pandas as pd

# Destination of the account extract (CSV)
file_path = "C:/Users/HP/Desktop/Account.csv"

# Account-level fields to keep, in output column order
account_fields = ['AccountID', 'CustomerID', 'AccountType', 'AccountBalance']
selected_columns = df_banking.loc[:, account_fields]

# Write the extract without the DataFrame index
selected_columns.to_csv(path_or_buf=file_path, index=False)

print(f"File saved to: {file_path}")


File saved to: C:/Users/HP/Desktop/Account.csv


In [14]:
import os
import pandas as pd

# Destination of the transaction extract (CSV)
file_path = "C:/Users/HP/Desktop/Transaction.csv"

# Transaction-level fields to keep, in output column order
transaction_fields = ['TransactionID', 'AccountID', 'TransactionFrequency', 'LastTransactionDate']
selected_columns = df_banking.loc[:, transaction_fields]

# Write the extract without the DataFrame index
selected_columns.to_csv(path_or_buf=file_path, index=False)

print(f"File saved to: {file_path}")


File saved to: C:/Users/HP/Desktop/Transaction.csv
