In [20]:
from openai import  OpenAI
import json

# Set your OpenAI API key here
client = OpenAI(api_key = '')

In [58]:
import re

def extract_json(text):
    """Extracts JSON content from a string, handling cases where it's wrapped in markdown-style formatting."""
    match = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
    if match:
        return match.group(1)  # Extract the JSON inside the code block
    return text  # If no markdown found, assume the response is plain JSON

In [140]:
def get_bank_details_for_country(country_name):
    prompt = f"""
    Find a **comprehensive and official** list of all banks operating in {country_name}. Use **multiple sources** such as:
    - Wikipedia (if available)
    - The country's **central bank or financial regulatory authority**
    - Official **SWIFT** directories
    - Major **banking associations** in {country_name}
    - Other **reliable financial databases**
    
    ### **Response Format**
    Return the data in **strict JSON format** with **no extra commentary or explanations**:
    
    ```json
    [
        {{
            "bank_name": "BANK_NAME",
            "swift_code": "SWIFT_CODE",
            "country": "country"
        }},
        ...
    ]
    ```
    
    ### **Guidelines**
    1. **Include all banks** (commercial, central, and cooperative banks) that are **operational in {country_name}**.
    2. **Prioritize accuracy**:
        - Cross-check data across multiple sources.
        - Ensure each bank has a valid **SWIFT code** (preferably 8-character, but include 11-character variations where applicable).
        - Use the official **ISO 3166-1 alpha-2** country code for {country_name}.
    3. If a bank **does not have a SWIFT code**, include it **only if** it is a **major institution** (such as central banks or dominant local banks).
    4. **Ensure completeness**:
        - Extract banks even if they appear in tables or unstructured text.
        - Capture **alternative spellings or name variations** where relevant.
    5. **Exclude** banks that are defunct, inactive, or lack enough verifiable information.

    If no valid data is found, **return an empty JSON array `[]`** instead of partial or incorrect data.
    """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "developer", "content": "You are a professional assistant who generates high-quality banking data from multiple sources."},
            {"role": "user", "content": prompt}
        ]
    )
    
    try:
        resp = extract_json(response.choices[0].message.content)
        bank_data_json = json.loads(resp)
        return bank_data_json
    except json.JSONDecodeError:
        print(f"Error decoding JSON for {country_name}: The data format is invalid.")
        return None

In [141]:
import os
# Function to save the bank data to a JSON file using country code
def save_bank_data_to_json(country, bank_data):
    print(f"Saving bank data for {country}...")
    # Create the 'banks_by_country' folder if it doesn't exist
    folder_path = 'data'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    # Check if the response contains valid data
    if bank_data is None:
        print(f"No bank data found for {country}. Skipping...")
        return
    
    # Ensure the data is valid and structured correctly
    try:
        # bank_data_json = json.loads(bank_data.)
        
        # If JSON is empty or doesn't match the expected structure, handle it
        if not isinstance(bank_data, list) or len(bank_data) == 0:
            print(f"No valid bank data for {country}. Skipping...")
            return
        
        # Save to file
        file_path = os.path.join(folder_path, f'{country}.json')
        with open(file_path, 'w') as file:
            json.dump(bank_data, file, indent=4)
        print(f"Bank data for {country} saved successfully in {file_path}.")
    
    except json.JSONDecodeError:
        print(f"Error decoding JSON for {country}: The data format is invalid.")
    except Exception as e:
        print(f"Unexpected error while saving bank data for {country}: {e}")

In [146]:
countrys = [
    "AF", "AL", "DZ", "AD", "AO", "AG", "AR", "AM", "AU", "AT", 
    "AZ", "BS", "BH", "BD", "BB", "BY", "BE", "BZ", "BJ", "BT", "BO", 
    "BA", "BW", "BR", "BN", "BG", "BF", "BI", "CV", "KH", "CM", "CA", 
    "CF", "TD", "CL", "CN", "CO", "KM", "CG", "CR", "HR", "CU", "CY", 
    "CZ", "CD", "DK", "DJ", "DM", "DO", "EC", "EG", "SV", "GQ", "ER", 
    "EE", "SZ", "ET", "FJ", "FI", "FR", "GA", "GM", "GE", "DE", "GH", 
    "GR", "GD", "GT", "GN", "GW", "GY", "HT", "HN", "HU", "IS", "IN", 
    "ID", "IR", "IQ", "IE", "IL", "IT", "JM", "JP", "JO", "KZ", "KE", 
    "KI", "KP", "KR", "KW", "KG", "LA", "LV", "LB", "LS", "LR", "LY", 
    "LI", "LT", "LU", "MG", "MW", "MY", "MV", "ML", "MT", "MH", "MR", 
    "MU", "MX", "FM", "MD", "MC", "MN", "ME", "MA", "MZ", "MM", "NA", 
    "NR", "NP", "NL", "NZ", "NI", "NE", "NG", "MK", "NO", "OM", "PK", 
    "PW", "PA", "PG", "PY", "PE", "PH", "PL", "PT", "QA", "RO", "RU", 
    "RW", "KN", "LC", "VC", "WS", "SM", "ST", "SA", "SN", "RS", "SC", 
    "SL", "SG", "SK", "SI", "SB", "SO", "ZA", "SS", "ES", "LK", "SD", 
    "SR", "SE", "CH", "SY", "TW", "TJ", "TZ", "TH", "TL", "TG", "TO", 
    "TT", "TN", "TR", "TM", "TV", "UG", "UA", "AE", "GB", "US", "UY", 
    "UZ", "VU", "VA", "VE", "VN", "YE", "ZM", "ZW"
];

194


In [145]:

# Loop through all country codes and get bank details
for country in countrys:
    print(f"Fetching bank details for {country}...")
    bank_data = get_bank_details_for_country(country)
    print('bank data:', bank_data)
    save_bank_data_to_json(country, bank_data)

Fetching bank details for AF...
bank data: [{'bank_name': 'Central Bank of Afghanistan', 'swift_code': 'KABLAFAF', 'country_code': 'AF'}, {'bank_name': 'Afghanistan International Bank', 'swift_code': 'AIBKAFKX', 'country_code': 'AF'}, {'bank_name': 'Kabul Bank', 'swift_code': 'KABUAFKX', 'country_code': 'AF'}, {'bank_name': 'Azizi Bank', 'swift_code': 'AZBKAFKX', 'country_code': 'AF'}, {'bank_name': 'Habib Bank Limited', 'swift_code': 'HABBPKKA', 'country_code': 'AF'}, {'bank_name': 'Bank-E-Millie Afghan', 'swift_code': 'BEMIAFAF', 'country_code': 'AF'}, {'bank_name': 'Pashtany Bank', 'swift_code': 'PSTNBAFA', 'country_code': 'AF'}, {'bank_name': 'First MicroFinance Bank Afghanistan', 'swift_code': 'FMBAAFKX', 'country_code': 'AF'}, {'bank_name': 'Mutahid Daman Bank', 'swift_code': 'MDBKAFKX', 'country_code': 'AF'}, {'bank_name': 'DAB', 'swift_code': 'DABBAFAF', 'country_code': 'AF'}]
Saving bank data for AF...
Bank data for AF saved successfully in banks_by_country\AF.json.
Fetching b

Exploratory Data Analysis (EDA) on the extracted bank dataset to gain insights into its structure and quality.

In [36]:
import os
import json
import pandas as pd

# Define the folder containing bank JSON files
folder_path = "data"

# Load all JSON files into a DataFrame
bank_data = []

for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        country_code = filename.replace(".json", "")
        file_path = os.path.join(folder_path, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            try:
                banks = json.load(file)
                for bank in banks:
                    bank["file_country_code"] = country_code  # Track file origin
                    bank_data.append(bank)
            except json.JSONDecodeError:
                print(f"⚠️ Error reading {filename}. Skipping.")

# Convert to DataFrame
df = pd.DataFrame(bank_data)


In [37]:
# Display basic dataset info
print("\n🔍 Dataset Overview:")
print(df.info())


🔍 Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1842 entries, 0 to 1841
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   bank_name          1842 non-null   object
 1   swift_code         1842 non-null   object
 2   country            1842 non-null   object
 3   file_country_code  1842 non-null   object
dtypes: object(4)
memory usage: 57.7+ KB
None


In [38]:
# Basic statistics
print("\n📊 Summary Statistics:")
print(df.describe(include="all"))


📊 Summary Statistics:
              bank_name swift_code country file_country_code
count              1842       1842    1842              1842
unique             1769       1797     149               149
top     Raiffeisen Bank   CITIUS33      US                US
freq                  4          8     100               100


In [39]:
# Count total banks in dataset
total_banks = len(df)
print(f"\n🏦 Total Banks: {total_banks}")


🏦 Total Banks: 1842


In [40]:
# Number of unique countries represented
unique_countries = df["country"].nunique()
print(f"🌍 Unique Countries Covered: {unique_countries}")

🌍 Unique Countries Covered: 149


In [41]:
# Top 10 countries with most banks
print("\n📌 Top 10 Countries by Number of Banks:")
print(df["country"].value_counts().head(10))


📌 Top 10 Countries by Number of Banks:
country
US    100
AE     45
ET     33
SA     28
BD     26
PH     22
VN     22
JP     20
TZ     20
BR     20
Name: count, dtype: int64


In [42]:

# Banks missing SWIFT codes
missing_swift = df["swift_code"].isna().sum()
print(f"\n⚠️ Banks Without SWIFT Codes: {missing_swift}")


⚠️ Banks Without SWIFT Codes: 0


In [43]:
# Check unique SWIFT codes
unique_swift_codes = df["swift_code"].nunique()
print(f"\n🔢 Unique SWIFT Codes: {unique_swift_codes}")


🔢 Unique SWIFT Codes: 1797


In [44]:

# Check mismatched country codes
mismatched_codes = df[df["country"] != df["file_country_code"]]
print(f"\n🚨 Mismatched Country Codes: {len(mismatched_codes)}")
print(mismatched_codes.head())


🚨 Mismatched Country Codes: 0
Empty DataFrame
Columns: [bank_name, swift_code, country, file_country_code]
Index: []


In [45]:
# Check for duplicate bank names
duplicate_banks = df["bank_name"].value_counts()
print("\n🔁 Potential Duplicate Bank Names:")
print(duplicate_banks[duplicate_banks > 1].head(10))


🔁 Potential Duplicate Bank Names:
bank_name
Raiffeisen Bank                4
Westpac Banking Corporation    4
Danske Bank A/S                4
Banco Sol                      4
UniCredit Bank                 4
Habib Bank Limited             4
Afriland First Bank            3
RHB Bank Berhad                3
Banco Promerica                3
Lebanon & Gulf Bank            3
Name: count, dtype: int64
