In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Read the cleaned credit union dataset from disk
zip_col = 'Zip code (Mailing address)'
df = pd.read_csv('cleaned_credit_union_data.csv')

# Zip codes are cast to integers, then to strings, and left-padded with zeros to 5 characters
zip_text = df[zip_col].astype(int).astype(str)
df[zip_col] = zip_text.str.zfill(5)

In [None]:
# Confirm the load was successful: report the dataframe's size and preview its first rows.

print(f"Dataframe is {df.shape[0]} rows deep and {df.shape[1]} columns wide.")

# Lift the column display limit so the preview shows every column
pd.set_option('display.max_columns', None)
display(df.head(n=3))

In [None]:
# Define Metrics and Identify Top 5 Credit Unions

# Metrics for analysis, spelled exactly as the dataframe's column names
metrics = ['Members', 'Total assets', 'Total loans', 'Total deposits']

# Sum members per credit union name, then keep the names of the five largest totals
members_by_name = df.groupby('Credit Union name')['Members'].sum()
top_5_credit_unions = members_by_name.nlargest(5).index

# Keep only the rows whose credit union name is one of those five
is_top_5 = df['Credit Union name'].isin(top_5_credit_unions)
top_5_df = df[is_top_5]

# Show the first rows of the filtered frame as the cell output
top_5_df.head()


In [None]:
# Calculate and Visualize Summary Statistics

# Rows belonging to the top 5 credit unions, largest membership first
in_top_5 = df['Credit Union name'].isin(top_5_credit_unions)
top_5_df = df[in_top_5].sort_values('Members', ascending=False)

# Work on a copy so the numeric columns of top_5_df stay numeric;
# in the copy they are rendered as comma-separated strings with no decimals
numeric_cols = ['Members', 'Total assets', 'Total loans', 'Total deposits']
formatted_top_5_df = top_5_df.copy()
formatted_top_5_df[numeric_cols] = formatted_top_5_df[numeric_cols].map(lambda x: f"{x:,.0f}")

# Print a bold heading (ANSI escape codes), then show the formatted table as the cell output
print('\033[1m' + 'Federally Insured Credit Unions: Top 5 by Membership (Q1 2024)\n'+'\033[0m')
formatted_top_5_df[['Credit Union name'] + numeric_cols]


In [None]:
# 1. Histogram of ROAA Highlighting Top 5 by Total Assets

# 'top_5_assets' holds the five rows of df with the largest 'Total assets'
top_5_assets = df.nlargest(n=5, columns='Total assets')

# Symmetrized log transformation helper
def symlog(x):
    """Return the symmetric log transform of ``x``: ``sign(x) * log1p(|x|)``.

    The sign of the input is preserved while its magnitude is compressed
    with ``log1p``; an input of zero maps to zero.
    """
    magnitude = np.abs(x)
    compressed = np.log1p(magnitude)
    return np.sign(x) * compressed

# Add the symlog-transformed ROAA column to the full dataset and to the top-5 peer group.
# symlog is composed of NumPy ufuncs, so each column is transformed in one vectorized call
# rather than element by element.
df['ROAA_symlog'] = symlog(df['Return on average assets'])
top_5_assets['ROAA_symlog'] = symlog(top_5_assets['Return on average assets'])

# Set seaborn style
sns.set_theme(style="darkgrid")

# Define the color palette
palette = sns.color_palette()

# Compute one set of 300 bin edges from the finite values of the full distribution and reuse
# it for both histograms, so the peer-group bars line up with the population bars instead of
# being binned over their own narrow range.
roaa_all = df['ROAA_symlog']
roaa_bin_edges = np.histogram_bin_edges(roaa_all[np.isfinite(roaa_all)], bins=300)

# Plot histogram of symmetrized log-transformed 'Return on average assets' and highlight the top 5 by 'Total assets'
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['ROAA_symlog'], kde=True, bins=roaa_bin_edges, color=palette[2],
             label='All Credit Unions', alpha=0.25, ax=ax)
sns.histplot(top_5_assets['ROAA_symlog'], kde=False, bins=roaa_bin_edges, color=palette[4],
             label='Top 5 Peer Group (by Total Assets)', ax=ax)

# Order the top 5 credit unions by Return on Average Assets so labels climb from left to right
top_5_assets_sorted = top_5_assets.sort_values(by='Return on average assets')

# Enhance visibility of top 5 with staggered labels: one vertical marker per credit union,
# with label heights spread evenly so neighbouring labels do not overlap
y_offset = np.linspace(10, 80, len(top_5_assets_sorted))  # Adjust y_offset as needed
for i, (idx, row) in enumerate(top_5_assets_sorted.iterrows()):
    ax.axvline(row['ROAA_symlog'], color=palette[4], linestyle='solid', linewidth=2)
    label = f"{row['Credit Union name']}\nROAA: {row['Return on average assets']:.2f}"
    ax.text(row['ROAA_symlog'], y_offset[i], label, rotation=0, verticalalignment='bottom', color='navy',
            bbox=dict(facecolor='white', alpha=0.75, edgecolor='none', boxstyle='round,pad=0.2'), fontsize=7)

fig.suptitle('Distribution of Return on Average Assets feat. Top 5 Credit Unions by Asset Size', fontsize=12, weight='bold')
ax.set_title('Symmetric Log Transformation of ROAA (Q1 2024)\n', fontsize=10, weight='bold')
ax.set_xlabel('Return on Average Assets\n(SymLog-Transformed)', fontsize=11)
ax.set_ylabel('Frequency', fontsize=11)
ax.legend(fontsize=10)
plt.show()


In [None]:
# 2. Closer View of Histogram of ROAA Highlighting Top 5 by Total Assets

# Symmetrized log transformation helper
# NOTE(review): this repeats the symlog definition from the previous histogram cell.
def symlog(x):
    """Return the symmetric log transform of ``x``: ``sign(x) * log1p(|x|)``.

    The sign of the input is preserved while its magnitude is compressed
    with ``log1p``; an input of zero maps to zero.
    """
    magnitude = np.abs(x)
    compressed = np.log1p(magnitude)
    return np.sign(x) * compressed

# Add the symlog-transformed ROAA column to the full dataset and to the top-5 peer group.
# symlog is composed of NumPy ufuncs, so each column is transformed in one vectorized call
# rather than element by element.
df['ROAA_symlog'] = symlog(df['Return on average assets'])
top_5_assets['ROAA_symlog'] = symlog(top_5_assets['Return on average assets'])

# Rebuild the ROAA-ordered peer group here so this cell does not depend on a variable
# left behind by another cell and is guaranteed to carry the 'ROAA_symlog' column.
top_5_assets_sorted = top_5_assets.sort_values(by='Return on average assets')

# Set seaborn style
sns.set_theme(style="darkgrid")

# Define the color palette
palette = sns.color_palette()

# Compute one set of 300 bin edges from the finite values of the full distribution and reuse
# it for both histograms, so the peer-group bars line up with the population bars instead of
# being binned over their own narrow range.
roaa_all = df['ROAA_symlog']
roaa_bin_edges = np.histogram_bin_edges(roaa_all[np.isfinite(roaa_all)], bins=300)

# Plot histogram of symmetrized log-transformed 'Return on average assets' and highlight the top 5 by 'Total assets'
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['ROAA_symlog'], kde=True, bins=roaa_bin_edges, color=palette[2],
             label='All Credit Unions', alpha=0.25, ax=ax)
sns.histplot(top_5_assets['ROAA_symlog'], kde=False, bins=roaa_bin_edges, color=palette[4],
             label='Top 5 Peer Group (by Total Assets)', ax=ax)

# Enhance visibility of top 5 with staggered labels: one vertical marker per credit union,
# with label heights spread evenly so neighbouring labels do not overlap
y_offset = np.linspace(15, 90, len(top_5_assets_sorted))  # Adjust y_offset as needed
for i, (idx, row) in enumerate(top_5_assets_sorted.iterrows()):
    ax.axvline(row['ROAA_symlog'], color=palette[4], linestyle='solid', linewidth=2)
    label = f"{row['Credit Union name']}\nROAA: {row['Return on average assets']:.2f}"
    ax.text(row['ROAA_symlog'], y_offset[i], label, rotation=0, verticalalignment='bottom', color='navy',
            bbox=dict(facecolor='white', alpha=0.75, edgecolor='none', boxstyle='round,pad=0.2'), fontsize=7)

fig.suptitle('Closer View: Distribution of ROAA feat. Top 5 Credit Unions by Asset Size', fontsize=12, weight='bold')
ax.set_title('Symmetric Log Transformation of ROAA (Q1 2024)\n', fontsize=10, weight='bold')
ax.set_xlabel('Return on Average Assets\n(SymLog-Transformed)', fontsize=11)
ax.set_ylabel('Frequency', fontsize=11)
ax.legend(fontsize=10)
# Zoom the x-axis in on the central part of the transformed distribution
ax.set_xlim(-1, 1.5)
plt.show()


In [None]:
# 3. Histogram of Loan-to-Share Ratio Highlighting Top 5 by Total Loans

# Filter the top 5 credit unions by 'Total loans'
top_5_loans = df.nlargest(5, 'Total loans')

# Bin the population once with the same 300 bins that are plotted. The resulting counts are
# used for the marker heights below and the edges are shared by both histograms, so the
# looked-up bar height always matches the bar that is actually drawn.
hist_values, bin_edges = np.histogram(df['Loan-to-share ratio'].dropna(), bins=300)

# Set seaborn style
sns.set_theme(style="darkgrid")

# Define the color palette
palette = sns.color_palette()

# Plot the histogram
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['Loan-to-share ratio'], kde=True, bins=bin_edges, color=palette[0],
             label='All Credit Unions', alpha=0.25, ax=ax)
sns.histplot(top_5_loans['Loan-to-share ratio'], kde=False, bins=bin_edges, color=palette[2],
             label='Top 5 Peer Group (by Total Loans)', ax=ax)

# Order the top 5 credit unions by Loan-to-Share Ratio
top_5_loans_sorted = top_5_loans.sort_values(by='Loan-to-share ratio')

# Enhance visibility of top 5 with labels
y_offset = np.linspace(8, 40, len(top_5_loans_sorted))  # Adjust y_offset as needed
y_top = ax.get_ylim()[1]  # axvline's ymax is a fraction of the axes height, so scale by the y-limit
last_bin = len(hist_values) - 1
for i, (idx, row) in enumerate(top_5_loans_sorted.iterrows()):
    ratio = row['Loan-to-share ratio']
    if pd.isna(ratio):
        # No position on the x-axis to mark
        continue
    # np.digitize places a value equal to the last edge one past the final bin;
    # clamp it so the credit union with the maximum ratio still gets a marker and label.
    bin_index = min(np.digitize(ratio, bin_edges) - 1, last_bin)
    height = hist_values[bin_index]  # Get the height of the corresponding bin
    ax.axvline(ratio, color=palette[2], linestyle='solid', linewidth=2, ymax=height / y_top)
    label = f"{row['Credit Union name']}\nLoan-to-Share Ratio: {row['Loan-to-share ratio']:.2f}"
    ax.text(ratio, y_offset[i], label, rotation=0, verticalalignment='bottom', color='navy',
            bbox=dict(facecolor='white', alpha=0.75, edgecolor='none', boxstyle='round,pad=0.2'), fontsize=8)

ax.set_title('Distribution of Loan-to-Share Ratio feat. Top 5 Credit Unions by Total Loans', fontsize=12, weight='bold')
ax.set_xlabel('Loan-to-Share Ratio', fontsize=11)
ax.set_ylabel('Frequency', fontsize=11)
ax.legend(fontsize=10)
plt.show()


In [None]:
# Histogram of Net Worth Ratio Highlighting Top 5 by Total Deposits

# Filter the top 5 credit unions by 'Total deposits'
top_5_deposits = df.nlargest(5, 'Total deposits')

net_worth_col = 'Net worth ratio (excludes CECL transition provision)'

# Bin the population once with the same 300 bins that are plotted. The resulting counts are
# used for the marker heights below and the edges are shared by both histograms, so the
# looked-up bar height always matches the bar that is actually drawn.
hist_values, bin_edges = np.histogram(df[net_worth_col].dropna(), bins=300)

# Define the color palette here so this cell does not rely on a variable set by another cell
palette = sns.color_palette()

# Plot the histogram
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df[net_worth_col], kde=True, bins=bin_edges, color=palette[0],
             label='All Credit Unions', ax=ax)
sns.histplot(top_5_deposits[net_worth_col], kde=False, bins=bin_edges, color=palette[2],
             label='Top 5 Peer Group (by Total Deposits)', ax=ax)

# Order the top 5 credit unions by Net Worth Ratio
top_5_deposits_sorted = top_5_deposits.sort_values(by=net_worth_col)

# Enhance visibility of top 5 with labels
y_offset = np.linspace(8, 120, len(top_5_deposits_sorted))  # Adjust y_offset as needed
y_top = ax.get_ylim()[1]  # axvline's ymax is a fraction of the axes height, so scale by the y-limit
last_bin = len(hist_values) - 1
for i, (idx, row) in enumerate(top_5_deposits_sorted.iterrows()):
    net_worth_ratio = row[net_worth_col]
    if pd.isna(net_worth_ratio):
        # No position on the x-axis to mark
        continue
    # np.digitize places a value equal to the last edge one past the final bin;
    # clamp it so the credit union with the maximum ratio still gets a marker and label.
    bin_index = min(np.digitize(net_worth_ratio, bin_edges) - 1, last_bin)
    height = hist_values[bin_index]  # Get the height of the corresponding bin
    ax.axvline(net_worth_ratio, color=palette[2], linestyle='solid', linewidth=2, ymax=height / y_top)
    label = f"{row['Credit Union name']}\nNet worth Ratio: {row['Net worth ratio (excludes CECL transition provision)']:.2f}"
    ax.text(net_worth_ratio, y_offset[i], label, rotation=0, verticalalignment='bottom', color='navy',
            bbox=dict(facecolor='white', alpha=0.75, edgecolor='none', boxstyle='round,pad=0.2'), fontsize=8)

ax.set_title('Distribution of Net Worth Ratio feat. Top 5 Credit Unions by Total Deposits', fontsize=12, weight='bold')
ax.set_xlabel('Net Worth Ratio (excludes CECL transition provision)', fontsize=11)
ax.set_ylabel('Frequency', fontsize=11)
ax.legend(fontsize=10)
plt.show()
