In [1]:
# Make Google Drive visible on the Colab filesystem; the notebook's data and
# outputs are read from and written to paths below this mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Project root on the mounted Drive. The database path and the schema output
# path used later in this notebook are both built from this value.
base_folder = "/content/drive/MyDrive/Colab Notebooks/customer_churn"
# Switch the working directory to the project root. IPython expands
# {base_folder} before running the magic; the quotes keep the path, which
# contains a space, as a single argument.
%cd "{base_folder}"

/content/drive/MyDrive/Colab Notebooks/customer_churn


In [3]:
import sqlite3
from contextlib import closing

import pandas as pd

# Load customer_fact joined to the gender and geography tables so the frame
# carries g.gender and geo.geography rather than the *_id keys.
#
# Every selected column has an explicit AS alias: SQLite leaves result-column
# names unspecified when AS is omitted, and the analysis cell indexes the
# frame by these exact spellings (e.g. 'creditScore', 'numofProducts').
#
# closing() releases the connection even if the query raises. sqlite3's own
# `with conn:` form only scopes a transaction and does not close the handle.
with closing(sqlite3.connect(f"{base_folder}/data/customer_churn.db")) as conn:
    customer = pd.read_sql_query(
        """
        SELECT
            c.customerId      AS customerId,
            c.surname         AS surname,
            c.CreditScore     AS creditScore,
            c.age             AS age,
            c.tenure          AS tenure,
            c.balance         AS balance,
            c.numofproducts   AS numofProducts,
            c.hascrcard       AS hasCrCard,
            c.IsActiveMember  AS isActiveMember,
            g.gender          AS gender,
            geo.geography     AS geography,
            c.estimatedSalary AS estimatedSalary,
            c.exited          AS exited
        FROM customer_fact AS c
        JOIN gender AS g
            ON g.gender_id = c.gender_id
        JOIN geography AS geo
            ON geo.geography_id = c.geography_id
        ORDER BY c.customerId
        """,
        conn,
    )

# Preview the first rows as the cell's rich output.
customer.head()

Unnamed: 0,customerId,surname,creditScore,age,tenure,balance,numofProducts,hasCrCard,isActiveMember,gender,geography,estimatedSalary,exited
0,15565701,Ferri,698,39,9,161993.89,1,0,0,Female,Spain,90212.38,0
1,15565706,Akobundu,612,35,1,0.0,1,1,1,Male,Spain,83256.26,1
2,15565714,Cattaneo,601,47,1,64430.06,2,0,1,Male,France,96517.97,0
3,15565779,Kent,627,30,6,57809.32,1,1,0,Female,Germany,188258.49,0
4,15565796,Docherty,745,48,10,96048.55,1,1,0,Male,Germany,74510.65,0


In [4]:
# =============================================================================
# BUILD THE INPUT SCHEMA FOR THE STREAMLIT APP
# Numerical features  -> min / max / mean / median
# Categorical features -> distinct values and their frequencies
# The result is written to data/data_schema.json and echoed below.
# =============================================================================

import json


def _banner(text, rule_char="-", lead_blank=True):
    """Print `text` framed above and below by an 80-character rule.

    When `lead_blank` is true the top rule is preceded by an empty line.
    """
    rule = rule_char * 80
    print(("\n" + rule) if lead_blank else rule)
    print(text)
    print(rule)


_banner("ANALYZING CUSTOMER DATA FOR STREAMLIT APP", "=", lead_blank=False)

# Flag customers whose balance is exactly zero (1) versus anything else (0).
customer["isZeroBalance"] = customer["balance"].eq(0).astype(int)

# Columns summarised for the prediction form.
numerical_features = [
    'creditScore', 'age', 'tenure',
    'balance', 'numofProducts', 'estimatedSalary',
]
categorical_features = [
    'hasCrCard', 'isActiveMember',
    'gender', 'geography',
    'isZeroBalance',
]

# Filled in by the two loops below, then serialised to JSON.
data_schema = {"numerical": {}, "categorical": {}}

# ---- numerical features: one table row per column --------------------------
_banner("NUMERICAL FEATURES")
print("{:<25} {:<15} {:<15} {:<15} {:<15}".format("Feature", "Min", "Max", "Mean", "Median"))
print("-" * 80)

row_template = "{:<25} {:<15.2f} {:<15.2f} {:<15.2f} {:<15.2f}"
for feature in numerical_features:
    column = customer[feature]
    # float() converts the pandas/numpy scalars into plain Python floats.
    stats = {
        "min": float(column.min()),
        "max": float(column.max()),
        "mean": float(column.mean()),
        "median": float(column.median()),
    }
    data_schema["numerical"][feature] = stats
    print(row_template.format(feature, stats["min"], stats["max"], stats["mean"], stats["median"]))

# ---- categorical features: distinct values plus their share of rows --------
_banner("CATEGORICAL FEATURES")

total_rows = len(customer)
for feature in categorical_features:
    column = customer[feature]
    levels = column.unique().tolist()
    frequencies = column.value_counts().to_dict()

    data_schema["categorical"][feature] = {
        "unique_values": levels,
        "value_counts": frequencies,
    }

    print(f"\n{feature}:")
    print(f"  Unique values: {levels}")
    print("  Value counts:")
    for level, n_rows in frequencies.items():
        print(f"    {level}: {n_rows} ({n_rows / total_rows * 100:.1f}%)")

# ---- persist the schema, then echo it ---------------------------------------
output_file = f"{base_folder}/data/data_schema.json"
with open(output_file, 'w') as schema_file:
    json.dump(data_schema, schema_file, indent=2)

_banner(f"✓ Data schema saved to {output_file}", "=")

_banner("GENERATED SCHEMA (data_schema.json)")
print(json.dumps(data_schema, indent=2))

_banner("DONE! Use data_schema.json in your Streamlit app", "=")

ANALYZING CUSTOMER DATA FOR STREAMLIT APP

--------------------------------------------------------------------------------
NUMERICAL FEATURES
--------------------------------------------------------------------------------
Feature                   Min             Max             Mean            Median         
--------------------------------------------------------------------------------
creditScore               350.00          850.00          650.53          652.00         
age                       18.00           92.00           38.92           37.00          
tenure                    0.00            10.00           5.01            5.00           
balance                   0.00            250898.09       76485.89        97198.54       
numofProducts             1.00            4.00            1.53            1.00           
estimatedSalary           11.58           199992.48       100090.24       100193.91      

----------------------------------------------------------------