# Setup & BigQuery Connection Test

**Objectives:**
1. Test Python environment setup
2. Verify all required libraries are installed
3. Test BigQuery connection
4. Validate access to Olist marts
5. Run sample queries to confirm data availability

## 1. Import Libraries

In [None]:
# Core libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# BigQuery
from google.cloud import bigquery

# Visualization
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities
from tqdm import tqdm
import warnings
# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Pandas display options: no column limit, at most 100 rows, two-decimal floats
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

print("All libraries imported successfully!")
print(f"Execution date: {datetime.now():%Y-%m-%d %H:%M:%S}")

## 2. Check Library Versions

In [None]:
import sys

# plotly.express does not expose a __version__ attribute; the version is
# published on the top-level plotly package, so import that here.
import plotly

# Report the interpreter and key library versions for reproducibility.
print("=" * 60)
print("PYTHON ENVIRONMENT INFO")
print("=" * 60)
print(f"Python version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Plotly version: {plotly.__version__}")
print(f"Seaborn version: {sns.__version__}")
print(f"BigQuery SDK version: {bigquery.__version__}")
print("=" * 60)

## 3. Configure BigQuery Connection

In [None]:
# Project and dataset identifiers used by the cells below
PROJECT_ID = "quintoandar-ecommerce-analysis"
DATASET_RAW = "olist_raw"
DATASET_STAGING = "olist_staging"
DATASET_MARTS = "olist_marts"

# Create the shared BigQuery client. A failure is reported rather than
# raised, so the remaining cells can still be executed.
try:
    client = bigquery.Client(project=PROJECT_ID)
except Exception as init_error:
    print(f"Error initializing BigQuery client: {init_error}")
    print("\nMake sure GOOGLE_APPLICATION_CREDENTIALS is set correctly!")
else:
    print("BigQuery client initialized successfully!")
    print(f"Project ID: {PROJECT_ID}")
    # 'US' is only a display fallback for a client without an explicit location
    print(f"Default location: {client.location or 'US'}")

## 4. Test BigQuery Connection

In [None]:
def test_bigquery_connection():
    """Run a trivial query to confirm that BigQuery can be reached.

    The query selects the current timestamp and a fixed message; the
    resulting DataFrame is printed between banner lines.

    Returns:
        bool: True when the query ran and its result was printed, False
        when any exception was raised along the way (the error is printed).
    """
    banner = "=" * 60
    try:
        probe_sql = """
        SELECT 
            CURRENT_TIMESTAMP() as test_timestamp,
            'Connection successful!' as message
        """

        result = client.query(probe_sql).to_dataframe()

        for line in (
            banner,
            "BIGQUERY CONNECTION TEST",
            banner,
            result,
            banner,
            "BigQuery connection is working!",
        ):
            print(line)
    except Exception as err:
        print(f"BigQuery connection failed: {err}")
        return False
    return True


test_bigquery_connection()

## 5. List Available Datasets

In [None]:
def list_datasets():
    """Print every dataset of the client's project with a few details.

    For each dataset the number of tables, the location and the creation
    date are printed. One metadata lookup and one table listing are issued
    per dataset.

    Returns:
        list: The dataset items returned by the client, or an empty list
        if any call raised (the error is printed).
    """
    rule = "=" * 60
    try:
        found = list(client.list_datasets())

        print(rule)
        print("AVAILABLE DATASETS")
        print(rule)

        if not found:
            print("No datasets found in this project")

        for item in found:
            name = item.dataset_id
            details = client.get_dataset(name)
            n_tables = sum(1 for _ in client.list_tables(name))

            print(name)
            print(f"   └─ Tables: {n_tables}")
            print(f"   └─ Location: {details.location}")
            print(f"   └─ Created: {details.created.strftime('%Y-%m-%d')}")
            print()

        print(rule)
        return found
    except Exception as err:
        print(f"Error listing datasets: {err}")
        return []


datasets = list_datasets()

## 6. List Tables in Marts Dataset

In [None]:
def list_mart_tables():
    """List all tables in the olist_marts dataset and summarise them.

    For every table the row count, size in MB and creation date are
    printed and collected. One metadata lookup is issued per table.

    Returns:
        pandas.DataFrame: One row per table with the columns
        ``table_name``, ``rows``, ``size_mb`` and ``created``. The frame
        is empty when the dataset has no tables or when any call raised
        (the error is printed).
    """
    summary_columns = ['table_name', 'rows', 'size_mb', 'created']

    try:
        dataset_path = f"{PROJECT_ID}.{DATASET_MARTS}"
        tables = client.list_tables(dataset_path)

        print("=" * 60)
        print(f"TABLES IN {DATASET_MARTS}")
        print("=" * 60)

        table_info = []

        for table in tables:
            table_ref = client.get_table(f"{dataset_path}.{table.table_id}")

            # The client library types num_rows, num_bytes and created as
            # Optional. Without these guards a single None would raise in
            # the division / ':,' format below and abort the whole listing.
            num_rows = table_ref.num_rows or 0
            num_bytes = table_ref.num_bytes or 0
            created = (
                table_ref.created.strftime('%Y-%m-%d')
                if table_ref.created is not None
                else 'unknown'
            )

            info = {
                'table_name': table.table_id,
                'rows': num_rows,
                'size_mb': round(num_bytes / (1024 * 1024), 2),
                'created': created,
            }
            table_info.append(info)

            print(f"{table.table_id}")
            print(f"   └─ Rows: {num_rows:,}")
            print(f"   └─ Size: {info['size_mb']:.2f} MB")
            print(f"   └─ Created: {info['created']}")
            print()

        print("=" * 60)
        print(f"Total marts: {len(table_info)}")
        print("=" * 60)

        # Explicit columns keep the schema stable even for an empty dataset.
        return pd.DataFrame(table_info, columns=summary_columns)

    except Exception as e:
        print(f"Error listing mart tables: {e}")
        return pd.DataFrame()


df_marts = list_mart_tables()
print("\nMarts Summary:")
print(df_marts)

## 7. Test Sample Queries on Key Marts

### 7.1 - Test mart_customer_base

In [None]:
def test_customer_base():
    """Query headline metrics from mart_customer_base and print them.

    The aggregate query yields a single row (customer counts, average
    revenue and orders, repeat-customer count and rate), which is printed
    transposed between banner lines.

    Returns:
        pandas.DataFrame: The query result, or an empty DataFrame if the
        query or the printing raised (the error is printed).
    """
    summary_sql = f"""
    SELECT 
        COUNT(*) as total_customers,
        COUNT(DISTINCT customer_id) as unique_customers,
        ROUND(AVG(total_revenue), 2) as avg_revenue,
        ROUND(AVG(total_orders), 2) as avg_orders,
        COUNTIF(is_repeat_customer) as repeat_customers,
        ROUND(COUNTIF(is_repeat_customer) * 100.0 / COUNT(*), 2) as repeat_rate
    FROM `{PROJECT_ID}.{DATASET_MARTS}.mart_customer_base`
    """
    divider = "=" * 60

    try:
        summary = client.query(summary_sql).to_dataframe()
        print(divider)
        print("MART_CUSTOMER_BASE - SUMMARY")
        print(divider)
        print(summary.T)
        print(divider)
    except Exception as err:
        print(f"Error: {err}")
        return pd.DataFrame()
    return summary


df_customer_summary = test_customer_base()

### 7.2 - Test mart_customer_rfm

In [None]:
def test_customer_rfm():
    """Query the RFM segment distribution from mart_customer_rfm.

    Prints one row per ``rfm_segment`` (customer count, total revenue and
    average monetary value, ordered by total revenue descending) and then
    tries to render a bar chart of the distribution.

    Returns:
        pandas.DataFrame: The query result, or an empty DataFrame if the
        query itself raised (the error is printed). A chart rendering
        failure only prints a warning and does not affect the return value.
    """
    query = f"""
    SELECT 
        rfm_segment,
        COUNT(*) as customer_count,
        ROUND(SUM(monetary), 2) as total_revenue,
        ROUND(AVG(monetary), 2) as avg_ltv
    FROM `{PROJECT_ID}.{DATASET_MARTS}.mart_customer_rfm`
    GROUP BY rfm_segment
    ORDER BY total_revenue DESC
    """

    try:
        df = client.query(query).to_dataframe()
    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame()

    print("=" * 60)
    print("MART_CUSTOMER_RFM - SEGMENT DISTRIBUTION")
    print("=" * 60)
    print(df)
    print("=" * 60)

    # The chart is a convenience. Keep it out of the query's error handling
    # so a plotting or renderer problem is not reported as missing data by
    # the validation summary.
    try:
        fig = px.bar(
            df,
            x='rfm_segment',
            y='customer_count',
            title='Customer Distribution by RFM Segment',
            color='avg_ltv',
            color_continuous_scale='Blues',
        )
        fig.show()
    except Exception as e:
        print(f"Warning: could not render RFM chart: {e}")

    return df


df_rfm = test_customer_rfm()

## 8. Final Validation Summary

In [None]:
# Final report: the environment counts as ready only if every mart check
# above produced a non-empty DataFrame.
print("\n" + "=" * 60)
print("SETUP & TEST VALIDATION SUMMARY")
print("=" * 60)

all_passed = not (df_marts.empty or df_customer_summary.empty or df_rfm.empty)

if not all_passed:
    print("\nSome tests failed. Please check the errors above.")
else:
    print("\nALL TESTS PASSED! Environment is ready for analysis!")
    print(f"\nYou have access to {len(df_marts)} mart tables")
    print("\nYou can proceed to create analysis notebooks!")