<a href="https://colab.research.google.com/github/Articbug/Telecom-CDR-Analytics-Platform/blob/main/Notebooks/5_CDR_Snowflake_Optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
#   SNOWFLAKE OPTIMIZATION & SECURITY
#   Wipro CDR Analytics Project
#   Author: Chandan Sahoo, Bismaya Ranjan Sahoo, Debasish Sahoo
# ============================================================

import subprocess, sys

# Install the runtime dependencies into the interpreter that runs this kernel.
packages = ['snowflake-connector-python', 'pandas']
for package in packages:
    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', package, '--quiet', '--disable-pip-version-check'],
        capture_output=True,
        text=True,
    )
    # The output is captured, so a failed install would otherwise go unnoticed
    # and the success message below would be printed regardless.
    if result.returncode != 0:
        raise RuntimeError(f'pip install {package} failed:\n{result.stderr}')

print('All libraries installed successfully!')

import snowflake.connector
import pandas as pd

def get_connection():
    """Open a new Snowflake connection for this notebook.

    No secret is stored in the notebook. The password is taken from the
    ``SNOWFLAKE_PASSWORD`` environment variable; when that is unset it is
    prompted for with ``getpass`` and cached in ``os.environ`` so later calls
    in the same kernel do not prompt again.

    The remaining settings can be overridden through ``SNOWFLAKE_ACCOUNT``,
    ``SNOWFLAKE_USER``, ``SNOWFLAKE_DATABASE``, ``SNOWFLAKE_SCHEMA`` and
    ``SNOWFLAKE_WAREHOUSE``; their defaults are the values this notebook was
    written against.

    Returns:
        The object returned by ``snowflake.connector.connect``. The caller is
        responsible for closing it.
    """
    import getpass
    import os

    # NOTE(security): a password was previously committed in this cell; it
    # should be rotated, since it remains visible in the repository history.
    password = os.environ.get('SNOWFLAKE_PASSWORD')
    if not password:
        password = getpass.getpass('Snowflake password: ')
        if password:
            os.environ['SNOWFLAKE_PASSWORD'] = password

    return snowflake.connector.connect(
        account=os.environ.get('SNOWFLAKE_ACCOUNT', 'bopsoxz-lr52214'),
        user=os.environ.get('SNOWFLAKE_USER', 'CHANDANSAHOO'),
        password=password,
        database=os.environ.get('SNOWFLAKE_DATABASE', 'TELECOM_DWH'),
        schema=os.environ.get('SNOWFLAKE_SCHEMA', 'DWH'),
        warehouse=os.environ.get('SNOWFLAKE_WAREHOUSE', 'TRANSFORM_WH'),
    )

# Smoke-test the connection and report the active session context.
conn = get_connection()
try:
    cursor = conn.cursor()
    cursor.execute('SELECT CURRENT_USER(), CURRENT_DATABASE(), CURRENT_WAREHOUSE()')
    row = cursor.fetchone()
    cursor.close()
finally:
    # Release the session even when the probe query fails.
    conn.close()

print('Connected successfully!')
print(f'   User:      {row[0]}')
print(f'   Database:  {row[1]}')
print(f'   Warehouse: {row[2]}')

All libraries installed successfully!
Connected successfully!
   User:      CHANDANSAHOO
   Database:  TELECOM_DWH
   Warehouse: TRANSFORM_WH


In [None]:
# ============================================================
#   CELL 2: CLUSTERING KEYS
# ============================================================
print('=' * 55)
print('   1. CLUSTERING KEYS')
print('=' * 55)

conn = get_connection()
try:
    # try/finally guarantees the session is released if any statement fails.
    cursor = conn.cursor()

    # Define DATE_KEY as the clustering key of the fact table.
    cursor.execute('ALTER TABLE TELECOM_DWH.DWH.FACT_CDR CLUSTER BY (DATE_KEY)')
    print('✅ Clustering key applied on FACT_CDR (DATE_KEY)')
    print('   Benefit: Queries filtering by date scan less data')

    # Define CALL_TYPE as the clustering key of the staging table.
    cursor.execute('ALTER TABLE TELECOM_DWH.STAGING.STG_CDR CLUSTER BY (CALL_TYPE)')
    print('✅ Clustering key applied on STG_CDR (CALL_TYPE)')
    print('   Benefit: Queries filtering by call type run faster')

    # Fetch the clustering report for FACT_CDR; the first column of the single
    # result row is printed as-is.
    cursor.execute("SELECT SYSTEM$CLUSTERING_INFORMATION('TELECOM_DWH.DWH.FACT_CDR')")
    info = cursor.fetchone()[0]
    print('\nFACT_CDR Clustering Info:')
    print(f'   {info}')

    cursor.close()
finally:
    conn.close()
print('\n✅ Clustering Keys Complete!')

   1. CLUSTERING KEYS
✅ Clustering key applied on FACT_CDR (DATE_KEY)
   Benefit: Queries filtering by date scan less data
✅ Clustering key applied on STG_CDR (CALL_TYPE)
   Benefit: Queries filtering by call type run faster

FACT_CDR Clustering Info:
   {
  "cluster_by_keys" : "LINEAR(DATE_KEY)",
  "notes" : "Clustering key columns contain high cardinality key DATE_KEY which might result in expensive re-clustering. Consider reducing the cardinality of clustering keys. Please refer to https://docs.snowflake.net/manuals/user-guide/tables-clustering-keys.html for more information.",
  "total_partition_count" : 1,
  "total_constant_partition_count" : 0,
  "average_overlaps" : 0.0,
  "average_depth" : 1.0,
  "partition_depth_histogram" : {
    "00000" : 0,
    "00001" : 1,
    "00002" : 0,
    "00003" : 0,
    "00004" : 0,
    "00005" : 0,
    "00006" : 0,
    "00007" : 0,
    "00008" : 0,
    "00009" : 0,
    "00010" : 0,
    "00011" : 0,
    "00012" : 0,
    "00013" : 0,
    "00014" : 0,

In [None]:
# ============================================================
#   CELL 3: TIME TRAVEL
# ============================================================
print('=' * 55)
print('   2. TIME TRAVEL')
print('=' * 55)

conn = get_connection()
try:
    # try/finally guarantees the session is released if any statement in this
    # multi-step demo fails part-way through.
    cursor = conn.cursor()

    # Set the Time Travel retention period on both tables.
    cursor.execute('ALTER TABLE TELECOM_DWH.DWH.FACT_CDR SET DATA_RETENTION_TIME_IN_DAYS = 7')
    cursor.execute('ALTER TABLE TELECOM_DWH.STAGING.STG_CDR SET DATA_RETENTION_TIME_IN_DAYS = 7')
    print('✅ Time Travel retention set to 7 days')
    print('   FACT_CDR  → 7 days')
    print('   STG_CDR   → 7 days')

    # Step 1: rebuild FACT_CDR from staging so the demo starts from a full load.
    # CALLEE_KEY is loaded as the constant -1; rows without a match in any of
    # the four dimension tables are dropped by the inner joins.
    cursor.execute('TRUNCATE TABLE TELECOM_DWH.DWH.FACT_CDR')
    cursor.execute('''
        INSERT INTO TELECOM_DWH.DWH.FACT_CDR (
            CALL_ID, DATE_KEY, CALLER_KEY, CALLEE_KEY,
            CELL_KEY, CALL_TYPE_KEY, DURATION_SECS,
            CHARGE_AMOUNT, DATA_VOLUME_MB, IS_ROAMING,
            IS_FRAUD, TERMINATION_CD, NETWORK_TYPE
        )
        SELECT
            s.CALL_ID,
            d.DATE_KEY,
            sub.SUBSCRIBER_KEY,
            -1,
            c.CELL_KEY,
            ct.CALL_TYPE_KEY,
            s.DURATION_SECONDS,
            s.CHARGE_AMOUNT,
            s.DATA_VOLUME_MB,
            s.IS_ROAMING,
            s.IS_FRAUD,
            s.TERMINATION_CD,
            s.NETWORK_TYPE
        FROM TELECOM_DWH.STAGING.STG_CDR s
        JOIN TELECOM_DWH.DWH.DIM_DATE d
            ON DATE(s.CALL_START_TIME) = d.FULL_DATE
        JOIN TELECOM_DWH.DWH.DIM_SUBSCRIBER sub
            ON s.CALLING_NUMBER = sub.MSISDN
        JOIN TELECOM_DWH.DWH.DIM_CELL_TOWER c
            ON s.CELL_ID = c.CELL_ID
        JOIN TELECOM_DWH.DWH.DIM_CALL_TYPE ct
            ON s.CALL_TYPE = ct.CALL_TYPE_CODE
    ''')
    cursor.execute('SELECT COUNT(*) FROM TELECOM_DWH.DWH.FACT_CDR')
    full_count = cursor.fetchone()[0]
    print(f'\nStep 1 - Restored full records: {full_count:,}')

    # Step 2: delete one day of data and remember the DELETE's query ID, which
    # is the reference point for the Time Travel read in step 3.
    cursor.execute('DELETE FROM TELECOM_DWH.DWH.FACT_CDR WHERE DATE_KEY = 20240101')
    delete_query_id = cursor.sfqid
    cursor.execute('SELECT COUNT(*) FROM TELECOM_DWH.DWH.FACT_CDR')
    after_delete = cursor.fetchone()[0]
    deleted = full_count - after_delete
    print(f'Step 2 - Records deleted:        {deleted:,}')
    print(f'         Records remaining:       {after_delete:,}')

    # Step 3: re-insert the deleted rows by reading the table as it was
    # immediately before the DELETE statement ran.
    cursor.execute(f'''
        INSERT INTO TELECOM_DWH.DWH.FACT_CDR
        SELECT * FROM TELECOM_DWH.DWH.FACT_CDR
        BEFORE (STATEMENT => '{delete_query_id}')
        WHERE DATE_KEY = 20240101
    ''')
    cursor.execute('SELECT COUNT(*) FROM TELECOM_DWH.DWH.FACT_CDR')
    recovered = cursor.fetchone()[0]
    print(f'Step 3 - Records after recovery: {recovered:,}')

    # The recovery is considered successful when the row count is back to the
    # count measured right after the full load.
    if recovered == full_count:
        print('\n✅ Time Travel Recovery Successful!')
        print(f'   Recovered {deleted:,} deleted records perfectly!')
    else:
        print(f'\n⚠️  Count mismatch — expected {full_count:,}, got {recovered:,}')

    conn.commit()
    cursor.close()
finally:
    conn.close()
print('\n✅ Time Travel Complete!')

   2. TIME TRAVEL
✅ Time Travel retention set to 7 days
   FACT_CDR  → 7 days
   STG_CDR   → 7 days

Step 1 - Restored full records: 50,000
Step 2 - Records deleted:        138
         Records remaining:       49,862
Step 3 - Records after recovery: 50,000

✅ Time Travel Recovery Successful!
   Recovered 138 deleted records perfectly!

✅ Time Travel Complete!


In [None]:
# ============================================================
#   CELL 4: SEMI-STRUCTURED JSON DATA (VARIANT)
# ============================================================
print('=' * 55)
print('   3. SEMI-STRUCTURED JSON DATA')
print('=' * 55)

conn = get_connection()
try:
    # try/finally guarantees the session is released if any statement fails.
    cursor = conn.cursor()

    # (Re)create the demo table; CDR_METADATA holds the per-call JSON document.
    cursor.execute('''
        CREATE OR REPLACE TABLE TELECOM_DWH.STAGING.CDR_JSON (
            CALL_ID      VARCHAR(50),
            CALL_DATE    DATE,
            CDR_METADATA VARIANT
        )
    ''')
    print('✅ CDR_JSON table created with VARIANT column')

    # Build the JSON document with OBJECT_CONSTRUCT instead of concatenating
    # text for PARSE_JSON: string concatenation produces invalid JSON when a
    # value contains a quote, and yields a NULL document when any field is NULL.
    # ORDER BY makes the 1000-row sample reproducible across runs.
    cursor.execute('''
        INSERT INTO TELECOM_DWH.STAGING.CDR_JSON
        SELECT
            CALL_ID,
            DATE(CALL_START_TIME),
            OBJECT_CONSTRUCT(
                'call_type',  CALL_TYPE::VARCHAR,
                'network',    NETWORK_TYPE::VARCHAR,
                'cell_id',    CELL_ID::VARCHAR,
                'duration',   DURATION_SECONDS,
                'charge',     CHARGE_AMOUNT,
                'is_roaming', IFF(IS_ROAMING, TRUE, FALSE),
                'is_fraud',   IFF(IS_FRAUD, TRUE, FALSE)
            )::VARIANT
        FROM TELECOM_DWH.STAGING.STG_CDR
        ORDER BY CALL_ID
        LIMIT 1000
    ''')
    # Report the number of rows the INSERT actually wrote rather than assuming
    # the LIMIT was reached.
    print(f'✅ {cursor.rowcount} CDR records inserted as JSON')

    # Extract typed fields from the VARIANT column with path + cast syntax.
    cursor.execute('''
        SELECT
            CALL_ID,
            CALL_DATE,
            CDR_METADATA:call_type::VARCHAR  AS call_type,
            CDR_METADATA:network::VARCHAR    AS network,
            CDR_METADATA:duration::INT       AS duration,
            CDR_METADATA:charge::FLOAT       AS charge,
            CDR_METADATA:is_roaming::BOOLEAN AS is_roaming,
            CDR_METADATA:is_fraud::BOOLEAN   AS is_fraud
        FROM TELECOM_DWH.STAGING.CDR_JSON
        ORDER BY CALL_ID
        LIMIT 10
    ''')
    rows    = cursor.fetchall()
    cols    = [d[0] for d in cursor.description]
    df_json = pd.DataFrame(rows, columns=cols)
    print('\n✅ JSON fields extracted successfully:')
    print(df_json.to_string(index=False))

    # Aggregate directly on JSON fields: call count and revenue per call type.
    cursor.execute('''
        SELECT
            CDR_METADATA:call_type::VARCHAR AS call_type,
            COUNT(*)                        AS total_calls,
            ROUND(SUM(CDR_METADATA:charge::FLOAT), 2) AS total_revenue
        FROM TELECOM_DWH.STAGING.CDR_JSON
        GROUP BY call_type
        ORDER BY total_revenue DESC
    ''')
    rows      = cursor.fetchall()
    cols      = [d[0] for d in cursor.description]
    df_agg    = pd.DataFrame(rows, columns=cols)
    print('\n✅ JSON Aggregation by Call Type:')
    print(df_agg.to_string(index=False))

    cursor.close()
finally:
    conn.close()
print('\n✅ Semi-Structured JSON Complete!')

   3. SEMI-STRUCTURED JSON DATA
✅ CDR_JSON table created with VARIANT column
✅ 1000 CDR records inserted as JSON

✅ JSON fields extracted successfully:
    CALL_ID  CALL_DATE CALL_TYPE NETWORK  DURATION  CHARGE  IS_ROAMING  IS_FRAUD
CDR00000001 2024-10-13       SMS      5G         0  0.1000       False     False
CDR00000002 2024-09-02     VOICE      4G        84  0.7000       False     False
CDR00000003 2024-11-28     VOICE      5G       541  4.5083       False     False
CDR00000004 2024-07-08      DATA      4G       790  0.4458       False     False
CDR00000005 2024-03-19     VOICE      5G        30  0.2500       False     False
CDR00000006 2024-11-24     VOICE      3G        30  0.2500       False     False
CDR00000007 2024-11-24      DATA      2G        35  0.9821       False     False
CDR00000008 2024-11-18     VOICE      4G       165  1.3750       False     False
CDR00000009 2024-09-04       SMS      5G         0  0.1000       False     False
CDR00000010 2024-12-23     VOICE      