<a href="https://colab.research.google.com/github/Articbug/Telecom-CDR-Analytics-Platform/blob/main/Notebooks/3_CDR_ETL_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================================
#   CDR ETL PIPELINE
#   Wipro CDR Analytics Project
#   Author: Chandan Sahoo
# ============================================================

import subprocess, sys

# Install the notebook's third-party dependencies into the kernel's own
# interpreter (sys.executable), so the imports below resolve in this session.
packages = ['snowflake-connector-python', 'pandas']
failed_installs = {}
for package in packages:
    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', package, '--quiet', '--disable-pip-version-check'],
        capture_output=True,
        text=True,
    )
    # pip's output is captured, so a failure would otherwise be invisible:
    # keep stderr for every package whose install exited non-zero.
    if result.returncode != 0:
        failed_installs[package] = result.stderr.strip()

if failed_installs:
    for package, error_text in failed_installs.items():
        print(f'Failed to install {package}:\n{error_text}')
    # Stop here rather than reporting success and failing later on import.
    raise RuntimeError(f'pip install failed for: {", ".join(failed_installs)}')

print('All libraries installed successfully!')

import snowflake.connector
import pandas as pd
from datetime import datetime

def get_connection():
    """Open a new Snowflake connection for the CDR pipeline.

    Connection settings are read from environment variables so that no
    secret is stored in the notebook:

    * ``SNOWFLAKE_PASSWORD`` - required; when unset or empty the user is
      prompted interactively via ``getpass`` (input is not echoed).
    * ``SNOWFLAKE_ACCOUNT``, ``SNOWFLAKE_USER``, ``SNOWFLAKE_DATABASE``,
      ``SNOWFLAKE_SCHEMA``, ``SNOWFLAKE_WAREHOUSE``, ``SNOWFLAKE_ROLE`` -
      optional overrides; each falls back to the value this notebook used
      previously.

    Returns:
        The connection object returned by ``snowflake.connector.connect``.
        The caller is responsible for closing it.
    """
    # Local imports keep this cell self-contained without touching the
    # notebook's import block.
    import os
    from getpass import getpass

    password = os.environ.get('SNOWFLAKE_PASSWORD') or getpass('Snowflake password: ')
    return snowflake.connector.connect(
        account   = os.environ.get('SNOWFLAKE_ACCOUNT', 'bopsoxz-lr52214'),
        user      = os.environ.get('SNOWFLAKE_USER', 'CHANDANSAHOO'),
        password  = password,
        database  = os.environ.get('SNOWFLAKE_DATABASE', 'TELECOM_DWH'),
        schema    = os.environ.get('SNOWFLAKE_SCHEMA', 'STAGING'),
        warehouse = os.environ.get('SNOWFLAKE_WAREHOUSE', 'TRANSFORM_WH'),
        # NOTE(review): ACCOUNTADMIN is kept as the default only for backward
        # compatibility; a least-privilege ETL role would be safer.
        role      = os.environ.get('SNOWFLAKE_ROLE', 'ACCOUNTADMIN'),
    )

# Connectivity smoke test: open a session, read back its context, and always
# release the cursor and connection, even if the query fails.
conn = get_connection()
try:
    cursor = conn.cursor()
    try:
        cursor.execute('SELECT CURRENT_USER(), CURRENT_ROLE(), CURRENT_DATABASE(), CURRENT_WAREHOUSE()')
        row = cursor.fetchone()
    finally:
        cursor.close()
finally:
    conn.close()

print('Connected successfully!')
print(f'   User:      {row[0]}')
print(f'   Role:      {row[1]}')
print(f'   Database:  {row[2]}')
print(f'   Warehouse: {row[3]}')

All libraries installed successfully!
Connected successfully!
   User:      CHANDANSAHOO
   Role:      ACCOUNTADMIN
   Database:  TELECOM_DWH
   Warehouse: TRANSFORM_WH


In [2]:
# ============================================================
#   CELL 2: ETL PIPELINE (STG_CDR → FACT_CDR)
# ============================================================
print('=' * 55)
print('   TELECOM CDR - ETL PIPELINE')
print(f'   Started: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
print('=' * 55)

# Source columns in FACT_CDR insert order, each paired with the native Python
# type its values are cast to before being bound into the INSERT statement.
FACT_COLUMN_CASTS = (
    ('CALL_ID', str),
    ('DATE_KEY', int),
    ('CALLER_KEY', int),
    ('CALLEE_KEY', int),
    ('CELL_KEY', int),
    ('CALL_TYPE_KEY', int),
    ('DURATION_SECONDS', int),
    ('CHARGE_AMOUNT', float),
    ('DATA_VOLUME_MB', float),
    ('IS_ROAMING', bool),
    ('IS_FRAUD', bool),
    ('TERMINATION_CD', str),
    ('NETWORK_TYPE', str),
)

# Surrogate keys a record must resolve to be loadable. CALLEE_KEY is not
# listed because unknown callees are mapped to -1 instead of being dropped.
REQUIRED_KEYS = ['DATE_KEY', 'CALLER_KEY', 'CELL_KEY', 'CALL_TYPE_KEY']

# Note: the staging column DURATION_SECONDS lands in the fact column
# DURATION_SECS.
FACT_INSERT_SQL = '''
    INSERT INTO TELECOM_DWH.DWH.FACT_CDR (
        CALL_ID, DATE_KEY, CALLER_KEY, CALLEE_KEY,
        CELL_KEY, CALL_TYPE_KEY, DURATION_SECS,
        CHARGE_AMOUNT, DATA_VOLUME_MB, IS_ROAMING,
        IS_FRAUD, TERMINATION_CD, NETWORK_TYPE
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
'''


def _load_key_map(cursor, query, normalize=lambda value: value):
    """Run a two-column ``(surrogate_key, natural_key)`` query and return a
    dict mapping ``normalize(natural_key)`` to ``surrogate_key``."""
    cursor.execute(query)
    return {normalize(natural): surrogate for surrogate, natural in cursor.fetchall()}


def _to_db_value(value, cast):
    """Cast ``value`` with ``cast``, passing missing values through as None.

    Without this guard a NULL would be loaded as the text 'None'/'nan' by
    ``str``, as True by ``bool(nan)``, or would make ``int`` raise.
    """
    return None if pd.isna(value) else cast(value)


def _replace_fact_cdr(cursor, fact_df, batch_size):
    """Replace the contents of FACT_CDR with ``fact_df`` in one transaction.

    The wipe and every batch insert run between an explicit BEGIN and COMMIT,
    so an error part-way through rolls the table back to its previous
    contents instead of leaving it empty or half-loaded.

    Returns:
        The number of rows inserted.
    """
    column_names = [name for name, _ in FACT_COLUMN_CASTS]
    casts = [cast for _, cast in FACT_COLUMN_CASTS]
    total_rows = len(fact_df)
    loaded_rows = 0

    cursor.execute('BEGIN')
    try:
        # DELETE is plain DML, so ROLLBACK undoes it together with the inserts.
        cursor.execute('DELETE FROM TELECOM_DWH.DWH.FACT_CDR')
        for start in range(0, total_rows, batch_size):
            batch = fact_df.iloc[start:start + batch_size]
            values = [
                tuple(_to_db_value(value, cast) for value, cast in zip(record, casts))
                for record in batch[column_names].itertuples(index=False, name=None)
            ]
            cursor.executemany(FACT_INSERT_SQL, values)
            loaded_rows += len(batch)
            print(f'   Loaded {loaded_rows:,} / {total_rows:,} records...')
        cursor.execute('COMMIT')
    except BaseException:
        # BaseException so that interrupting the cell also rolls back.
        cursor.execute('ROLLBACK')
        raise
    return loaded_rows


conn = get_connection()
try:
    cursor = conn.cursor()
    try:
        # -- EXTRACT: pull the full staging table into a DataFrame.
        print('\nüì• EXTRACT: Reading from STG_CDR...')
        cursor.execute('SELECT * FROM TELECOM_DWH.STAGING.STG_CDR')
        rows    = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
        df_etl  = pd.DataFrame(rows, columns=columns)
        print(f'   ‚úÖ Extracted {len(df_etl):,} records')

        # -- TRANSFORM: swap natural keys for dimension surrogate keys.
        print('\nüîÑ TRANSFORM: Mapping dimension keys...')

        # FULL_DATE is stringified so it can be matched against the
        # 'YYYY-MM-DD' text derived from CALL_START_TIME below.
        date_map = _load_key_map(
            cursor, 'SELECT DATE_KEY, FULL_DATE FROM TELECOM_DWH.DWH.DIM_DATE', normalize=str)
        sub_map = _load_key_map(
            cursor, 'SELECT SUBSCRIBER_KEY, MSISDN FROM TELECOM_DWH.DWH.DIM_SUBSCRIBER')
        cell_map = _load_key_map(
            cursor, 'SELECT CELL_KEY, CELL_ID FROM TELECOM_DWH.DWH.DIM_CELL_TOWER')
        type_map = _load_key_map(
            cursor, 'SELECT CALL_TYPE_KEY, CALL_TYPE_CODE FROM TELECOM_DWH.DWH.DIM_CALL_TYPE')

        print(f'   DATE keys:       {len(date_map)}')
        print(f'   SUBSCRIBER keys: {len(sub_map)}')
        print(f'   CELL TOWER keys: {len(cell_map)}')
        print(f'   CALL TYPE keys:  {len(type_map)}')

        call_date_str = pd.to_datetime(df_etl['CALL_START_TIME']).dt.strftime('%Y-%m-%d')
        df_etl = df_etl.assign(
            CALL_DATE_STR=call_date_str,
            DATE_KEY=call_date_str.map(date_map),
            CALLER_KEY=df_etl['CALLING_NUMBER'].map(sub_map),
            # Callees that are not known subscribers get the placeholder key -1.
            CALLEE_KEY=df_etl['CALLED_NUMBER'].map(sub_map).fillna(-1).astype(int),
            CELL_KEY=df_etl['CELL_ID'].map(cell_map),
            CALL_TYPE_KEY=df_etl['CALL_TYPE'].map(type_map),
        )

        # Records whose required keys did not resolve cannot be loaded.
        before = len(df_etl)
        df_etl = (
            df_etl
            .dropna(subset=REQUIRED_KEYS)
            .astype({key: int for key in REQUIRED_KEYS})
        )
        after = len(df_etl)
        print(f'\n   Records before cleanup: {before:,}')
        print(f'   Records after cleanup:  {after:,}')
        print(f'   Dropped (missing keys): {before - after:,}')

        fact_df = df_etl[[name for name, _ in FACT_COLUMN_CASTS]].copy()
        print(f'\n   ‚úÖ Transform complete ‚Äî {len(fact_df):,} records ready')

        # -- LOAD: full refresh of FACT_CDR, all-or-nothing.
        print('\nüì§ LOAD: Inserting into FACT_CDR...')
        batch_size = 5000
        total      = len(fact_df)
        loaded     = _replace_fact_cdr(cursor, fact_df, batch_size)

        # -- VALIDATE: the table must hold exactly the rows just loaded.
        print('\n‚úîÔ∏è  VALIDATE:')
        cursor.execute('''
            SELECT COUNT(*), COUNT(DISTINCT CALLER_KEY),
                   ROUND(SUM(CHARGE_AMOUNT), 2)
            FROM TELECOM_DWH.DWH.FACT_CDR
        ''')
        row = cursor.fetchone()
        fact_count = row[0]
        # SUM over an empty table is NULL; show it as zero instead of crashing.
        revenue = row[2] if row[2] is not None else 0
        print(f'   Total Records:  {fact_count:,}')
        print(f'   Unique Callers: {row[1]:,}')
        print(f'   Total Revenue:  ‚Çπ{revenue:,}')
        if fact_count != total:
            raise RuntimeError(
                f'Validation failed: FACT_CDR holds {fact_count:,} rows, expected {total:,}'
            )
        print(f'\n‚úÖ ETL PIPELINE COMPLETED SUCCESSFULLY!')
        print(f'   Finished: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
        print('=' * 55)
    finally:
        cursor.close()
finally:
    conn.close()

   TELECOM CDR - ETL PIPELINE
   Started: 2026-02-22 16:59:40

üì• EXTRACT: Reading from STG_CDR...
   ‚úÖ Extracted 50,000 records

üîÑ TRANSFORM: Mapping dimension keys...
   DATE keys:       366
   SUBSCRIBER keys: 500
   CELL TOWER keys: 5
   CALL TYPE keys:  4

   Records before cleanup: 50,000
   Records after cleanup:  50,000
   Dropped (missing keys): 0

   ‚úÖ Transform complete ‚Äî 50,000 records ready

üì§ LOAD: Inserting into FACT_CDR...
   Loaded 5,000 / 50,000 records...
   Loaded 10,000 / 50,000 records...
   Loaded 15,000 / 50,000 records...
   Loaded 20,000 / 50,000 records...
   Loaded 25,000 / 50,000 records...
   Loaded 30,000 / 50,000 records...
   Loaded 35,000 / 50,000 records...
   Loaded 40,000 / 50,000 records...
   Loaded 45,000 / 50,000 records...
   Loaded 50,000 / 50,000 records...

‚úîÔ∏è  VALIDATE:
   Total Records:  50,000
   Unique Callers: 500
   Total Revenue:  ‚Çπ59,405.55

‚úÖ ETL PIPELINE COMPLETED SUCCESSFULLY!
   Finished: 2026-02-22 17:00:28