<a href="https://colab.research.google.com/github/Articbug/Telecom-CDR-Analytics-Platform/blob/main/Notebooks/1_CDR_Analytics_Wipro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
#   TELECOM CDR ANALYTICS PLATFORM
#   Wipro Data Engineering Project
#   Author: Chandan Sahoo, Bismaya Ranjan Sahoo
#   Date: February 2026
# ============================================================
import subprocess
import sys

# Third-party packages the notebook needs; installed into the interpreter
# that is running this kernel (sys.executable), not whatever `pip` is on PATH.
packages = [
    'pyspark',
    'snowflake-connector-python',
    'faker',
    'numpy',
    'pandas'
]

# Install one package at a time so a failure can be attributed to a specific
# package. pip's output is captured to keep the cell quiet, so the return code
# must be checked explicitly -- otherwise a failed install goes unnoticed.
failed = {}
for package in packages:
    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', package, '--quiet', '--disable-pip-version-check'],
        capture_output=True,
        text=True
    )
    if result.returncode != 0:
        failed[package] = result.stderr.strip()

if failed:
    # Surface pip's own error text, then stop the notebook here rather than
    # letting a later cell fail on a missing import.
    for package, stderr in failed.items():
        print(f'pip install {package} failed:\n{stderr}', file=sys.stderr)
    raise RuntimeError('Failed to install: ' + ', '.join(failed))

print('All libraries installed successfully!')
for package in packages:
    print(f'   - {package}')

All libraries installed successfully!
   - pyspark
   - snowflake-connector-python
   - faker
   - numpy
   - pandas


In [None]:
# ============================================================
#   CELL 2: SNOWFLAKE CONNECTION
# ============================================================
import snowflake.connector
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random
import builtins

# ── Snowflake Connection
def get_connection():
    """Open and return a new Snowflake connection for the staging schema.

    The password is never stored in the notebook: it is read from the
    ``SNOWFLAKE_PASSWORD`` environment variable and, if that is unset or
    empty, requested once via an interactive ``getpass`` prompt and cached
    in this process's environment so later calls do not prompt again.

    The remaining settings keep their previous values as defaults and can be
    overridden with ``SNOWFLAKE_ACCOUNT``, ``SNOWFLAKE_USER``,
    ``SNOWFLAKE_DATABASE``, ``SNOWFLAKE_SCHEMA`` and ``SNOWFLAKE_WAREHOUSE``.

    Returns:
        The connection object returned by ``snowflake.connector.connect``.
        The caller is responsible for closing it.
    """
    import getpass
    import os

    password = os.environ.get('SNOWFLAKE_PASSWORD')
    if not password:
        password = getpass.getpass('Snowflake password: ')
        # Cache for the lifetime of the kernel only; nothing is written to disk.
        os.environ['SNOWFLAKE_PASSWORD'] = password

    return snowflake.connector.connect(
        account  = os.environ.get('SNOWFLAKE_ACCOUNT', 'bopsoxz-lr52214'),
        user     = os.environ.get('SNOWFLAKE_USER', 'CHANDANSAHOO'),
        password = password,
        database = os.environ.get('SNOWFLAKE_DATABASE', 'TELECOM_DWH'),
        schema   = os.environ.get('SNOWFLAKE_SCHEMA', 'STAGING'),
        warehouse= os.environ.get('SNOWFLAKE_WAREHOUSE', 'INGEST_WH')
    )

# ── Test Connection
# Smoke test: open a connection, ask Snowflake who/where we are, and print it.
print('Testing Snowflake connection...')
conn = get_connection()
try:
    cursor = conn.cursor()
    cursor.execute('SELECT CURRENT_USER(), CURRENT_DATABASE(), CURRENT_WAREHOUSE()')
    row = cursor.fetchone()
    print('✅ Connected!')
    print(f'   User:      {row[0]}')
    print(f'   Database:  {row[1]}')
    print(f'   Warehouse: {row[2]}')
finally:
    # Close even when the query fails so the session is not left open.
    conn.close()

Testing Snowflake connection...
✅ Connected!
   User:      CHANDANSAHOO
   Database:  TELECOM_DWH
   Warehouse: INGEST_WH


In [None]:
# ============================================================
#   CELL 3: CDR DATA GENERATION
# ============================================================
# Faker instance using the 'en_IN' locale.
# NOTE(review): not referenced by the generator functions in this cell -- confirm
# whether a later cell uses it before removing.
fake = Faker('en_IN')
# Seed both RNGs: the generators below draw from the stdlib `random` module and
# from NumPy's global RNG (np.random.exponential), so both must be fixed for
# the generated data to be repeatable.
random.seed(42)
np.random.seed(42)

# Four-character number prefixes, each starting with '91'. generate_phone()
# drops the first two characters and prepends '91' again.
INDIAN_PREFIXES = [
    '9199', '9198', '9197', '9196', '9195',
    '9188', '9187', '9186', '9185', '9184',
    '9177', '9176', '9175', '9174', '9173',
    '9166', '9165', '9164', '9163', '9162',
]

# (city, state) pairs; generate_subscribers() picks one uniformly per subscriber.
INDIAN_CITIES = [
    ('Mumbai', 'Maharashtra'), ('Delhi', 'Delhi'),
    ('Bangalore', 'Karnataka'), ('Hyderabad', 'Telangana'),
    ('Chennai', 'Tamil Nadu'), ('Kolkata', 'West Bengal'),
    ('Pune', 'Maharashtra'), ('Ahmedabad', 'Gujarat'),
    ('Bhubaneswar', 'Odisha'), ('Cuttack', 'Odisha'),
    ('Jaipur', 'Rajasthan'), ('Lucknow', 'Uttar Pradesh'),
    ('Patna', 'Bihar'), ('Bhopal', 'Madhya Pradesh'),
    ('Kochi', 'Kerala'),
]

# 24 relative weights, one per hour 0-23, passed to random.choices() when
# picking a call's start hour. They are relative weights and are not
# normalised here (the values sum to 0.963, not 1).
HOURLY_WEIGHTS = [
    0.005, 0.003, 0.002, 0.002, 0.003, 0.010,
    0.020, 0.045, 0.065, 0.070, 0.060, 0.055,
    0.055, 0.050, 0.048, 0.050, 0.055, 0.068,
    0.075, 0.072, 0.060, 0.045, 0.030, 0.015
]

# Identifiers that generate_cdr() picks from uniformly for the CELL_ID column.
CELL_TOWERS = ['CELL_00001', 'CELL_00002', 'CELL_00003', 'CELL_00004', 'CELL_00005']

def generate_phone():
    """Return a random phone number as a string of at most 12 characters.

    The number is '91', followed by the tail of a randomly chosen entry of
    INDIAN_PREFIXES (everything after its first two characters), followed by
    a random 8-digit number.
    """
    chosen_prefix = random.choice(INDIAN_PREFIXES)
    subscriber_digits = random.randint(10000000, 99999999)
    number = '91' + chosen_prefix[2:] + str(subscriber_digits)
    # Cap the length at 12 characters.
    return number[:12]

def generate_subscribers(n=500):
    """Create ``n`` synthetic subscriber profiles.

    Args:
        n: number of subscribers to create (default 500).

    Returns:
        A list of ``n`` dicts with the keys 'phone', 'city', 'state', 'plan'
        and 'segment'. Plans are drawn PREPAID/POSTPAID with weights 65/35;
        segments HIGH_VALUE/STANDARD/LOW_VALUE with weights 20/60/20.
    """
    plan_options, plan_weights = ['PREPAID', 'POSTPAID'], [65, 35]
    segment_options, segment_weights = ['HIGH_VALUE', 'STANDARD', 'LOW_VALUE'], [20, 60, 20]

    def _one_subscriber():
        # Draw order (location, phone, plan, segment) is kept fixed so a
        # seeded run yields the same subscribers every time.
        city, state = random.choice(INDIAN_CITIES)
        phone = generate_phone()
        plan = random.choices(plan_options, weights=plan_weights)[0]
        segment = random.choices(segment_options, weights=segment_weights)[0]
        return {
            'phone':   phone,
            'city':    city,
            'state':   state,
            'plan':    plan,
            'segment': segment,
        }

    return [_one_subscriber() for _ in range(n)]

def generate_cdr(n=50000):
    """Build a DataFrame of ``n`` synthetic call detail records (CDRs).

    A pool of 500 subscribers is created with ``generate_subscribers`` and
    every record uses two different phone numbers from that pool as caller
    and callee. Start times fall on 2024-01-01 plus 0-365 days, with the hour
    drawn according to HOURLY_WEIGHTS.

    Per call type:
      * VOICE: duration ~ exponential(180) clamped to [5, 3600] s,
               no data, charge 0.50 per minute.
      * SMS:   zero duration, no data, flat charge 0.10.
      * DATA:  duration ~ exponential(600) clamped to [30, 7200] s,
               volume ~ exponential(50) capped at 500 MB, charge 10 per 1024 MB.
      * VIDEO: duration ~ exponential(300) clamped to [30, 3600] s,
               volume 0.5 MB per second, charge 1.0 per minute.
    Roaming records (2% chance) have their charge doubled; 0.5% of records
    are flagged as fraud.

    Args:
        n: number of records to generate (default 50000).

    Returns:
        pandas.DataFrame with one row per record and the columns CALL_ID,
        CALLING_NUMBER, CALLED_NUMBER, CALL_START_TIME, CALL_END_TIME,
        DURATION_SECONDS, CALL_TYPE, CELL_ID, TERMINATION_CD, IS_ROAMING,
        CHARGE_AMOUNT, DATA_VOLUME_MB, NETWORK_TYPE, IS_FRAUD.
    """
    phone_pool = [entry['phone'] for entry in generate_subscribers(500)]

    kinds, kind_weights = ['VOICE', 'SMS', 'DATA', 'VIDEO'], [0.55, 0.25, 0.15, 0.05]
    bearers, bearer_weights = ['2G', '3G', '4G', '5G'], [0.05, 0.15, 0.55, 0.25]
    # NORMAL appears four times so a uniform pick favours it over the others.
    outcomes = ['NORMAL'] * 4 + ['DROPPED', 'BUSY', 'NO_ANSWER']
    stamp = '%Y-%m-%d %H:%M:%S'
    window_start = datetime(2024, 1, 1)

    rows = []
    for seq in range(1, n + 1):
        kind = random.choices(kinds, weights=kind_weights)[0]

        # Timestamp parts are drawn day -> hour -> minute -> second; the order
        # of every random draw in this loop is fixed so seeded runs repeat.
        day_offset = random.randint(0, 365)
        hour = random.choices(range(24), weights=HOURLY_WEIGHTS)[0]
        minute = random.randint(0, 59)
        second = random.randint(0, 59)
        began = window_start + timedelta(
            days=day_offset, hours=hour, minutes=minute, seconds=second
        )

        # One branch per call type: duration, then data volume, then charge.
        # builtins.round is called explicitly rather than the bare name `round`.
        if kind == 'VOICE':
            secs = min(max(5, int(np.random.exponential(180))), 3600)
            volume_mb = 0.0
            fee = builtins.round(secs * (0.50 / 60), 4)
        elif kind == 'SMS':
            secs, volume_mb, fee = 0, 0.0, 0.10
        elif kind == 'DATA':
            secs = min(max(30, int(np.random.exponential(600))), 7200)
            volume_mb = builtins.round(min(np.random.exponential(50), 500), 3)
            fee = builtins.round(volume_mb * (10 / 1024), 4)
        else:
            secs = min(max(30, int(np.random.exponential(300))), 3600)
            volume_mb = builtins.round(secs * 0.5, 3)
            fee = builtins.round(secs * (1.0 / 60), 4)

        # Caller first, then keep drawing a callee until it differs.
        origin = random.choice(phone_pool)
        while True:
            target = random.choice(phone_pool)
            if target != origin:
                break

        roaming = random.random() < 0.02
        fraud = random.random() < 0.005
        if roaming:
            fee = builtins.round(fee * 2, 4)

        finished = began + timedelta(seconds=secs)
        tower = random.choice(CELL_TOWERS)
        outcome = random.choice(outcomes)
        bearer = random.choices(bearers, weights=bearer_weights)[0]

        rows.append({
            'CALL_ID':          f'CDR{str(seq).zfill(8)}',
            'CALLING_NUMBER':   origin,
            'CALLED_NUMBER':    target,
            'CALL_START_TIME':  began.strftime(stamp),
            'CALL_END_TIME':    finished.strftime(stamp),
            'DURATION_SECONDS': secs,
            'CALL_TYPE':        kind,
            'CELL_ID':          tower,
            'TERMINATION_CD':   outcome,
            'IS_ROAMING':       bool(roaming),
            'CHARGE_AMOUNT':    fee,
            'DATA_VOLUME_MB':   volume_mb,
            'NETWORK_TYPE':     bearer,
            'IS_FRAUD':         bool(fraud),
        })

        # Progress line every 10,000 records.
        if seq % 10000 == 0:
            print(f'  Generated {seq:,} records...')

    return pd.DataFrame(rows)

# Generate the dataset once; `df` is what the loading cell pushes to Snowflake.
print('Generating 50,000 CDR records...')
df = generate_cdr(n=50000)
print('\n✅ Generation complete!')

# How the records split across call types.
print('\nCall type distribution:')
print(df['CALL_TYPE'].value_counts())

# First two rows, printed without column truncation.
print('\nSample record:')
print(df.iloc[:2].to_string())

Generating 50,000 CDR records...
  Generated 10,000 records...
  Generated 20,000 records...
  Generated 30,000 records...
  Generated 40,000 records...
  Generated 50,000 records...

✅ Generation complete!

Call type distribution:
CALL_TYPE
VOICE    27405
SMS      12457
DATA      7605
VIDEO     2533
Name: count, dtype: int64

Sample record:
       CALL_ID CALLING_NUMBER CALLED_NUMBER      CALL_START_TIME        CALL_END_TIME  DURATION_SECONDS CALL_TYPE     CELL_ID TERMINATION_CD  IS_ROAMING  CHARGE_AMOUNT  DATA_VOLUME_MB NETWORK_TYPE  IS_FRAUD
0  CDR00000001   918689968863  916440173413  2024-10-13 09:13:55  2024-10-13 09:13:55                 0       SMS  CELL_00002         NORMAL       False            0.1             0.0           5G     False
1  CDR00000002   918453112268  916663640499  2024-09-02 21:44:38  2024-09-02 21:46:02                84     VOICE  CELL_00003      NO_ANSWER       False            0.7             0.0           4G     False


In [None]:
# ============================================================
#   CELL 4: LOAD DATA TO SNOWFLAKE STG_CDR
# ============================================================
from snowflake.connector.pandas_tools import write_pandas

def load_to_snowflake(df):
    """Replace the contents of TELECOM_DWH.STAGING.STG_CDR with ``df``.

    Steps: open a connection via ``get_connection()``, TRUNCATE the staging
    table, bulk-load ``df`` with ``write_pandas``, then print the row count
    read back from the table. The connection is always closed, including
    when any step raises.

    NOTE(review): the table is truncated before the load starts, so a load
    that fails part-way leaves STG_CDR empty or partially filled.

    Args:
        df: pandas DataFrame whose columns match the STG_CDR table.

    Raises:
        ValueError: if ``df`` is None or has no rows (raised before anything
            is truncated, so existing data is left untouched).
        RuntimeError: if ``write_pandas`` reports that the load did not succeed.
    """
    if df is None or len(df) == 0:
        raise ValueError('Nothing to load: DataFrame is empty; STG_CDR was not modified.')

    print('Connecting to Snowflake...')
    # Reuse the notebook's single connection factory instead of repeating the
    # connection settings (and credentials) here.
    conn = get_connection()
    try:
        cursor = conn.cursor()
        print('Clearing old data...')
        cursor.execute('TRUNCATE TABLE TELECOM_DWH.STAGING.STG_CDR')

        print('Loading to Snowflake...')
        success, _chunks, rows, _ = write_pandas(
            conn, df, 'STG_CDR',
            schema='STAGING',
            database='TELECOM_DWH'
        )
        if not success:
            raise RuntimeError('write_pandas reported failure while loading STG_CDR')
        print(f'✅ Loaded {rows:,} rows successfully!')

        # Verify by counting what actually landed in the table.
        cursor.execute('SELECT COUNT(*) FROM TELECOM_DWH.STAGING.STG_CDR')
        count = cursor.fetchone()[0]
        print(f'✅ Verified: {count:,} rows in STG_CDR')
    finally:
        conn.close()

# Load the generated CDR DataFrame into STG_CDR (the table is truncated first).
load_to_snowflake(df)

Connecting to Snowflake...
Clearing old data...
Loading to Snowflake...
✅ Loaded 50,000 rows successfully!
✅ Verified: 50,000 rows in STG_CDR
