# 02 Feature Engineering (Telecom Churn / Cease)

This notebook builds a **leakage-safe modelling dataset** for churn/cease prediction aligned to the business objective:

> **Prioritise retention resources by identifying customers most likely to place a cease in the next 30 days.**

## What this notebook does
- Loads `customer_info`, `calls`, `usage`, `cease`
- Standardises schemas and types
- Removes duplicate snapshots and duplicate event rows
- Builds a **leakage-safe target**: `target_cease_30d`
- Engineers high-value predictive features (contract, payment, calls, usage, trend, recency)
- Produces a clean feature table ready for modelling
- Saves the modelling dataset for the next notebook


In [1]:
# Core libraries
from pathlib import Path
import os
import warnings
warnings.filterwarnings("ignore")

import duckdb
import numpy as np
import pandas as pd

# Widen pandas' rendering limits (column count and line width) for wide tables.
for _option_name, _option_value in (
    ("display.max_columns", 200),
    ("display.width", 200),
):
    pd.set_option(_option_name, _option_value)


In [2]:
# Locate the repository root: when the kernel starts inside a folder named
# "notebook"/"notebooks" (case-insensitive) the root is its parent, otherwise
# the current working directory is taken to be the root.
cwd = Path.cwd()
if cwd.name.lower() in ("notebook", "notebooks"):
    repo_dir = cwd.parent
else:
    repo_dir = cwd

# Input folder and output tree; the output folders are created if missing.
data_dir = repo_dir.joinpath("data")
outputs_dir = repo_dir.joinpath("outputs")
features_dir = outputs_dir.joinpath("features")
for _folder in (outputs_dir, features_dir):
    _folder.mkdir(parents=True, exist_ok=True)

# Default locations of the four source extracts inside the data folder.
cease_path_default = data_dir.joinpath("cease.csv")
calls_path_default = data_dir.joinpath("calls.csv")
customer_path_default = data_dir.joinpath("customer_info.parquet")
usage_path_default = data_dir.joinpath("usage.parquet")




In [12]:
# Connect to DuckDB (file-based for reproducibility).
# The database file lives in <repo>/notebooks; the folder is created if missing.
db_path = repo_dir / "notebooks" / "K_telecom_session2.duckdb"
db_path.parent.mkdir(parents=True, exist_ok=True)
con = duckdb.connect(str(db_path))
# Print the location relative to the repo root so the saved notebook output does
# not embed an absolute, machine-specific path (user name, sync folder, ...).
print("DuckDB:", db_path.relative_to(repo_dir).as_posix())


DuckDB: c:\Users\Admin\OneDrive - University of West London\Desktop\AA\TECH_REYAL_project\Talk_talk\Churn_retention_taltalk\notebooks\K_telecom_session2.duckdb


In [16]:
# Working copies of the default data paths; the raw-view registration cell
# reads these four names.
cease_path, calls_path = cease_path_default, calls_path_default
customer_path, usage_path = customer_path_default, usage_path_default



In [15]:
# Register raw views (DuckDB reads files directly; efficient for large parquet)
def _sql_string_literal(path):
    """Render a filesystem path as a single-quoted SQL string literal.

    The path is written with forward slashes and any embedded single quote is
    doubled, so a folder name containing an apostrophe cannot terminate the
    literal early and break the CREATE VIEW statement.
    """
    return "'" + path.as_posix().replace("'", "''") + "'"


# view name -> (DuckDB reader function, source file); dict order is the creation order.
raw_sources = {
    "customer_raw": ("read_parquet", customer_path),
    "calls_raw": ("read_csv_auto", calls_path),
    "cease_raw": ("read_csv_auto", cease_path),
    "usage_raw": ("read_parquet", usage_path),
}

for view_name, (reader, source_path) in raw_sources.items():
    con.execute(
        f"CREATE OR REPLACE VIEW {view_name} AS "
        f"SELECT * FROM {reader}({_sql_string_literal(source_path)})"
    )

# Show the column names/types DuckDB inferred for each raw source.
print("Schemas")
for view_name in raw_sources:
    display(con.execute(f"DESCRIBE {view_name}").df())


Schemas


Unnamed: 0,column_name,column_type,null,key,default,extra
0,unique_customer_identifier,VARCHAR,YES,,,
1,datevalue,DATE,YES,,,
2,contract_status,VARCHAR,YES,,,
3,contract_dd_cancels,BIGINT,YES,,,
4,dd_cancel_60_day,INTEGER,YES,,,
5,ooc_days,INTEGER,YES,,,
6,technology,VARCHAR,YES,,,
7,speed,INTEGER,YES,,,
8,line_speed,DOUBLE,YES,,,
9,sales_channel,VARCHAR,YES,,,


Unnamed: 0,column_name,column_type,null,key,default,extra
0,unique_customer_identifier,VARCHAR,YES,,,
1,event_date,DATE,YES,,,
2,call_type,VARCHAR,YES,,,
3,talk_time_seconds,DOUBLE,YES,,,
4,hold_time_seconds,DOUBLE,YES,,,


Unnamed: 0,column_name,column_type,null,key,default,extra
0,unique_customer_identifier,VARCHAR,YES,,,
1,cease_placed_date,DATE,YES,,,
2,cease_completed_date,VARCHAR,YES,,,
3,reason_description,VARCHAR,YES,,,
4,reason_description_insight,VARCHAR,YES,,,


Unnamed: 0,column_name,column_type,null,key,default,extra
0,unique_customer_identifier,VARCHAR,YES,,,
1,calendar_date,DATE,YES,,,
2,usage_download_mbs,VARCHAR,YES,,,
3,usage_upload_mbs,VARCHAR,YES,,,


## 1) Clean and standardise base tables

We normalise date and numeric types, then de-duplicate records before feature engineering.


In [None]:
# Section 1: standardised (`*_std`) and de-duplicated (`*_dedup`) views over the raw sources.
# Everything here is a view (CREATE OR REPLACE), so re-running the cell is idempotent.

# Customer snapshot table (standardised): typed columns, rows without a customer id
# or snapshot date removed.
# NOTE(review): crm_package_name and tenure_days do not appear in the customer_raw
# schema printed earlier in this notebook — confirm the parquet actually carries them.
con.execute("""
CREATE OR REPLACE VIEW customer_info_std AS
SELECT
    unique_customer_identifier,
    CAST(datevalue AS DATE) AS snapshot_date,
    CAST(contract_status AS VARCHAR) AS contract_status,
    TRY_CAST(ooc_days AS DOUBLE) AS ooc_days,
    TRY_CAST(dd_cancel_60_day AS DOUBLE) AS dd_cancel_60_day,
    TRY_CAST(contract_dd_cancels AS DOUBLE) AS contract_dd_cancels,
    CAST(technology AS VARCHAR) AS technology,
    CAST(crm_package_name AS VARCHAR) AS crm_package_name,
    CAST(sales_channel AS VARCHAR) AS sales_channel,
    TRY_CAST(speed AS DOUBLE) AS speed,
    TRY_CAST(line_speed AS DOUBLE) AS line_speed,
    TRY_CAST(tenure_days AS DOUBLE) AS tenure_days
FROM customer_raw
WHERE unique_customer_identifier IS NOT NULL
  AND datevalue IS NOT NULL
""")

# De-duplicate by customer + snapshot_date.
# Ordering by the partition key alone would leave the surviving row arbitrary, so
# the remaining attributes are used as a tie-break: the same row is kept on every
# run (rows equal on every column are interchangeable).
# NOTE(review): the tie-break only guarantees reproducibility; confirm whether the
# business has a preferred rule for conflicting same-day snapshots.
# The helper column `rn` stays in the view output, as before.
con.execute("""
CREATE OR REPLACE VIEW customer_info_dedup AS
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (
               PARTITION BY unique_customer_identifier, snapshot_date
               ORDER BY contract_status, ooc_days, dd_cancel_60_day, contract_dd_cancels,
                        technology, crm_package_name, sales_channel,
                        speed, line_speed, tenure_days
           ) AS rn
    FROM customer_info_std
)
WHERE rn = 1
""")

# Calls standardisation: typed columns, rows without a customer id or event date removed.
con.execute("""
CREATE OR REPLACE VIEW calls_std AS
SELECT
    unique_customer_identifier,
    CAST(event_date AS DATE) AS event_date,
    CAST(call_type AS VARCHAR) AS call_type,
    TRY_CAST(talk_time_seconds AS DOUBLE) AS talk_time_seconds,
    TRY_CAST(hold_time_seconds AS DOUBLE) AS hold_time_seconds
FROM calls_raw
WHERE unique_customer_identifier IS NOT NULL
  AND event_date IS NOT NULL
""")

# De-duplicate call rows (exact duplicates).
# The partition covers every column (NULL durations are mapped to -1 for grouping),
# so all rows in a partition are equivalent and the ORDER BY is only a formality.
con.execute("""
CREATE OR REPLACE VIEW calls_dedup AS
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (
               PARTITION BY unique_customer_identifier, event_date, call_type,
                            coalesce(talk_time_seconds, -1), coalesce(hold_time_seconds, -1)
               ORDER BY event_date DESC
           ) AS rn
    FROM calls_std
)
WHERE rn = 1
""")

# Cease standardisation.
# cease_completed_date is read as VARCHAR by read_csv_auto (see the cease_raw schema),
# so it is TRY_CAST: an unparseable value becomes NULL instead of failing every
# downstream query that touches the column. cease_placed_date is already a DATE.
con.execute("""
CREATE OR REPLACE VIEW cease_std AS
SELECT
    unique_customer_identifier,
    CAST(cease_placed_date AS DATE) AS cease_placed_date,
    TRY_CAST(cease_completed_date AS DATE) AS cease_completed_date,
    CAST(reason_description AS VARCHAR) AS reason_description,
    CAST(reason_description_insight AS VARCHAR) AS reason_description_insight
FROM cease_raw
WHERE unique_customer_identifier IS NOT NULL
  AND cease_placed_date IS NOT NULL
""")

# Usage standardisation: the usage columns arrive as VARCHAR, hence TRY_CAST to DOUBLE.
con.execute("""
CREATE OR REPLACE VIEW usage_std AS
SELECT
    unique_customer_identifier,
    CAST(calendar_date AS DATE) AS usage_date,
    TRY_CAST(usage_download_mbs AS DOUBLE) AS usage_download_mbs,
    TRY_CAST(usage_upload_mbs AS DOUBLE) AS usage_upload_mbs
FROM usage_raw
WHERE unique_customer_identifier IS NOT NULL
  AND calendar_date IS NOT NULL
""")

# Deduplicate usage by customer/date.
# Tie-break keeps the row with populated, larger readings so the result is
# reproducible instead of arbitrary.
# NOTE(review): "largest reading wins" is an assumption — confirm the intended rule.
con.execute("""
CREATE OR REPLACE VIEW usage_dedup AS
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (
               PARTITION BY unique_customer_identifier, usage_date
               ORDER BY usage_download_mbs DESC NULLS LAST,
                        usage_upload_mbs DESC NULLS LAST
           ) AS rn
    FROM usage_std
)
WHERE rn = 1
""")

# Row counts before/after de-duplication (the usage_dedup count scans the full
# usage table, so it is the slow one).
print("Row counts (raw -> dedup)")
for view_name in [
    "customer_info_std",
    "customer_info_dedup",
    "calls_std",
    "calls_dedup",
    "cease_std",
    "usage_std",
    "usage_dedup",
]:
    row_count = con.execute(f"SELECT COUNT(*) AS n FROM {view_name}").fetchone()[0]
    print(view_name, row_count)


Row counts (raw -> dedup)
customer_info_std 3545538


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

customer_info_dedup 3532720
calls_std 628437
calls_dedup 621951
cease_std 146363
usage_std 83185050


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

## 2) Leakage-safe target (`target_cease_30d`)

For each customer snapshot, the target is:

- `1` if a cease is placed **after** the snapshot date and **within 30 days**
- else `0`

This matches the retention prioritisation use case (who to call now).


In [None]:
# Leakage-safe target: one row per de-duplicated customer snapshot.
#
# This cell previously repeated the section 1 cleaning code verbatim and never
# built the target. It now relies on the views created in section 1
# (customer_info_dedup, cease_std) and derives `target_cease_30d` from them.
#
# A snapshot is labelled 1 when the same customer places a cease strictly AFTER
# the snapshot date and no later than 30 days after it; otherwise 0. Ceases placed
# on or before the snapshot date never contribute, so the label only looks forward.
# customer_info_dedup holds one row per (customer, snapshot_date), so the GROUP BY
# returns exactly one labelled row per snapshot.
con.execute("""
CREATE OR REPLACE VIEW snapshot_target_30d AS
SELECT
    s.unique_customer_identifier,
    s.snapshot_date,
    CAST(COUNT(c.cease_placed_date) > 0 AS INTEGER) AS target_cease_30d
FROM customer_info_dedup AS s
LEFT JOIN cease_std AS c
       ON c.unique_customer_identifier = s.unique_customer_identifier
      AND c.cease_placed_date >  s.snapshot_date
      AND c.cease_placed_date <= s.snapshot_date + INTERVAL 30 DAY
GROUP BY s.unique_customer_identifier, s.snapshot_date
""")

# NOTE(review): snapshots taken during the last 30 days covered by the cease
# extract cannot observe a full outcome window and will be under-labelled;
# consider excluding them before modelling — confirm the extract end date.

# Class balance of the target (number of snapshots per label).
target_summary = con.execute("""
SELECT target_cease_30d, COUNT(*) AS n_snapshots
FROM snapshot_target_30d
GROUP BY target_cease_30d
ORDER BY target_cease_30d
""").df()
display(target_summary)


In [None]:
# The standardised / de-duplicated views are defined once, in the section 1
# cleaning cell. This cell used to repeat that cell verbatim (re-creating identical
# views and re-running the expensive row counts); it now only verifies that the
# views exist, so later cells fail early with a clear message.
required_views = [
    "customer_info_std",
    "customer_info_dedup",
    "calls_std",
    "calls_dedup",
    "cease_std",
    "usage_std",
    "usage_dedup",
]

# duckdb_views() lists the views known to the current connection.
known_views = {
    name for (name,) in con.execute("SELECT view_name FROM duckdb_views()").fetchall()
}
missing_views = [name for name in required_views if name not in known_views]
if missing_views:
    raise RuntimeError(
        "Run the section 1 cleaning cell first; missing views: " + ", ".join(missing_views)
    )

print("Cleaned views available:", ", ".join(required_views))
