In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_excel('../data/raw/SubscriptionUseCase_Dataset.xlsx')
df.columns = df.columns.str.lower().str.replace(' ', '_')
print(df.columns)

Index(['user_id', 'name', 'phone', 'email', 'status'], dtype='object')


In [3]:
# 1. Understand Dataset
date_cols = ['subscription_start_date', 'subscription_end_date', 'last_renewal_date', 'cancellation_date']
numeric_cols = ['user_id', 'subscription_price', 'used_quota_gb', 'total_quota_gb', 'support_tickets_raised']
categorical_cols = ['subscription_plan', 'payment_method', 'auto_renewal_status', 'cancellation_reason', 'user_country', 'user_age_group', 'user_gender']

# 2. Data Cleaning
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  100 non-null    int64 
 1   name     100 non-null    object
 2   phone    100 non-null    int64 
 3   email    100 non-null    object
 4   status   100 non-null    object
dtypes: int64(2), object(3)
memory usage: 4.0+ KB
None


In [4]:
# Handle missing values
for col in numeric_cols:
    if col in df.columns:
        median_val = df[col].median()
        df[col].fillna(median_val, inplace=True)

for col in categorical_cols:
    if col in df.columns:
        df[col].fillna('UNKNOWN', inplace=True)

# Remove duplicate rows
df.drop_duplicates(inplace=True)

print(df.isnull().sum())

user_id    0
name       0
phone      0
email      0
status     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


In [5]:
# 3. Feature Engineering
df['tenure_days'] = (df['subscription_end_date'] - df['subscription_start_date']).dt.days
df['days_since_renewal'] = (pd.to_datetime('today') - df['last_renewal_date']).dt.days
df['is_auto_renew'] = df['auto_renewal_status'].apply(lambda x: 1 if x == 'Enabled' else 0)
df['usage_ratio'] = df['used_quota_gb'] / df['total_quota_gb']
df['over_usage_flag'] = df['usage_ratio'].apply(lambda x: 1 if x > 1 else 0)

print(df[['tenure_days', 'days_since_renewal', 'is_auto_renew', 'usage_ratio', 'over_usage_flag']].head())

KeyError: 'subscription_end_date'

In [None]:
# 4. Target Definition
df['churn'] = df['cancellation_date'].notna().astype(int)

print(df['churn'].value_counts())

In [None]:
# 5. Exploratory Analysis
print(df.describe())

plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='tenure_days', hue='churn', multiple='stack')
plt.title('Tenure Distribution by Churn')
plt.show()

In [None]:
# 6. Save Outputs
df.to_csv('../data/processed/processed_subscription_data.csv', index=False)

report = '''
## Data Processing and Feature Engineering Report

### Features Created:
- tenure_days: Days since subscription started.
- days_since_renewal: How long since last renewal.
- is_auto_renew: Binary flag if auto-renew is enabled.
- usage_ratio: used_quota ÷ quota.
- over_usage_flag: Whether user exceeded quota.

### Missing Value Handling:
- Numeric columns: Filled with the median value.
- Categorical columns: Replaced with 'UNKNOWN'.
- Date columns: Kept missing if meaningful (e.g., no cancellation date = active).

### Target Label Distribution:
CHURN_DISTRIBUTION

### Key Insights:
- Churners tend to have a lower tenure on average.
'''
churn_distribution = df['churn'].value_counts().to_string()
report = report.replace('CHURN_DISTRIBUTION', churn_distribution)
print(report)