# ============================================
# CELL 1: Imports and Setup
# ============================================

In [49]:
from google.colab import drive
drive.mount('/content/drive')

# Install packages
!pip install faker imbalanced-learn xgboost shap optuna -q

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
# Clone your repo in Colab
!git clone https://github.com/YOUR_USERNAME/business-analytics-ml.git
%cd business-analytics-ml

# Configure git
!git config --global user.email "asthapankaj2503@gmail.com"
!git config --global user.name "Asthaasu"

fatal: destination path 'business-analytics-ml' already exists and is not an empty directory.
/content/business-analytics-ml/business-analytics-ml


In [51]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from faker import Faker
import random
import warnings
warnings.filterwarnings('ignore')

fake = Faker()
np.random.seed(42)
random.seed(42)

print("✓ Packages loaded")


✓ Packages loaded


# ============================================
# CELL 2: Generate Customer Master Data
# ============================================

In [52]:
N_CUSTOMERS = 10000
START_DATE = datetime(2020, 1, 1)
END_DATE = datetime(2024, 12, 31)

print(f"Generating {N_CUSTOMERS} customers...")

customers = []
for i in range(N_CUSTOMERS):
    signup_date = fake.date_between(start_date=START_DATE, end_date=datetime(2024, 6, 30))

    customers.append({
        'customer_id': i,
        'signup_date': signup_date,
        'contract_type': random.choice(['monthly', 'annual', 'biennial']),
        'region': random.choice(['North', 'South', 'East', 'West']),
        'age': random.randint(18, 75),
        'industry': random.choice(['Tech', 'Retail', 'Finance', 'Healthcare', 'Manufacturing']),
        'company_size': random.choice(['Small', 'Medium', 'Enterprise']),
        'account_manager_assigned': random.choice([True, False])
    })

customers_df = pd.DataFrame(customers)
print(f"✓ Generated {len(customers_df)} customers")
print(customers_df.head())

Generating 10000 customers...
✓ Generated 10000 customers
   customer_id signup_date contract_type region  age       industry  \
0            0  2020-05-17      biennial  North   19        Finance   
1            1  2024-03-07       monthly  North   61  Manufacturing   
2            2  2022-11-22       monthly  North   23         Retail   
3            3  2022-06-26      biennial  South   63  Manufacturing   
4            4  2020-08-14        annual   East   69           Tech   

  company_size  account_manager_assigned  
0        Small                      True  
1        Small                     False  
2        Small                      True  
3       Medium                      True  
4        Small                     False  


# ============================================
# CELL 3: Generate Transactions (This is KEY)
# ============================================

In [53]:
print("Generating transactions (this takes 3-4 minutes)...")

transactions = []
transaction_id = 0

for idx, customer in customers_df.iterrows():
    customer_id = customer['customer_id']
    signup = pd.to_datetime(customer['signup_date'])

    # Churn logic: monthly contracts churn more
    base_churn_prob = 0.30 if customer['contract_type'] == 'monthly' else 0.12

    # Account manager reduces churn
    if customer['account_manager_assigned']:
        base_churn_prob *= 0.6

    # Enterprise customers churn less
    if customer['company_size'] == 'Enterprise':
        base_churn_prob *= 0.5

    will_churn = random.random() < base_churn_prob

    # Determine active period
    if will_churn:
        # Churned customers: active 60-300 days
        active_days = random.randint(60, 300)
        last_transaction_date = signup + timedelta(days=active_days)
    else:
        # Active customers: up to end date
        last_transaction_date = END_DATE

    # Generate transactions for this customer
    current_date = signup

    # Transaction frequency based on company size
    if customer['company_size'] == 'Enterprise':
        avg_days_between = 7  # Weekly
        base_amount = 2000
    elif customer['company_size'] == 'Medium':
        avg_days_between = 14  # Bi-weekly
        base_amount = 800
    else:
        avg_days_between = 30  # Monthly
        base_amount = 300

    transaction_count = 0

    while current_date <= last_transaction_date:
        # Add some randomness
        days_offset = random.randint(-3, 7)
        next_date = current_date + timedelta(days=avg_days_between + days_offset)

        if next_date > last_transaction_date:
            break

        # Transaction amount with variance
        amount = base_amount * random.uniform(0.7, 1.5)

        # Declining engagement for churning customers
        if will_churn:
            days_until_churn = (last_transaction_date - current_date).days
            if days_until_churn < 60:
                amount *= 0.6  # Reduced spending before churn

        transactions.append({
            'transaction_id': transaction_id,
            'customer_id': customer_id,
            'transaction_date': current_date,
            'amount': round(amount, 2),
            'product_category': random.choice(['Software', 'Consulting', 'Support', 'Training']),
            'payment_method': random.choice(['Credit Card', 'Invoice', 'Bank Transfer'])
        })

        transaction_id += 1
        transaction_count += 1
        current_date = next_date

    if idx % 1000 == 0:
        print(f"  Processed {idx}/{N_CUSTOMERS} customers...")

transactions_df = pd.DataFrame(transactions)
print(f"✓ Generated {len(transactions_df)} transactions")
print(f"  Average transactions per customer: {len(transactions_df) / N_CUSTOMERS:.1f}")

Generating transactions (this takes 3-4 minutes)...
  Processed 0/10000 customers...
  Processed 1000/10000 customers...
  Processed 2000/10000 customers...
  Processed 3000/10000 customers...
  Processed 4000/10000 customers...
  Processed 5000/10000 customers...
  Processed 6000/10000 customers...
  Processed 7000/10000 customers...
  Processed 8000/10000 customers...
  Processed 9000/10000 customers...
✓ Generated 625663 transactions
  Average transactions per customer: 62.6


# ============================================
# CELL 4: Generate Support Tickets
# ============================================

In [54]:
print("Generating support tickets...")

support_tickets = []
ticket_id = 0

for customer_id in range(N_CUSTOMERS):
    customer = customers_df.iloc[customer_id]

    # Number of tickets (churning customers have MORE tickets)
    customer_transactions = transactions_df[transactions_df['customer_id'] == customer_id]

    if len(customer_transactions) == 0:
        continue

    # Dissatisfied customers (more tickets) are more likely to churn
    if len(customer_transactions) < 5:  # Low engagement = potential churner
        n_tickets = random.randint(3, 10)
    else:
        n_tickets = random.randint(0, 4)

    for _ in range(n_tickets):
        ticket_date = fake.date_between(
            start_date=customer['signup_date'],
            end_date=customer_transactions['transaction_date'].max()
        )

        support_tickets.append({
            'ticket_id': ticket_id,
            'customer_id': customer_id,
            'ticket_date': ticket_date,
            'issue_type': random.choice(['Technical', 'Billing', 'Feature Request', 'Bug']),
            'priority': random.choice(['Low', 'Medium', 'High']),
            'resolved': random.choice([True, False])
        })
        ticket_id += 1

support_tickets_df = pd.DataFrame(support_tickets)
print(f"✓ Generated {len(support_tickets_df)} support tickets")

Generating support tickets...
✓ Generated 21049 support tickets


# CELL 5: Create Churn Labels

In [55]:
print("Creating churn labels...")

# Calculate recency for each customer
reference_date = pd.Timestamp('2024-12-31')

customer_recency = transactions_df.groupby('customer_id').agg({
    'transaction_date': 'max'
}).reset_index()

customer_recency['days_since_last_transaction'] = (
    reference_date - pd.to_datetime(customer_recency['transaction_date'])
).dt.days

# Churn definition: No transaction in last 90 days
customer_recency['churned'] = (customer_recency['days_since_last_transaction'] > 90).astype(int)

# Merge with customer data
customers_df = customers_df.merge(
    customer_recency[['customer_id', 'churned']],
    on='customer_id',
    how='left'
)

customers_df['churned'] = customers_df['churned'].fillna(1).astype(int)

churn_rate = customers_df['churned'].mean()
print(f"✓ Churn rate: {churn_rate:.1%}")
print(f"  Churned customers: {customers_df['churned'].sum()}")
print(f"  Active customers: {(customers_df['churned'] == 0).sum()}")

Creating churn labels...
✓ Churn rate: 11.9%
  Churned customers: 1186
  Active customers: 8814


# CELL 6: Generate Daily Revenue for Forecasting

In [56]:
print("Creating daily revenue aggregation...")

# Aggregate to daily level
daily_revenue = transactions_df.groupby(
    transactions_df['transaction_date'].dt.date
).agg({
    'amount': 'sum',
    'transaction_id': 'count'
}).reset_index()

daily_revenue.columns = ['date', 'revenue', 'transaction_count']
daily_revenue['date'] = pd.to_datetime(daily_revenue['date'])

# Fill missing dates with 0
date_range = pd.date_range(start=START_DATE, end=END_DATE, freq='D')
daily_revenue = daily_revenue.set_index('date').reindex(date_range, fill_value=0).reset_index()
daily_revenue.columns = ['date', 'revenue', 'transaction_count']

print(f"✓ Created daily revenue data: {len(daily_revenue)} days")
print(f"  Average daily revenue: ${daily_revenue['revenue'].mean():,.2f}")


Creating daily revenue aggregation...
✓ Created daily revenue data: 1827 days
  Average daily revenue: $523,431.64


# CELL 7: Save All Datasets

In [57]:
import os

os.makedirs("data/raw", exist_ok=True)


In [58]:
print("Saving datasets...")

# Save to CSV
customers_df.to_csv('data/raw/customers.csv', index=False)
transactions_df.to_csv('data/raw/transactions.csv', index=False)
support_tickets_df.to_csv('data/raw/support_tickets.csv', index=False)
daily_revenue.to_csv('data/raw/daily_revenue.csv', index=False)

print("✓ All datasets saved to data/raw/")
print("\n=== DATASET SUMMARY ===")
print(f"Customers: {len(customers_df):,}")
print(f"Transactions: {len(transactions_df):,}")
print(f"Support Tickets: {len(support_tickets_df):,}")
print(f"Daily Revenue Records: {len(daily_revenue):,}")
print(f"Churn Rate: {churn_rate:.1%}")


Saving datasets...
✓ All datasets saved to data/raw/

=== DATASET SUMMARY ===
Customers: 10,000
Transactions: 625,663
Support Tickets: 21,049
Daily Revenue Records: 1,827
Churn Rate: 11.9%


# CELL 8: Data Quality Checks

In [59]:
print("\n=== DATA QUALITY CHECKS ===")

# Check for nulls
print("Null values:")
print(f"  Customers: {customers_df.isnull().sum().sum()}")
print(f"  Transactions: {transactions_df.isnull().sum().sum()}")

# Check date ranges
print(f"\nDate ranges:")
print(f"  Transactions: {transactions_df['transaction_date'].min()} to {transactions_df['transaction_date'].max()}")
print(f"  Revenue: {daily_revenue['date'].min()} to {daily_revenue['date'].max()}")

# Check distributions
print(f"\nContract type distribution:")
print(customers_df['contract_type'].value_counts())

print("\n✓ DATA GENERATION COMPLETE!")


=== DATA QUALITY CHECKS ===
Null values:
  Customers: 0
  Transactions: 0

Date ranges:
  Transactions: 2020-01-01 00:00:00 to 2025-03-31 00:00:00
  Revenue: 2020-01-01 00:00:00 to 2024-12-31 00:00:00

Contract type distribution:
contract_type
monthly     3411
annual      3326
biennial    3263
Name: count, dtype: int64

✓ DATA GENERATION COMPLETE!
