In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta


In [3]:
np.random.seed(42)


In [5]:
# Users & content
n_users = 1200
content_types = ["Movie", "Series", "Sports", "Kids", "News"]
plans = ["Free", "Basic", "Premium"]

# Time range
start_date = "2023-01-01"
end_date = "2024-12-31"
dates = pd.date_range(start=start_date, end=end_date, freq="D")


In [7]:
users = []

for i in range(1, n_users + 1):
    signup_date = np.random.choice(dates[:500])
    plan = np.random.choice(plans, p=[0.45, 0.35, 0.20])

    users.append([
        f"U{i:05d}",
        signup_date,
        plan
    ])

user_df = pd.DataFrame(
    users,
    columns=["user_id", "signup_date", "subscription_plan"]
)


In [9]:
user_df.head()

Unnamed: 0,user_id,signup_date,subscription_plan
0,U00001,2023-04-13,Basic
1,U00002,2023-09-28,Basic
2,U00003,2023-07-08,Basic
3,U00004,2023-05-02,Free
4,U00005,2023-11-27,Basic


In [11]:
data = []

for _, row in user_df.iterrows():
    user_id = row["user_id"]
    plan = row["subscription_plan"]
    signup_date = row["signup_date"]

    active_days = np.random.randint(90, 600)
    user_dates = pd.date_range(
        start=signup_date,
        periods=min(active_days, len(dates)),
        freq="D"
    )

    for date in user_dates:
        sessions = np.random.randint(1, 4)

        for _ in range(sessions):
            content_type = np.random.choice(content_types)
            watch_minutes = np.random.randint(5, 120)

            # Revenue logic
            if plan == "Free":
                ad_views = np.random.randint(1, 6)
                revenue = ad_views * 0.03
            elif plan == "Basic":
                ad_views = 0
                revenue = 0.25
            else:  # Premium
                ad_views = 0
                revenue = 0.40

            data.append([
                user_id,
                date,
                f"C{np.random.randint(1000, 9999)}",
                content_type,
                watch_minutes,
                sessions,
                plan,
                ad_views,
                revenue
            ])


In [12]:
engagement_df = pd.DataFrame(
    data,
    columns=[
        "user_id",
        "session_date",
        "content_id",
        "content_type",
        "watch_minutes",
        "session_count",
        "subscription_plan",
        "ad_views",
        "revenue"
    ]
)


In [13]:
engagement_df.head()

Unnamed: 0,user_id,session_date,content_id,content_type,watch_minutes,session_count,subscription_plan,ad_views,revenue
0,U00001,2023-04-13,C2827,Movie,70,1,Basic,0,0.25
1,U00001,2023-04-14,C9569,Kids,82,2,Basic,0,0.25
2,U00001,2023-04-14,C3196,Series,14,2,Basic,0,0.25
3,U00001,2023-04-15,C7303,Series,17,3,Basic,0,0.25
4,U00001,2023-04-15,C9714,Kids,102,3,Basic,0,0.25


In [17]:
last_activity = (
    engagement_df
    .groupby("user_id")["session_date"]
    .max()
    .reset_index()
    .rename(columns={"session_date": "last_active_date"})
)

engagement_df = engagement_df.merge(
    last_activity,
    on="user_id",
    how="left"
)

cutoff_date = engagement_df["session_date"].max() - timedelta(days=30)

engagement_df["churn_flag"] = np.where(
    engagement_df["last_active_date"] < cutoff_date,
    1,
    0
)


In [19]:
engagement_df.head()


Unnamed: 0,user_id,session_date,content_id,content_type,watch_minutes,session_count,subscription_plan,ad_views,revenue,last_active_date,churn_flag
0,U00001,2023-04-13,C2827,Movie,70,1,Basic,0,0.25,2024-03-22,1
1,U00001,2023-04-14,C9569,Kids,82,2,Basic,0,0.25,2024-03-22,1
2,U00001,2023-04-14,C3196,Series,14,2,Basic,0,0.25,2024-03-22,1
3,U00001,2023-04-15,C7303,Series,17,3,Basic,0,0.25,2024-03-22,1
4,U00001,2023-04-15,C9714,Kids,102,3,Basic,0,0.25,2024-03-22,1


In [21]:
engagement_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 833821 entries, 0 to 833820
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   user_id            833821 non-null  object        
 1   session_date       833821 non-null  datetime64[ns]
 2   content_id         833821 non-null  object        
 3   content_type       833821 non-null  object        
 4   watch_minutes      833821 non-null  int64         
 5   session_count      833821 non-null  int64         
 6   subscription_plan  833821 non-null  object        
 7   ad_views           833821 non-null  int64         
 8   revenue            833821 non-null  float64       
 9   last_active_date   833821 non-null  datetime64[ns]
 10  churn_flag         833821 non-null  int32         
dtypes: datetime64[ns](2), float64(1), int32(1), int64(3), object(4)
memory usage: 66.8+ MB


In [23]:
engagement_df["churn_flag"].value_counts()


churn_flag
1    829020
0      4801
Name: count, dtype: int64

In [25]:
engagement_df.to_csv(
    "C:/Users/Abhi/Desktop/Python_Data_Analytics_Projects/content_engagement_churn_analytics/data/raw/content_engagement_raw.csv",
    index=False
)
