In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# ----------------------------
# STEP 1: Load Netflix Titles
# ----------------------------
# Requires 'netflix_titles.csv' in the working directory. Download it from:
# https://www.kaggle.com/datasets/shivamb/netflix-shows

try:
    netflix = pd.read_csv('netflix_titles.csv')
except FileNotFoundError as err:
    # Re-raise the same exception type (errno and filename preserved) with
    # download instructions, since a missing dataset is the most likely
    # failure when running this cell for the first time.
    raise FileNotFoundError(
        err.errno,
        "netflix_titles.csv not found in the working directory; download it "
        "from https://www.kaggle.com/datasets/shivamb/netflix-shows",
        err.filename,
    ) from err

# Pool of distinct, non-null titles to sample from. No filtering by content
# type is applied here: every row that has a title contributes to the pool.
titles = netflix['title'].dropna().unique()

# Fail early with a clear message instead of letting the random sampling
# step reject an empty pool later on.
if len(titles) == 0:
    raise ValueError("netflix_titles.csv has no non-null titles to sample from")

# ----------------------------
# STEP 2: Generate Synthetic Viewing Data
# ----------------------------
# Parameters
n_users = 10_000      # number of fake users
n_events = 500_000    # number of viewing events to draw
# Time window for the synthetic timestamps. Per the original author's note,
# 1925 is the earliest release year in the Kaggle data.
# NOTE(review): this lets viewing events be dated as far back as 1925 --
# confirm that is intended.
start_date = datetime(1925, 1, 1)
end_date = datetime(2025, 6, 30)

# Fixed seed for reproducibility. The three draws below (users, titles,
# time offsets) consume the global NumPy RNG in exactly this order.
np.random.seed(42)

# Zero-padded IDs "user_00001" .. "user_10000", sampled with replacement
user_pool = [f"user_{number:05d}" for number in range(1, n_users + 1)]
user_ids = np.random.choice(user_pool, size=n_events, replace=True)

# One title per event, sampled with replacement
watched_titles = np.random.choice(titles, size=n_events, replace=True)

# Uniformly distributed offsets (in seconds) across the whole window,
# each added to start_date to form the event timestamp
window = end_date - start_date
time_between = window.total_seconds()
random_seconds = np.random.uniform(0, time_between, size=n_events)
offsets = (timedelta(seconds=seconds) for seconds in random_seconds)
watched_at = [start_date + offset for offset in offsets]

# ----------------------------
# STEP 3: Create DataFrame
# ----------------------------
# One row per viewing event, ordered by timestamp, with the index rebuilt
# after sorting so it runs 0..n-1 in chronological order.
event_columns = {
    'user_id': user_ids,
    'title': watched_titles,
    'watched_at': watched_at,
}
viewing_data = (
    pd.DataFrame(event_columns)
    .sort_values(by='watched_at')
    .reset_index(drop=True)
)

# ----------------------------
# STEP 4: Save or Preview
# ----------------------------
# Show the first ten events and the overall dimensions, then write the
# full table to CSV without the index column.
preview = viewing_data.head(10)
print(preview)
print(f"\nDataset shape: {viewing_data.shape}")

output_file = 'netflix_fake_viewers(10K).csv'
viewing_data.to_csv(output_file, index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'netflix_titles.csv'