# In [None]:
# 01_data_prep.ipynb

import random
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

fake = Faker()

# Set seeds for reproducibility.
# Faker draws from its own random generator, so seeding `random` and NumPy
# alone leaves the generated post text different on every run; Faker has to
# be seeded explicitly as well.
random.seed(42)
np.random.seed(42)
Faker.seed(42)

# Synthetic population: ids run from "user_0" up to "user_<NUM_USERS - 1>".
NUM_USERS = 100
user_ids = ["user_" + str(n) for n in range(NUM_USERS)]

# Size of the post catalogue and the closed set of topics a post can have.
NUM_POSTS = 1000
topics = [
    'tech',
    'food',
    'travel',
    'fitness',
    'gaming',
    'fashion',
    'news',
]

# Build the post catalogue as (post_id, topic, text) records.
# For every record the topic is drawn first, then the sentence is generated,
# so the order in which the random sources are consumed stays fixed.
posts = [
    (f"post_{idx}", random.choice(topics), fake.sentence(nb_words=12))
    for idx in range(NUM_POSTS)
]

post_columns = ['post_id', 'topic', 'text']
posts_df = pd.DataFrame(posts, columns=post_columns)

# Give every user between one and three distinct topics of interest.
# The count is drawn first, then that many topics are sampled without
# replacement from `topics`.
user_interests = {}
for user in user_ids:
    how_many = random.randint(1, 3)
    user_interests[user] = random.sample(topics, k=how_many)

# Simulate user-post interactions.
# For each (user, post) pair:
#   * if the post's topic is one of the user's interests, a first draw
#     records a 'like' with probability 0.25;
#   * otherwise -- or when that first draw does not produce a like -- a
#     separate draw records a 'view' with probability 0.05.
# Posts outside a user's interests are therefore never liked, and most
# pairs produce no interaction row at all.
#
# The (post_id, topic) pairs are materialised once up front instead of
# calling DataFrame.iterrows() again for every user, which builds a new
# Series per row on each pass. The order and number of random draws is
# the same as with the row-by-row loop.
post_topic_pairs = list(zip(posts_df['post_id'], posts_df['topic']))

interactions = []
for user in user_ids:
    interests = user_interests[user]
    for post_id, topic in post_topic_pairs:
        topic_match = topic in interests
        # Higher chance to like if topic matches interest
        if topic_match and random.random() < 0.25:
            interactions.append((user, post_id, 'like'))
        elif random.random() < 0.05:
            interactions.append((user, post_id, 'view'))

interactions_df = pd.DataFrame(interactions, columns=['user_id', 'post_id', 'interaction'])

# Attach each post's topic and text to its interaction rows. A left join
# keeps every interaction row.
data = interactions_df.merge(posts_df, on='post_id', how='left')

# Binary label: 1 for a 'like', 0 for any other interaction type.
# A vectorised comparison replaces the per-row lambda and yields an integer
# column even when there are no interaction rows.
data['label'] = (data['interaction'] == 'like').astype('int64')

# Save for next step.
# The output directory is created first (including missing parents) because
# to_csv does not create it and would otherwise fail on a fresh checkout.
output_path = Path('../data/processed/user_post_interactions.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)

print("✅ Data simulation complete.")
# Last expression of the cell: shows the first rows when run in a notebook.
data.head()
