# Synthetic Data Generator
This notebook generates synthetic data to populate the tables of MiniGram.

## Setup

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from faker import Faker
import random

# Initialize Faker for generating realistic usernames, image urls and timestamps
faker = Faker()

# Seed every random source used in this notebook (Faker, random, numpy) so that
# Restart Kernel -> Run All regenerates exactly the same tables
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Range of dates (both bounds hard-coded here) used for the 'created_at' attribute
DATE_RANGE_START = pd.to_datetime("2014-01-01 00:00:00")
DATE_RANGE_END = pd.to_datetime("2024-12-31 23:59:59")

# Function to generate one synthetic timestamp string within the desired range
def generate_timestamp(start_date = DATE_RANGE_START, end_date = DATE_RANGE_END):
    """Return a random moment between start_date and end_date as text.

    The moment is drawn with faker.date_time_between and formatted as
    'YYYY-MM-DD HH:MM:SS'. Both bounds default to the notebook-wide date range.
    """
    moment = faker.date_time_between(start_date=start_date, end_date=end_date)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

  from pandas.core import (


## Users

In [2]:
# Function to generate synthetic usernames
def generate_username():
    """Return a Faker username that has not been returned before.

    Plain faker.user_name() may hand out the same name more than once over
    thousands of draws; the `unique` proxy remembers what it already produced
    for this Faker instance and redraws on a collision, so every user row
    gets a distinct username.
    """
    return faker.unique.user_name()

# Build the users table column by column: ids 1..5000 first,
# then one generated username and one generated timestamp per row
synthetic_user_data = pd.DataFrame({'id': range(1, 5001)})
synthetic_user_data['username'] = [generate_username() for _ in range(5000)]
synthetic_user_data['created_at'] = [generate_timestamp() for _ in range(5000)]

# Preview the first rows
synthetic_user_data.head()

Unnamed: 0,id,username,created_at
0,1,stephaniesanders,2021-09-23 22:25:00
1,2,nblack,2014-11-19 02:54:25
2,3,khamilton,2022-09-10 01:17:00
3,4,courtneymartinez,2023-05-12 06:32:49
4,5,qross,2016-12-24 22:21:55


## Photos

In [3]:
# Function to generate one synthetic image url
def generate_image_url():
    """Return a single image URL produced by faker.image_url()."""
    url = faker.image_url()
    return url

# Function to draw a random user id from the id range of the users table
def generate_user_id():
    """Return a random integer between the smallest and the largest user id (both inclusive)."""
    user_ids = synthetic_user_data['id']
    lowest, highest = user_ids.min(), user_ids.max()
    return random.randint(lowest, highest)

# Generate synthetic data (timestamps are added below, once the owners are known)
synthetic_photo_data = pd.DataFrame({
    'id': range(1, 25001),
    'image_url': [generate_image_url() for _ in range(25000)],
    'user_id': [generate_user_id() for _ in range(25000)],
})

# Generating timestamps where the start time has to be after the user has created their profile.
# The users table is turned into an id -> created_at lookup once, instead of
# scanning the whole table again for every photo.
user_created_at_by_id = pd.to_datetime(synthetic_user_data.set_index('id')['created_at'])
owner_created_at = synthetic_photo_data['user_id'].map(user_created_at_by_id)
synthetic_photo_data['created_at'] = [
    generate_timestamp(start_date=profile_created_at)
    for profile_created_at in owner_created_at
]

# Display data
synthetic_photo_data.head()

Unnamed: 0,id,image_url,user_id,created_at
0,1,https://placekitten.com/139/750,1768,2024-05-02 18:41:09
1,2,https://placekitten.com/745/809,3710,2021-06-10 18:15:58
2,3,https://dummyimage.com/256x694,2107,2024-01-15 14:59:25
3,4,https://picsum.photos/175/504,2292,2023-07-13 13:14:57
4,5,https://dummyimage.com/504x185,3174,2024-11-18 06:39:49


## Tags

In [4]:
# Hashtag names used to populate the tags table, written as rows of ten names
# grouped by theme. A few names are listed in more than one row ("love",
# "motivation", "photography", "comedy", "bestoftheday", "blackandwhite"),
# so the list contains repeated values.
tags = [
    "love", "instagood", "photooftheday", "fashion", "beautiful", "happy", "cute", "tbt", "followme", "picoftheday",
    "nature", "landscape", "sunset", "sunrise", "sky", "mountains", "ocean", "waterfall", "beach", "forest",
    "travel", "wanderlust", "explore", "adventure", "vacation", "travellife", "roadtrip", "instatravel", "travelgram", "tourism",
    "fitness", "workout", "gym", "fitfam", "fitlife", "health", "motivation", "fitnessmotivation", "yoga", "crossfit",
    "food", "foodie", "yummy", "delicious", "foodporn", "instafood", "homemade", "healthyfood", "coffee", "breakfast",
    "style", "outfitoftheday", "streetstyle", "fashionblogger", "styleinspo", "ootd", "fashionista", "mensfashion", "womensfashion", "accessories",
    "art", "drawing", "painting", "artist", "illustration", "artwork", "creative", "digitalart", "photography", "design",
    "music", "dance", "dj", "concert", "livemusic", "singer", "songwriter", "hiphop", "guitar", "drums",
    "fun", "party", "friends", "memes", "instafun", "comedy", "weekend", "goodvibes", "selfie", "bestoftheday",
    "family", "love", "momlife", "dadlife", "siblings", "relationshipgoals", "couple", "wedding", "engagement", "friendship",
    "inspiration", "quoteoftheday", "motivation", "goals", "positivevibes", "believe", "success", "mindset", "dreambig", "hustle",
    "tech", "technology", "smartphone", "gaming", "gadget", "electronics", "innovation", "computer", "programming", "developer",
    "sports", "soccer", "basketball", "football", "baseball", "running", "cycling", "swimming", "tennis", "golf",
    "cars", "carsofinstagram", "supercars", "motorcycle", "biker", "racing", "classiccars", "carlover", "vintagecars", "offroad",
    "animals", "pets", "dogsofinstagram", "catsofinstagram", "puppylove", "kitten", "wildlife", "petstagram", "animallovers", "birdsofinstagram",
    "summer", "winter", "spring", "autumn", "holiday", "christmas", "halloween", "newyear", "easter", "thanksgiving",
    "business", "entrepreneur", "startup", "marketing", "socialmedia", "branding", "ecommerce", "smallbusiness", "sales", "digitalmarketing",
    "photography", "photo", "camera", "portrait", "photoshoot", "photographer", "landscapephotography", "streetphotography", "naturephotography", "blackandwhite",
    "quotes", "lifequotes", "lovequotes", "motivationalquotes", "quote", "inspirationalquotes", "wordsofwisdom", "positivity", "thoughts", "wisdom",
    "funny", "jokes", "lol", "hilarious", "humor", "funnytweets", "meme", "laugh", "silly", "comedy",
    "instadaily", "daily", "bestoftheday", "like4like", "follow4follow", "instamood", "nofilter", "blackandwhite", "vintage", "retro",
    "selfcare", "selflove", "wellness", "mindfulness", "meditation", "growth", "personaldevelopment", "motivate", "lifecoach", "mentalhealth",
    "event", "celebration", "partytime", "birthday", "anniversary", "festival", "gathering", "gettogether", "nightlife", "club",
    "strong", "fit", "noexcuses", "trainhard", "cardio", "fitnessjourney", "healthyliving", "weightloss", "progress", "gymtime",
    "paradise", "cityscape", "destination", "getaway", "exploring", "bucketlist", "localtravel", "worldtravel", "globetrotting", "traveladdict"
]

# Date range end for tags: tags are created early so that few photo/tag pairs have to be
# deleted later for pairing a photo with a tag that did not exist yet (see photo_tags)
date_range_end_for_tags = pd.to_datetime("2019-12-31 23:59:59")

# Some hashtags appear more than once in `tags`; keep only the first occurrence of each
# (order preserved) so that every tag_name in the table is unique
unique_tags = list(dict.fromkeys(tags))

# Create a DataFrame from the hashtags; ids and timestamps follow the number of unique tags
# instead of a hard-coded count
synthetic_tag_data = pd.DataFrame({
    'id': range(1, len(unique_tags) + 1),
    'tag_name': unique_tags,
    'created_at': [generate_timestamp(end_date=date_range_end_for_tags) for _ in unique_tags]
})

# Display data
synthetic_tag_data.head()

Unnamed: 0,id,tag_name,created_at
0,1,love,2016-08-16 20:18:19
1,2,instagood,2019-11-27 07:28:00
2,3,photooftheday,2014-06-27 23:37:53
3,4,fashion,2018-01-16 14:25:17
4,5,beautiful,2016-09-27 18:37:51


## Photo Tags

In [5]:
# Function to draw a random photo id from the id range of the photos table
def generate_photo_id():
    """Return a random integer between the smallest and the largest photo id (both inclusive)."""
    photo_ids = synthetic_photo_data['id']
    lowest, highest = photo_ids.min(), photo_ids.max()
    return random.randint(lowest, highest)

# Function to draw a random tag id from the id range of the tags table
def generate_tag_id():
    """Return a random integer between the smallest and the largest tag id (both inclusive)."""
    tag_ids = synthetic_tag_data['id']
    lowest, highest = tag_ids.min(), tag_ids.max()
    return random.randint(lowest, highest)

# Generate synthetic data
synthetic_photo_tag_data = pd.DataFrame({
    'photo_id': [generate_photo_id() for _ in range(100000)],
    'tag_id': [generate_tag_id() for _ in range(100000)]
})

# Look up the creation time of each row's photo and tag in one pass
# (id -> created_at lookups instead of scanning both tables for every row)
photo_created_at = synthetic_photo_tag_data['photo_id'].map(
    pd.to_datetime(synthetic_photo_data.set_index('id')['created_at'])
)
tag_created_at = synthetic_photo_tag_data['tag_id'].map(
    pd.to_datetime(synthetic_tag_data.set_index('id')['created_at'])
)

# Deleting invalid rows:
#  - those where the pictures are older than the tags
#  - repeated (photo_id, tag_id) pairs, since a photo carries a given tag only once
photo_is_not_older = photo_created_at >= tag_created_at
synthetic_photo_tag_data = (
    synthetic_photo_tag_data[photo_is_not_older]
    .drop_duplicates(subset=['photo_id', 'tag_id'])
    .reset_index(drop=True)  # Formatting
)

# Display data
synthetic_photo_tag_data.head()

Unnamed: 0,photo_id,tag_id
0,19793,173
1,5634,63
2,15073,218
3,22972,49
4,14476,9


## Likes

In [7]:
# Function to draw a random user id from the id range of the users table
# (re-declares the helper with the same behaviour as its earlier definition in this notebook)
def generate_user_id():
    """Return a random integer between the smallest and the largest user id (both inclusive)."""
    id_column = synthetic_user_data['id']
    return random.randint(id_column.min(), id_column.max())

# Function to draw a random image (photo) id from the id range of the photos table
def generate_image_id():
    """Return a random integer between the smallest and the largest photo id (both inclusive)."""
    id_column = synthetic_photo_data['id']
    return random.randint(id_column.min(), id_column.max())

# Generate synthetic data (timestamps are added below)
synthetic_like_data = pd.DataFrame({
    'user_id': [generate_user_id() for _ in range(250000)],
    'photo_id': [generate_image_id() for _ in range(250000)],
})

# A user likes a given photo at most once, so repeated (user_id, photo_id) pairs are removed
synthetic_like_data = (
    synthetic_like_data
    .drop_duplicates(subset=['user_id', 'photo_id'])
    .reset_index(drop=True)
)

# Generating timestamps where
# the start time has to be after the user has created their profile
# and after the photo has been posted.
# Both conditions hold only from the LATER of the two dates onwards, so the later one is
# used as the lower bound (the earlier one would allow likes on photos not yet posted).
user_created_at = synthetic_like_data['user_id'].map(
    pd.to_datetime(synthetic_user_data.set_index('id')['created_at'])
)
image_created_at = synthetic_like_data['photo_id'].map(
    pd.to_datetime(synthetic_photo_data.set_index('id')['created_at'])
)
earliest_like_at = user_created_at.where(user_created_at > image_created_at, image_created_at)
synthetic_like_data['created_at'] = [
    generate_timestamp(start_date=lower_bound) for lower_bound in earliest_like_at
]

# Display data
synthetic_like_data.head()

Unnamed: 0,user_id,photo_id,created_at
0,1708,15802,2020-08-14 15:43:07
1,2875,2969,2020-08-14 11:35:53
2,828,4216,2024-12-18 21:36:04
3,2369,24888,2018-02-05 23:48:07
4,174,22020,2021-03-15 22:34:30


## Comments

In [8]:
# Generic comments: pool of short comment texts that generate_comment() draws from.
# The list holds 99 entries ("Perfection!" is listed twice).
generic_comments = [
    "Love this! 😍", "Amazing shot!", "So beautiful!", "Wow, just wow!", "Incredible!", "Stunning!", "Goals! 💯",
    "I need this!", "This is everything!", "You’re killing it!", "This made my day!", "Epic!", "On point! 👌",
    "Yesss!", "Totally agree!", "So true!", "Perfection!", "Dreamy!", "Can’t stop looking at this!", "Absolutely stunning!",
    "Mind-blowing!", "Best thing I’ve seen all day!", "Keep shining!", "OMG, yes!", "Legend!", "This is pure art!", 
    "So inspiring!", "Love the vibes!", "That’s a mood!", "You’re a star!", "Can’t get enough of this!", "Pure magic!", 
    "Iconic!", "This is fire! 🔥", "So proud of you!", "This is life!", "I’m obsessed!", "How do you do it?", 
    "This is too good!", "Amazing work!", "Keep going!", "So proud!", "Speechless!", "I’m in awe!", "Love this energy!", 
    "Absolute goals!", "This gave me life!", "Take me here!", "You nailed it!", "Big mood!", "This is lit!", 
    "All the feels!", "Too cute!", "This made me smile!", "I needed this!", "Simply the best!", "Never stop creating!", 
    "Heart eyes for days! 😍", "Your feed is goals!", "My fave post today!", "So much talent!", "Forever inspired by you!", 
    "This is next level!", "Unreal!", "Picture perfect!", "A true masterpiece!", "Breathtaking!", "You’re unstoppable!", 
    "Too good to be true!", "Keep slaying!", "You did it again!", "Wow, just incredible!", "My new favorite!", 
    "You have outdone yourself!", "Nothing but love for this!", "This is goals!", "Love the vibe!", "Fabulous!", 
    "You’re a rockstar!", "I’m living for this!", "Take all my likes!", "Beyond amazing!", "So creative!", 
    "You’re so talented!", "Forever a fan!", "Can’t even handle this!", "You never disappoint!", "Obsessed with this!", 
    "Keep shining bright!", "Major inspo!", "Bravo!", "Perfection!", "Y’all need to see this!", "Straight to my favorites!", 
    "Art in its purest form!", "Simply stunning!", "Crushing it!", "Keep doing you!", "Nothing compares to this!"
]

# Joining photos -> photo_tags -> tags to get the tag names of each photo (NEXT UPDATE:
# the result is not used by the comment generation yet)
photo_with_tags = (
    synthetic_photo_data
    .merge(synthetic_photo_tag_data, how='inner', left_on='id', right_on='photo_id')
    .merge(synthetic_tag_data, how='inner', left_on='tag_id', right_on='id')
)
photo_with_tags[['photo_id', 'tag_name']].head()

# Function to draw a random user id from the id range of the users table
# (re-declares the helper with the same behaviour as its earlier definition in this notebook)
def generate_user_id():
    """Return a random integer between the smallest and the largest user id (both inclusive)."""
    first_id = synthetic_user_data['id'].min()
    last_id = synthetic_user_data['id'].max()
    return random.randint(first_id, last_id)

# Function to draw a random photo id from the id range of the photos table
# (re-declares the helper with the same behaviour as its earlier definition in this notebook)
def generate_photo_id():
    """Return a random integer between the smallest and the largest photo id (both inclusive)."""
    first_id = synthetic_photo_data['id'].min()
    last_id = synthetic_photo_data['id'].max()
    return random.randint(first_id, last_id)

# Function to generate comment based on generic comments provided
def generate_comment():
    """Return one comment text picked uniformly at random from generic_comments.

    random.choice follows the actual length of the list, so entries can be
    added or removed without having to keep a hard-coded index bound in sync.

    Planned for a later version (not implemented here):
      Step 1) create num_of_comments that should be random
      Step 2) each comment should be either:
               a) something simple ie. wow!, very pretty! cool, love this, so cute
               b) something that uses the tags to write a comment (range = (3 words, 20 words),
                  heavily skewed to around mean=7)
    """
    return random.choice(generic_comments)

    
# Picking the photos that receive comments (24k draws because I wanted at least 4% of photos
# to have 0 comments). Repeated draws are discarded, keeping the first occurrence,
# so photo_id is unique and every photo is processed once.
commented_photo_ids = list(dict.fromkeys(generate_photo_id() for _ in range(24000)))

# Building one record per comment
comment_records = []
for photo_id in commented_photo_ids:

    # Number of comments for that specific photo (can be 0, in which case no row is produced)
    num_of_comments = abs(round(np.random.normal(15, 15)))

    commenters = set()  # users that already commented on this photo
    for _ in range(num_of_comments):
        comment = generate_comment()

        # Redraw until the user has not commented on this photo yet, ensuring that
        # the same user does not comment twice on the same photo
        user_id = generate_user_id()
        while user_id in commenters:
            user_id = generate_user_id()
        commenters.add(user_id)

        comment_records.append({'comment': comment, 'photo_id': photo_id, 'user_id': user_id})

synthetic_comment_data = pd.DataFrame(comment_records, columns=['comment', 'photo_id', 'user_id'])

# Formatting: ids are assigned once all rows exist, so they run from 1 to N without gaps
synthetic_comment_data.insert(0, 'id', range(1, len(synthetic_comment_data) + 1))

# Generating timestamps where
# the start time has to be after the user has created their profile
# and after the photo has been posted.
# Both conditions hold only from the LATER of the two dates onwards, so the later one is
# used as the lower bound (the earlier one would allow comments on photos not yet posted).
user_created_at = synthetic_comment_data['user_id'].map(
    pd.to_datetime(synthetic_user_data.set_index('id')['created_at'])
)
image_created_at = synthetic_comment_data['photo_id'].map(
    pd.to_datetime(synthetic_photo_data.set_index('id')['created_at'])
)
earliest_comment_at = user_created_at.where(user_created_at > image_created_at, image_created_at)
synthetic_comment_data['created_at'] = [
    generate_timestamp(start_date=lower_bound) for lower_bound in earliest_comment_at
]

# Display data
synthetic_comment_data.head()

Unnamed: 0,id,comment,photo_id,user_id,created_at
0,1,My fave post today!,7639,3080,2024-08-21 13:07:43
1,2,Can’t stop looking at this!,7639,3822,2024-11-16 20:51:01
2,3,Keep shining!,7639,3961,2019-12-27 11:08:07
3,4,Amazing work!,7639,2075,2016-07-04 12:34:48
4,5,Take me here!,7639,3088,2023-05-16 05:13:22


## Follows

In [9]:
# Function to draw a random user id from the id range of the users table
# (re-declares the helper with the same behaviour as its earlier definition in this notebook)
def generate_user_id():
    """Return a random integer between the smallest and the largest user id (both inclusive)."""
    bounds = (synthetic_user_data['id'].min(), synthetic_user_data['id'].max())
    return random.randint(*bounds)

# Generate synthetic data (timestamps are added below)
synthetic_follows_data = pd.DataFrame({
    'follower_id': [generate_user_id() for _ in range(25000)],
    'followee_id': [generate_user_id() for _ in range(25000)],
})

# Deleting invalid rows before any timestamp is generated:
#  - users following themselves
#  - repeated (follower_id, followee_id) pairs, since a follow relation exists only once
is_self_follow = synthetic_follows_data['follower_id'] == synthetic_follows_data['followee_id']
synthetic_follows_data = (
    synthetic_follows_data[~is_self_follow]
    .drop_duplicates(subset=['follower_id', 'followee_id'])
    .reset_index(drop=True)  # Formatting
)

# Generating timestamps where the start time has to be after both users have created their profile.
# That is only true from the LATER of the two profile dates onwards, so the later one is
# used as the lower bound (the earlier one would let a user follow before having a profile).
profile_created_at_by_id = pd.to_datetime(synthetic_user_data.set_index('id')['created_at'])
follower_created_at = synthetic_follows_data['follower_id'].map(profile_created_at_by_id)
followee_created_at = synthetic_follows_data['followee_id'].map(profile_created_at_by_id)
earliest_follow_at = follower_created_at.where(
    follower_created_at > followee_created_at, followee_created_at
)
synthetic_follows_data['created_at'] = [
    generate_timestamp(start_date=lower_bound) for lower_bound in earliest_follow_at
]

# Display data
synthetic_follows_data.head()

Unnamed: 0,follower_id,followee_id,created_at
0,2468,3128,2023-07-10 15:51:15
1,1278,1508,2022-06-04 00:43:24
2,1839,1511,2018-11-01 16:53:36
3,3719,826,2024-05-21 05:16:02
4,762,819,2020-10-04 20:08:13


## Saving data

In [10]:
# Write every generated table to its own CSV file in the working directory,
# without the DataFrame index column
output_tables = [
    ("users.csv", synthetic_user_data),
    ("photos.csv", synthetic_photo_data),
    ("tags.csv", synthetic_tag_data),
    ("photo_tags.csv", synthetic_photo_tag_data),
    ("likes.csv", synthetic_like_data),
    ("comments.csv", synthetic_comment_data),
    ("follows.csv", synthetic_follows_data),
]
for file_name, table in output_tables:
    table.to_csv(file_name, index=False)