# Synthetic Data Generator
This notebook generates synthetic data to populate the tables of MiniGram.

## Setup

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from faker import Faker
import random

# Initialize Faker for generating realistic usernames
faker = Faker()

# Extract the range of dates from the sample data for the 'created_at' attribute
DATE_RANGE_START = pd.to_datetime("2014-01-01 00:00:00")
DATE_RANGE_END = pd.to_datetime("2024-12-31 23:59:59")

# Function to generate synthetic timestamps within the desired range
def generate_timestamp(start_date = DATE_RANGE_START, end_date = DATE_RANGE_END):
    return faker.date_time_between(start_date=start_date, 
                                   end_date=end_date).strftime('%Y-%m-%d %H:%M:%S')

  from pandas.core import (


## Load Data
This section is only because I've created the dataset beforehand and needed to tweak a few things.

In [2]:
synthetic_user_data = pd.read_csv("Big Dataset/users.csv")
synthetic_photo_data = pd.read_csv("Big Dataset/photos.csv")
synthetic_tag_data = pd.read_csv("Big Dataset/tags.csv")
synthetic_photo_tag_data = pd.read_csv("Big Dataset/photo_tags.csv")
synthetic_like_data = pd.read_csv("Big Dataset/likes.csv")

## Users

In [None]:
# Function to generate synthetic usernames
def generate_username():
    return faker.user_name()

# Generate synthetic data
synthetic_user_data = pd.DataFrame({
    'id': range(1, 5001),
    'username': [generate_username() for _ in range(5000)],
    'created_at': [generate_timestamp() for _ in range(5000)]
})

# Display data
synthetic_user_data.head()

# Photos

In [None]:
# Function to generate synthetic image urls
def generate_image_url():
    return faker.image_url()

# Function to generate user id based on user column
def generate_user_id():
    return random.randint(synthetic_user_data['id'].min(), synthetic_user_data['id'].max())

# Generate synthetic data
synthetic_photo_data = pd.DataFrame({
    'id': range(1, 25001),
    'image_url': [generate_image_url() for _ in range(25000)],
    'user_id': [generate_user_id() for _ in range(25000)],
    'created_at': [None] * 25000 # Placeholder for timestamps
})

# Generating timestamps where the start time has to be after the user has created their profile
for index, row in synthetic_photo_data.iterrows():
    user_created_at = synthetic_user_data.loc[synthetic_user_data['id'] == row['user_id'], 'created_at'].values[0]
    user_created_at = pd.to_datetime(user_created_at)
    synthetic_photo_data.at[index, 'created_at'] = generate_timestamp(start_date=user_created_at)

# Display data
synthetic_photo_data.head()

## Tags

In [None]:
# Create a list of hashtags categorized as requested
tags = [
    "love", "instagood", "photooftheday", "fashion", "beautiful", "happy", "cute", "tbt", "followme", "picoftheday",
    "nature", "landscape", "sunset", "sunrise", "sky", "mountains", "ocean", "waterfall", "beach", "forest",
    "travel", "wanderlust", "explore", "adventure", "vacation", "travellife", "roadtrip", "instatravel", "travelgram", "tourism",
    "fitness", "workout", "gym", "fitfam", "fitlife", "health", "motivation", "fitnessmotivation", "yoga", "crossfit",
    "food", "foodie", "yummy", "delicious", "foodporn", "instafood", "homemade", "healthyfood", "coffee", "breakfast",
    "style", "outfitoftheday", "streetstyle", "fashionblogger", "styleinspo", "ootd", "fashionista", "mensfashion", "womensfashion", "accessories",
    "art", "drawing", "painting", "artist", "illustration", "artwork", "creative", "digitalart", "photography", "design",
    "music", "dance", "dj", "concert", "livemusic", "singer", "songwriter", "hiphop", "guitar", "drums",
    "fun", "party", "friends", "memes", "instafun", "comedy", "weekend", "goodvibes", "selfie", "bestoftheday",
    "family", "love", "momlife", "dadlife", "siblings", "relationshipgoals", "couple", "wedding", "engagement", "friendship",
    "inspiration", "quoteoftheday", "motivation", "goals", "positivevibes", "believe", "success", "mindset", "dreambig", "hustle",
    "tech", "technology", "smartphone", "gaming", "gadget", "electronics", "innovation", "computer", "programming", "developer",
    "sports", "soccer", "basketball", "football", "baseball", "running", "cycling", "swimming", "tennis", "golf",
    "cars", "carsofinstagram", "supercars", "motorcycle", "biker", "racing", "classiccars", "carlover", "vintagecars", "offroad",
    "animals", "pets", "dogsofinstagram", "catsofinstagram", "puppylove", "kitten", "wildlife", "petstagram", "animallovers", "birdsofinstagram",
    "summer", "winter", "spring", "autumn", "holiday", "christmas", "halloween", "newyear", "easter", "thanksgiving",
    "business", "entrepreneur", "startup", "marketing", "socialmedia", "branding", "ecommerce", "smallbusiness", "sales", "digitalmarketing",
    "photography", "photo", "camera", "portrait", "photoshoot", "photographer", "landscapephotography", "streetphotography", "naturephotography", "blackandwhite",
    "quotes", "lifequotes", "lovequotes", "motivationalquotes", "quote", "inspirationalquotes", "wordsofwisdom", "positivity", "thoughts", "wisdom",
    "funny", "jokes", "lol", "hilarious", "humor", "funnytweets", "meme", "laugh", "silly", "comedy",
    "instadaily", "daily", "bestoftheday", "like4like", "follow4follow", "instamood", "nofilter", "blackandwhite", "vintage", "retro",
    "selfcare", "selflove", "wellness", "mindfulness", "meditation", "growth", "personaldevelopment", "motivate", "lifecoach", "mentalhealth",
    "event", "celebration", "partytime", "birthday", "anniversary", "festival", "gathering", "gettogether", "nightlife", "club",
    "strong", "fit", "noexcuses", "trainhard", "cardio", "fitnessjourney", "healthyliving", "weightloss", "progress", "gymtime",
    "paradise", "cityscape", "destination", "getaway", "exploring", "bucketlist", "localtravel", "worldtravel", "globetrotting", "traveladdict"
]

# Date range end for tags, minimizing deletion where photos are older than tags in photo_tags df 
date_range_end_for_tags = pd.to_datetime("2019-12-31 23:59:59")

# Create a DataFrame from the list of hashtags
synthetic_tag_data = pd.DataFrame({
    'id': range(1,251),
    'tag_name': tags,
    'created_at': [generate_timestamp(end_date = date_range_end_for_tags) for _ in range(250)]
})

# Display data
synthetic_tag_data.head()

## Photo Tags

In [None]:
# Function to generate user id based on user column
def generate_photo_id():
    return random.randint(synthetic_photo_data['id'].min(), synthetic_photo_data['id'].max())

# Function to generate tag id based on tag column
def generate_tag_id():
    return random.randint(synthetic_tag_data['id'].min(), synthetic_tag_data['id'].max())

# Generate synthetic data
synthetic_photo_tag_data = pd.DataFrame({
    'photo_id': [generate_photo_id() for _ in range(100000)],
    'tag_id': [generate_tag_id() for _ in range(100000)]
})

# Deleting invalid rows (those where the pictures are older than the tags)
for index, row in synthetic_photo_tag_data.iterrows():
    if (synthetic_photo_data.loc[synthetic_photo_data['id'] == row['photo_id'], 'created_at'].values[0] <
         synthetic_tag_data.loc[synthetic_tag_data['id'] == row['tag_id'], 'created_at'].values[0]):
        synthetic_photo_tag_data.drop(index, inplace=True)
        
# Formatting
synthetic_photo_tag_data.reset_index(drop=True,inplace=True)

# Display data
synthetic_photo_tag_data.head()

## Likes

In [None]:
# Function to generate user id based on user column
def generate_user_id():
    return random.randint(synthetic_user_data['id'].min(), synthetic_user_data['id'].max())

# Function to generate image id based on image column
def generate_image_id():
    return random.randint(synthetic_photo_data['id'].min(), synthetic_photo_data['id'].max())

# Generate synthetic data
synthetic_like_data = pd.DataFrame({
    'user_id': [generate_user_id() for _ in range(250000)],
    'photo_id': [generate_image_id() for _ in range(250000)],
    'created_at': [None] * 250000 # Placeholder for timestamps
})

# Generating timestamps where 
# the start time has to be after the user has created their profile
# and after the photo has been posted
for index, row in synthetic_like_data.iterrows():
    user_created_at = synthetic_user_data.loc[synthetic_user_data['id'] == row['user_id'], 'created_at'].values[0]
    image_created_at = synthetic_photo_data.loc[synthetic_photo_data['id'] == row['photo_id'], 'created_at'].values[0]
    created_at = user_created_at if user_created_at < image_created_at else image_created_at
    created_at = pd.to_datetime(created_at)
    synthetic_like_data.at[index, 'created_at'] = generate_timestamp(start_date=created_at)
    
# Display data
synthetic_like_data.head()

## Comments

In [None]:
# Generic comments
generic_comments = [
    "Love this!", "Amazing shot!", "So beautiful!", "Wow, just wow!", "Incredible!", "Stunning!", "Goals!",
    "I need this!", "This is everything!", "You’re killing it!", "This made my day!", "Epic!", "On point!",
    "Yesss!", "Totally agree!", "So true!", "Perfection!", "Dreamy!", "Can’t stop looking at this!", "Absolutely stunning!",
    "Mind-blowing!", "Best thing I’ve seen all day!", "Keep shining!", "OMG, yes!", "Legend!", "This is pure art!", 
    "So inspiring!", "Love the vibes!", "That’s a mood!", "You’re a star!", "Can’t get enough of this!", "Pure magic!", 
    "Iconic!", "This is fire!", "So proud of you!", "This is life!", "I’m obsessed!", "How do you do it?", 
    "This is too good!", "Amazing work!", "Keep going!", "So proud!", "Speechless!", "I’m in awe!", "Love this energy!", 
    "Absolute goals!", "This gave me life!", "Take me here!", "You nailed it!", "Big mood!", "This is lit!", 
    "All the feels!", "Too cute!", "This made me smile!", "I needed this!", "Simply the best!", "Never stop creating!", 
    "Heart eyes for days!", "Your feed is goals!", "My fave post today!", "So much talent!", "Forever inspired by you!", 
    "This is next level!", "Unreal!", "Picture perfect!", "A true masterpiece!", "Breathtaking!", "You’re unstoppable!", 
    "Too good to be true!", "Keep slaying!", "You did it again!", "Wow, just incredible!", "My new favorite!", 
    "You have outdone yourself!", "Nothing but love for this!", "This is goals!", "Love the vibe!", "Fabulous!", 
    "You’re a rockstar!", "I’m living for this!", "Take all my likes!", "Beyond amazing!", "So creative!", 
    "You’re so talented!", "Forever a fan!", "Can’t even handle this!", "You never disappoint!", "Obsessed with this!", 
    "Keep shining bright!", "Major inspo!", "Bravo!", "Perfection!", "Y’all need to see this!", "Straight to my favorites!", 
    "Art in its purest form!", "Simply stunning!", "Crushing it!", "Keep doing you!", "Nothing compares to this!"
]

# Merging tables to get tags of photos (NEXT UPDATE)
photo_with_tags = synthetic_photo_data.merge(synthetic_photo_tag_data, how='inner',left_on='id',right_on='photo_id')
photo_with_tags = photo_with_tags.merge(synthetic_tag_data, how='inner', left_on='tag_id', right_on='id')
photo_with_tags[['photo_id','tag_name']].head()

# Function to generate user id based on user column
def generate_user_id():
    return random.randint(synthetic_user_data['id'].min(), synthetic_user_data['id'].max())

# Function to generate photo id based on photo column
def generate_photo_id():
    return random.randint(synthetic_photo_data['id'].min(), synthetic_photo_data['id'].max())

# Function to generate comment based on generic comments provided
def generate_comment():
    # Step 1) create num_of_comments that should be random
    # Step 2) each comment should be either:
    #          a) be something simple ie. wow!, very pretty! cool, love this, so cute
    #          b) be something that uses the tags to write a comment (range = (3 words, 20 words), 
    #                                                                 heavily skewed to around mean=7)
    return generic_comments[random.randint(0,98)]

    
# Generating the dataframe (24k rows because I wanted at least 4% of photos to have 0 comments)
synthetic_comment_data = pd.DataFrame({
    'comment': [None] * 24000,
    'photo_id': [generate_photo_id() for _ in range(24000)],
    'user_id': [None] * 24000,
    'created_at': [None] * 24000
})

# Making photo_id unique
synthetic_comment_data.drop_duplicates(subset='photo_id',inplace=True)

# Populating placeholders
for index, row in synthetic_comment_data.iterrows():
    
    # Creating list of comments for that specific photo
    num_of_comments = abs(round(np.random.normal(15, 15, size=(1,1))[0][0]))
    comments = []
    users = []
    for i in range(1,num_of_comments + 1):
        comments.append(generate_comment())  # appending comment to list of comments for that specific photo
        
        # adding the user that wrote the comment
        user_id = 0
        while user_id not in users:  # ensuring that the same user does not comment the same thing twice
            user_id = generate_user_id()
            users.append(user_id)
            
    # Each photo has a list of comments and a list of the respective users that commented
    synthetic_comment_data.at[index, 'comment'] = comments
    synthetic_comment_data.at[index, 'user_id'] = users
        
# Converting the list of comments into their own rows
synthetic_comment_data = synthetic_comment_data.explode(['comment','user_id'])

# Formatting
synthetic_comment_data['id'] = range(1,len(synthetic_comment_data)+1)
first_column = synthetic_comment_data.pop('id') 
synthetic_comment_data.insert(0, 'id', first_column)
synthetic_comment_data.reset_index(drop=True,inplace=True)
synthetic_comment_data.dropna(subset=["user_id"],inplace=True)  # removing rows with NaN values

# Generating timestamps where 
# the start time has to be after the user has created their profile
# and after the photo has been posted
for index, row in synthetic_comment_data.iterrows():
    user_created_at = synthetic_user_data.loc[synthetic_user_data['id'] == row['user_id'], 'created_at'].values[0]
    image_created_at = synthetic_photo_data.loc[synthetic_photo_data['id'] == row['photo_id'], 'created_at'].values[0]
    created_at = user_created_at if user_created_at < image_created_at else image_created_at
    created_at = pd.to_datetime(created_at)
    synthetic_comment_data.at[index, 'created_at'] = generate_timestamp(start_date=created_at)

# Display data
synthetic_comment_data.head()

## Follows

In [None]:
# Function to generate user id based on user column
def generate_user_id():
    return random.randint(synthetic_user_data['id'].min(), synthetic_user_data['id'].max())

# Generate synthetic data
synthetic_follows_data = pd.DataFrame({
    'follower_id': [generate_user_id() for _ in range(25000)],
    'followee_id': [generate_user_id() for _ in range(25000)],
    'created_at': [None] * 25000
})

# Generating timestamps where the start time has to be after the both users have created their profile
for index, row in synthetic_follows_data.iterrows():
    follower_created_at = synthetic_user_data.loc[synthetic_user_data['id'] == row['follower_id'], 'created_at'].values[0]
    followee_created_at = synthetic_user_data.loc[synthetic_user_data['id'] == row['followee_id'], 'created_at'].values[0]
    created_at = follower_created_at if follower_created_at < followee_created_at else followee_created_at
    created_at = pd.to_datetime(created_at)
    synthetic_follows_data.at[index, 'created_at'] = generate_timestamp(start_date=created_at)
    
    if row['follower_id'] == row['followee_id']:
        synthetic_follows_data.drop(index,inplace=True)
        
# Formatting
synthetic_follows_data.reset_index(drop=True,inplace=True)

# Display data
synthetic_follows_data.head()

## Bots

In [3]:
def generate_user_id():
    return random.randint(synthetic_user_data['id'].min(), synthetic_user_data['id'].max())

def generate_photo_id():
    return random.randint(synthetic_photo_data['id'].min(), synthetic_photo_data['id'].max())

In [4]:
# Create a random number of bots that should be around 0.5%-1% of users
num_of_bots = random.randint(len(synthetic_user_data)*0.005, len(synthetic_user_data)*0.01)
     
# Storing bot IDs
bots = set()
while len(bots) < num_of_bots:
    bot = generate_user_id()
    bots.add(bot)  # Sets automatically handle duplicates
bots = list(bots)

# Removing bots from like dataframe
synthetic_like_data[~synthetic_like_data['user_id'].isin(bots)]

# Traverse through each bot and add liked photos
for bot in bots:
    
    # Number of photos that bots like (each bot should like around 5%-7% of all photos)
    num_of_photos = random.randint(round(len(synthetic_photo_data)*0.05),
                                   round(len(synthetic_photo_data)*0.07))
    
    counter = 0
    
    # Going through a random set of photos for this specific bot
    while counter < num_of_photos:
        photo_id = generate_photo_id()
        user_created_at = synthetic_user_data.loc[synthetic_user_data['id'] == bot, 'created_at'].values[0]
        photo_created_at = synthetic_photo_data.loc[synthetic_photo_data['id'] == photo_id, 'created_at'].values[0]
        created_at = user_created_at if user_created_at < photo_created_at else photo_created_at
        created_at = pd.to_datetime(created_at)
        created_at = generate_timestamp(start_date = created_at)
        new_row = {'user_id': [bot], 'photo_id': [photo_id], 'created_at': [created_at]}
        new_row = pd.DataFrame(new_row)
        synthetic_like_data = pd.concat([synthetic_like_data, new_row], ignore_index = True)
        counter += 1

# Formatting
synthetic_like_data.reset_index(drop=True,inplace=True)

# Display data
synthetic_like_data

Unnamed: 0,user_id,photo_id,created_at
296150,510,289,2020-06-14 18:40:21
296151,510,307,2020-12-21 01:04:32
296152,510,15368,2023-12-15 01:50:26
296153,510,2244,2020-02-25 06:48:34
296154,510,11346,2021-12-18 10:30:34


## Influencers
Influencers are defined as users who have 10%-12% of all users as their followers

In [34]:
synthetic_follows_data = pd.read_csv("Big Dataset/follows.csv")

In [35]:
# Create a random number of influencers that should be around 2%-5% of users
num_of_influencers = random.randint(len(synthetic_user_data)*0.02, len(synthetic_user_data)*0.05)

# Storing influencers IDs
influencers = set()
while len(influencers) < num_of_influencers:
    influencer = generate_user_id()
    influencers.add(influencer) if influencer not in bots else None  # Making sure the ID does not belong to a bot
influencers = list(influencers)

# Removing influencers from follows dataframe
synthetic_follows_data[~synthetic_follows_data['followee_id'].isin(influencers)]

# Creating followers for influencers
for influencer in influencers:
    
    # Number of followers that influencers have (each influencer should have around 10%-12% of all users)
    num_of_followers = random.randint(round(len(synthetic_follows_data)*0.1),
                                      round(len(synthetic_follows_data)*0.12))
        
    # Storing the IDs of followers of this specific influencer
    followers = set()
    while len(followers) < num_of_influencers:
        follower = generate_user_id()
        followers.add(follower) if follower != influencer else None
    followers = list(followers)
    
    # Adding followers
    for follower in followers:
        follower_created_at = synthetic_user_data.loc[synthetic_user_data['id'] == follower, 'created_at'].values[0]
        followee_created_at = synthetic_user_data.loc[synthetic_user_data['id'] == influencer, 'created_at'].values[0]
        created_at = follower_created_at if follower_created_at < followee_created_at else followee_created_at
        created_at = pd.to_datetime(created_at)
        created_at = generate_timestamp(start_date=created_at)
        
        # Concatenating rows to follows dataframe
        new_row = {'follower_id': [follower], 'followee_id': [influencer], 'created_at': [created_at]}
        new_row = pd.DataFrame(new_row)
        synthetic_follows_data = pd.concat([synthetic_follows_data, new_row], ignore_index = True)

# Display data
synthetic_follows_data

Unnamed: 0,follower_id,followee_id,created_at
0,2468,3128,2023-07-10 15:51:15
1,1278,1508,2022-06-04 00:43:24
2,1839,1511,2018-11-01 16:53:36
3,3719,826,2024-05-21 05:16:02
4,762,819,2020-10-04 20:08:13
...,...,...,...
41116,1001,3582,2022-12-08 15:59:17
41117,2026,3582,2023-04-27 22:20:11
41118,4081,3582,2018-12-08 07:56:08
41119,3058,3582,2019-06-07 02:22:23


In [36]:
# Making sure that ~50% of each influencer's followers should like their posts

# Create df to see which users have liked the photos by influencers
influencer_photos = synthetic_photo_data.merge(synthetic_like_data, 
                                               left_on='id', 
                                               right_on='photo_id')
influencer_photos = influencer_photos[['id','user_id_x','user_id_y']]
influencer_photos.rename(columns={"id": "photo_id",
                                  "user_id_x": "influencer_id",
                                  "user_id_y": "user_liked_id"},
                         inplace=True)

# Loop through influencers
for influencer in influencers:
    
    # Get influencer's photos
    photos = synthetic_photo_data[synthetic_photo_data['user_id'] == influencer]['id']
    
    # Get influencer's followers
    followers = synthetic_follows_data[synthetic_follows_data["followee_id"] == influencer]['follower_id']
    
    # Get around 45% - 55% of influencer's followers
    follower_pct_for_likes = random.uniform(0.45,0.55)
    
    # Get random set of influencer's followers
    random_followers = random.sample(list(followers), round(len(followers)*follower_pct_for_likes))
    
    # Loop through influencer photos
    for photo in photos:
        
        # Loop through random_followers to see if they liked that specific photo
        for follower in random_followers:
            
            if ~((influencer_photos['photo_id'] == photo) &
                 (influencer_photos['influencer_id'] == influencer) & 
                 (influencer_photos['user_liked_id'] == follower)).any():
                
                # Making sure that both follower and photo exist before the like
                follower_created_at = synthetic_user_data.loc[synthetic_user_data['id'] == follower,
                                                              'created_at'].values[0]
                photo_created_at = synthetic_photo_data.loc[synthetic_photo_data['id'] == photo,
                                                            'created_at'].values[0]
                created_at = follower_created_at if follower_created_at < photo_created_at else photo_created_at
                created_at = pd.to_datetime(created_at)
                created_at = generate_timestamp(start_date=created_at)
                
                # Create new row to add (ie follower liked photo by influencer)
                new_row = {'user_id': [follower],
                           'photo_id': [photo], 
                           'created_at': [created_at]}
                new_row = pd.DataFrame(new_row)
                synthetic_like_data = pd.concat([synthetic_like_data, new_row], ignore_index = True)
    
# Display data
synthetic_like_data

Unnamed: 0,user_id,photo_id,created_at
0,1708,15802,2020-08-14 15:43:07
1,2875,2969,2020-08-14 11:35:53
2,828,4216,2024-12-18 21:36:04
3,2369,24888,2018-02-05 23:48:07
4,174,22020,2021-03-15 22:34:30
...,...,...,...
337917,1820,21882,2021-11-24 08:02:15
337918,1073,21882,2024-02-11 22:05:41
337919,1158,21882,2024-09-04 20:56:29
337920,2342,21882,2021-05-16 20:57:48


In [None]:
# ~25% of influencers' followers should comment on their posts

# Display data
synthetic_comment_data

## Saving data

In [None]:
synthetic_user_data.to_csv("users.csv", index=False)
synthetic_photo_data.to_csv("photos.csv", index=False)
synthetic_tag_data.to_csv("tags.csv", index=False)
synthetic_photo_tag_data.to_csv("photo_tags.csv", index=False)
synthetic_like_data.to_csv("likes.csv", index=False)
synthetic_comment_data.to_csv("comments.csv", index=False)
synthetic_follows_data.to_csv("follows.csv", index=False)

In [None]:
synthetic_comment_data.to_csv("comments.csv", index=False)