In [1]:
import json
import random
from datetime import date, timedelta
import faker

In [2]:
# Seed both random sources so every Restart & Run All produces the same draws.
SEED = 42
random.seed(SEED)        # stdlib generator used by the campaign helpers below
faker.Faker.seed(SEED)   # generator shared by Faker instances
# NOTE: `usernames` is a set of strings, so its iteration order can still vary
# between kernels unless PYTHONHASHSEED is fixed as well.
fake = faker.Faker()


In [3]:
usernames_no = 1000
usernames = set()

# A set silently drops repeated draws, so keep asking Faker for usernames
# until `usernames_no` distinct ones have been collected.
while len(usernames) < usernames_no:
    usernames.add(fake.user_name())

In [6]:
def get_random_name_and_gender(skew=.6):
    """Draw a random full name together with the matching gender code.

    Args:
        skew: Threshold compared against a uniform draw from [0.0, 1.0).
            A draw greater than ``skew`` produces a male name, anything
            else a female one, so the default of .6 yields roughly
            60% female and 40% male users.

    Returns:
        tuple: ``(name, gender)`` where gender is ``'M'`` or ``'F'``.
    """
    if random.random() > skew:
        return fake.name_male(), 'M'
    return fake.name_female(), 'F'
    
def get_users(usernames):
    """Create one JSON-encoded fake user profile per username.

    Args:
        usernames: Iterable of usernames.

    Returns:
        list: JSON strings, one per username in iteration order. Each
        encodes the keys username, name, gender, email, age and address.
    """
    def _build_profile(handle):
        # Name and gender come from a single draw so they stay consistent.
        full_name, sex = get_random_name_and_gender()
        profile = {
            'username': handle,
            'name': full_name,
            'gender': sex,
        }
        profile['email'] = fake.email()
        profile['age'] = fake.random_int(min=18, max=90)
        profile['address'] = fake.address()
        return json.dumps(profile)

    return [_build_profile(handle) for handle in usernames]

# Build the JSON-encoded profiles for every username and preview the first three.
# NOTE: despite the singular name, `user` holds the whole list of profiles.
user = get_users(usernames)
user[:3]

['{"username": "gparks", "name": "Becky Williamson", "gender": "F", "email": "clementscaitlin@example.net", "age": 34, "address": "2000 Thomas Locks\\nNicholsstad, GA 57596"}',
 '{"username": "johnsonjanice", "name": "Joseph Coleman", "gender": "M", "email": "gwhite@example.org", "age": 74, "address": "9685 Strong Village\\nNew Morgan, AK 50653"}',
 '{"username": "lisa03", "name": "Walter Carroll", "gender": "M", "email": "jennifer35@example.net", "age": 19, "address": "PSC 6128, Box 3903\\nAPO AE 11812"}']

In [16]:
# Campaign name format:
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency

def get_type():
    """Pick one of the internal campaign type codes at random."""
    # The codes carry no meaning; they only mimic internal identifiers.
    return random.choice(['AKX', 'BYU', 'GRZ', 'KTR'])

def get_start_end_dates(today=None):
    """Pick a random campaign period and return its two boundary dates.

    The start date lies up to 365 days before or after the reference
    date, and the campaign lasts between 1 and 730 days.

    Args:
        today: Reference ``datetime.date`` the start is offset from.
            Defaults to ``date.today()``; pass a fixed date to get
            output that does not depend on when the code runs.

    Returns:
        tuple: ``(start, end)`` as strings in ``YYYYMMDD`` format.
    """
    if today is None:
        today = date.today()
    # Draw order (duration first, then offset) is kept so seeded runs
    # consume the random stream exactly as before.
    duration = random.randint(1, 2 * 365)
    offset = random.randint(-365, 365)
    start = today - timedelta(days=offset)
    end = start + timedelta(days=duration)
    date_format = "%Y%m%d"
    return start.strftime(date_format), end.strftime(date_format)

def get_age():
    """Return a random target age range formatted as 'LOW-HIGH'.

    LOW is a multiple of 5 between 20 and 45; the range width is a
    multiple of 5 between 5 and 25.
    """
    lower = random.randrange(20, 46, 5)
    width = random.randrange(5, 26, 5)
    upper = lower + width
    return '-'.join((str(lower), str(upper)))

def get_currency():
    """Pick the campaign currency code at random (GBP, EUR or USD)."""
    currencies = ('GBP', 'EUR', 'USD')
    return random.choice(currencies)

def get_gender():
    """Pick the campaign's target gender code at random.

    Returns 'M', 'F' or 'B' ('B' presumably means both; the notebook
    does not say).
    """
    gender_codes = ('M', 'F', 'B')
    return random.choice(gender_codes)

def get_campaign_name():
    """Assemble a random campaign name.

    The fields are joined with underscores in the order:
    type, start date, end date, target age, target gender, currency.
    """
    # The helpers are called in field order, one after another.
    parts = [get_type()]
    parts.extend(get_start_end_dates())
    parts += [get_age(), get_gender(), get_currency()]
    return '_'.join(parts)

In [17]:
# Campaign Data:
# name, budget, spent, clicks, impressions

def get_campaign_data():
    """Generate the figures for one random campaign.

    Takes no arguments. Returns a dict with these keys:
        cmp_name:   campaign name from get_campaign_name()
        cmp_bgt:    total budget, an integer between 10**3 and 10**6
        cmp_spent:  amount spent, an integer between 10**2 and the budget
        cmp_clicks: number of clicks, a triangular draw between 10**2
                    and 10**5 with mode 0.2 * 10**5, truncated to int
        cmp_impr:   number of impressions, a Gaussian draw with mean
                    0.5 * 10**6 and standard deviation 2, truncated to int
    """
    record = {'cmp_name': get_campaign_name()}
    record['cmp_bgt'] = random.randint(10**3, 10**6)
    # Spending is capped by the budget that was just drawn.
    record['cmp_spent'] = random.randint(10**2, record['cmp_bgt'])
    record['cmp_clicks'] = int(
        random.triangular(10**2, 10**5, 0.2 * 10**5))
    record['cmp_impr'] = int(random.gauss(0.5 * 10**6, 2))
    return record

In [18]:
def get_data(users):
    """Pair every user with a random batch of campaigns.

    Args:
        users: Iterable of user entries; each is stored as given.

    Returns:
        list: One ``{'user': ..., 'campaigns': [...]}`` dict per user,
        where the campaign list has between 2 and 8 entries from
        get_campaign_data().
    """
    def _random_campaigns():
        # The batch size is drawn before any campaign is generated.
        how_many = random.randint(2, 8)
        return [get_campaign_data() for _ in range(how_many)]

    return [
        {'user': owner, 'campaigns': _random_campaigns()}
        for owner in users
    ]

In [19]:
# Attach a random batch of campaigns to every user profile, then preview
# the first two entries.
rough_data = get_data(user)
rough_data[:2]

[{'user': '{"username": "gparks", "name": "Becky Williamson", "gender": "F", "email": "clementscaitlin@example.net", "age": 34, "address": "2000 Thomas Locks\\nNicholsstad, GA 57596"}',
  'campaigns': [{'cmp_name': 'GRZ_20220709_20220727_35-40_F_EUR',
    'cmp_bgt': 59433,
    'cmp_spent': 24647,
    'cmp_clicks': 31515,
    'cmp_impr': 500001},
   {'cmp_name': 'AKX_20211015_20221217_30-45_F_USD',
    'cmp_bgt': 69479,
    'cmp_spent': 35410,
    'cmp_clicks': 30102,
    'cmp_impr': 499997},
   {'cmp_name': 'BYU_20220513_20230414_45-50_B_GBP',
    'cmp_bgt': 860969,
    'cmp_spent': 119913,
    'cmp_clicks': 25507,
    'cmp_impr': 500002},
   {'cmp_name': 'GRZ_20230319_20250215_30-50_F_GBP',
    'cmp_bgt': 225838,
    'cmp_spent': 7676,
    'cmp_clicks': 49377,
    'cmp_impr': 500002},
   {'cmp_name': 'KTR_20221216_20240619_45-55_F_EUR',
    'cmp_bgt': 1564,
    'cmp_spent': 702,
    'cmp_clicks': 17145,
    'cmp_impr': 500000}]},
 {'user': '{"username": "johnsonjanice", "name": "Josep

In [20]:
# Flatten to one record per campaign, each tagged with its owner's profile.
# dict(campaign, user=...) builds a copy, so the campaign dicts held by
# `rough_data` are not modified and `rough_data` still shows its original
# contents if it is displayed again.
data = []
for datum in rough_data:
    for campaign in datum['campaigns']:
        data.append(dict(campaign, user=datum['user']))
data[:2]

[{'cmp_name': 'GRZ_20220709_20220727_35-40_F_EUR',
  'cmp_bgt': 59433,
  'cmp_spent': 24647,
  'cmp_clicks': 31515,
  'cmp_impr': 500001,
  'user': '{"username": "gparks", "name": "Becky Williamson", "gender": "F", "email": "clementscaitlin@example.net", "age": 34, "address": "2000 Thomas Locks\\nNicholsstad, GA 57596"}'},
 {'cmp_name': 'AKX_20211015_20221217_30-45_F_USD',
  'cmp_bgt': 69479,
  'cmp_spent': 35410,
  'cmp_clicks': 30102,
  'cmp_impr': 499997,
  'user': '{"username": "gparks", "name": "Becky Williamson", "gender": "F", "email": "clementscaitlin@example.net", "age": 34, "address": "2000 Thomas Locks\\nNicholsstad, GA 57596"}'}]

In [21]:
# Sanity check: number of users vs. number of flattened campaign records.
len(rough_data), len(data)

(1000, 5104)

In [23]:
# Serialise the flattened campaign records to data.json in the working directory.
with open('data.json', 'w') as out_file:
    json.dump(data, out_file)