# Chapter 13 - Data Science
## Data Preparation

## 0 - Setting up the notebook

In [1]:
import json
import random
from datetime import date, timedelta

import faker

## 1 - Preparing the Data

In [2]:
# create the shared Faker instance; get_users and get_random_name
# call it to produce usernames, names, emails, ages and addresses
fake = faker.Faker()

In [3]:
# generate user profiles
# simulate user data coming from an API: get_users yields one JSON
# string per user, and we collect those strings into a list.
def get_users(no_of_users):
    """Lazily yield ``no_of_users`` fake user profiles as JSON strings.

    Usernames are drawn through ``fake.unique``; genders are sampled
    up front from "M", "F" and "O" with weights 0.43, 0.47 and 0.1.
    Each yielded string encodes a dict with the keys ``username``,
    ``name``, ``gender``, ``email``, ``age`` and ``address``.
    """
    handles = (fake.unique.user_name() for _ in range(no_of_users))
    sampled_genders = random.choices(
        ["M", "F", "O"], weights=[0.43, 0.47, 0.1], k=no_of_users
    )
    for handle, sampled_gender in zip(handles, sampled_genders):
        profile = {
            "username": handle,
            # the name is picked to agree with the sampled gender
            "name": get_random_name(sampled_gender),
            "gender": sampled_gender,
            "email": fake.email(),
            "age": fake.random_int(min=18, max=90),
            "address": fake.address(),
        }
        yield json.dumps(profile)


def get_random_name(gender):
    """Return a fake full name that agrees with ``gender``.

    "F" gives a female name and "M" a male name; any other value
    falls back to a non-binary name.
    """
    if gender == "F":
        return fake.name_female()
    if gender == "M":
        return fake.name_male()
    return fake.name_nonbinary()


# materialise the generator so the profiles can be sliced and reused
users = [*get_users(1000)]
users[:3]

['{"username": "marissacantrell", "name": "Natasha Pitts", "gender": "F", "email": "brittanyjones@example.net", "age": 45, "address": "USS Mcknight\\nFPO AE 94126"}',
 '{"username": "katherinehamilton", "name": "Brett Gonzalez", "gender": "M", "email": "gsherman@example.org", "age": 60, "address": "6687 Christopher Wells\\nSouth Michele, CT 29069"}',
 '{"username": "moorejeremy", "name": "John Perkins", "gender": "M", "email": "taylordavis@example.net", "age": 80, "address": "6151 Williams Terrace\\nPort Johnmouth, AL 41707"}']

In [4]:
# campaign name format:
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency
def get_type():
    """Pick one internal campaign type code at random.

    The codes are meaningless placeholders used only as examples.
    """
    return random.choice(("AKX", "BYU", "GRZ", "KTR"))


def get_start_end_dates():
    """Return a random campaign window as two ``YYYYMMDD`` strings.

    The campaign lasts between 1 and 730 days and starts at most
    365 days before or after today. The result is the tuple
    ``(start, end)``.
    """
    date_format = "%Y%m%d"
    # draw the duration first, then the offset, to keep the
    # sequence of random numbers consumed unchanged
    length_in_days = random.randint(1, 2 * 365)
    days_back = random.randint(-365, 365)
    first_day = date.today() - timedelta(days=days_back)
    last_day = first_day + timedelta(days=length_in_days)
    return (
        first_day.strftime(date_format),
        last_day.strftime(date_format),
    )


def get_age_range():
    """Return a random target age bracket formatted as ``"low-high"``.

    The lower bound is a multiple of 5 between 20 and 45; the
    bracket is 5 to 25 years wide, also in steps of 5.
    """
    lower = random.randrange(20, 46, 5)
    width = random.randrange(5, 26, 5)
    return f"{lower}-{lower + width}"


def get_gender():
    """Pick the campaign's target gender code: "M", "F" or "A"."""
    gender_codes = "M", "F", "A"
    return random.choice(gender_codes)


def get_currency():
    """Pick the campaign's currency code: "GBP", "EUR" or "USD"."""
    currency_codes = "GBP", "EUR", "USD"
    return random.choice(currency_codes)


def get_campaign_name():
    """Build a random campaign name.

    The fields are joined with underscores in this order:
    type, start date, end date, age range, gender, currency.
    """
    # list items are evaluated left to right, so the helpers run
    # in the same order as the fields appear in the name
    fields = [
        get_type(),
        *get_start_end_dates(),
        get_age_range(),
        get_gender(),
        get_currency(),
    ]
    return "_".join(fields)

In [5]:
# campaign data:
# name, budget, spent, clicks, impressions
def get_campaign_data():
    """Generate one random campaign record.

    Returns a dict with the keys ``cmp_name``, ``cmp_bgt``,
    ``cmp_spent``, ``cmp_clicks`` and ``cmp_impr``; every value
    except the name is an int.
    """
    campaign = {"cmp_name": get_campaign_name()}
    campaign["cmp_bgt"] = random.randint(10**3, 10**6)
    # the amount spent is capped by the budget just drawn
    campaign["cmp_spent"] = random.randint(
        10**2, campaign["cmp_bgt"]
    )
    # triangular(low, high, mode): clicks peak around 0.2 * 10**5
    campaign["cmp_clicks"] = int(
        random.triangular(10**2, 10**5, 0.2 * 10**5)
    )
    # gauss(mu, sigma): impressions stay very close to 0.5 * 10**6
    campaign["cmp_impr"] = int(random.gauss(0.5 * 10**6, 2))
    return campaign

In [6]:
# assemble the logic to get the final version of the rough data
# data will be a list of dictionaries. Each dictionary will follow
# this structure:
# {'user': user_json, 'campaigns': [c1, c2, ...]}
# where user_json is the JSON string version of a user data dict
# and c1, c2, ... are campaign dicts as returned by
# get_campaign_data


def get_data(users):
    """Pair every user with a random batch of campaigns.

    Returns a list with one ``{"user": ..., "campaigns": [...]}``
    dict per item of ``users``; each campaigns list holds between
    2 and 8 dicts produced by ``get_campaign_data``.
    """
    return [
        {
            "user": user,
            "campaigns": [
                get_campaign_data()
                for _ in range(random.randint(2, 8))
            ],
        }
        for user in users
    ]

## 2 - Cleaning the data

In [7]:
# fetch simulated rough data: get_data returns one
# {"user": ..., "campaigns": [...]} dict per entry in users
rough_data = get_data(users)

rough_data[:2]  # let us take a peek

[{'user': '{"username": "marissacantrell", "name": "Natasha Pitts", "gender": "F", "email": "brittanyjones@example.net", "age": 45, "address": "USS Mcknight\\nFPO AE 94126"}',
  'campaigns': [{'cmp_name': 'KTR_20250919_20261202_20-25_A_EUR',
    'cmp_bgt': 571858,
    'cmp_spent': 116646,
    'cmp_clicks': 43160,
    'cmp_impr': 500000},
   {'cmp_name': 'BYU_20250126_20250529_40-65_A_EUR',
    'cmp_bgt': 470697,
    'cmp_spent': 184449,
    'cmp_clicks': 24313,
    'cmp_impr': 500000},
   {'cmp_name': 'GRZ_20250309_20260620_25-35_A_GBP',
    'cmp_bgt': 407455,
    'cmp_spent': 228427,
    'cmp_clicks': 34386,
    'cmp_impr': 499999},
   {'cmp_name': 'GRZ_20250720_20260115_45-60_F_USD',
    'cmp_bgt': 189484,
    'cmp_spent': 176086,
    'cmp_clicks': 42771,
    'cmp_impr': 499995}]},
 {'user': '{"username": "katherinehamilton", "name": "Brett Gonzalez", "gender": "M", "email": "gsherman@example.org", "age": 60, "address": "6687 Christopher Wells\\nSouth Michele, CT 29069"}',
  'campaig

In [8]:
# Let's start by building a flat version of the data.
# We want a list whose items are dicts: each one is a copy of an
# original campaign dict plus the JSON string of its user.
# We build a new dict rather than calling campaign.update(...), so
# the dicts inside rough_data are left untouched and re-running or
# re-inspecting earlier cells still shows the original rough data.

data = []
for datum in rough_data:
    for campaign in datum["campaigns"]:
        # campaign keys first, then "user": same key order as before
        data.append({**campaign, "user": datum["user"]})
data[:2]  # let us take another peek

[{'cmp_name': 'KTR_20250919_20261202_20-25_A_EUR',
  'cmp_bgt': 571858,
  'cmp_spent': 116646,
  'cmp_clicks': 43160,
  'cmp_impr': 500000,
  'user': '{"username": "marissacantrell", "name": "Natasha Pitts", "gender": "F", "email": "brittanyjones@example.net", "age": 45, "address": "USS Mcknight\\nFPO AE 94126"}'},
 {'cmp_name': 'BYU_20250126_20250529_40-65_A_EUR',
  'cmp_bgt': 470697,
  'cmp_spent': 184449,
  'cmp_clicks': 24313,
  'cmp_impr': 500000,
  'user': '{"username": "marissacantrell", "name": "Natasha Pitts", "gender": "F", "email": "brittanyjones@example.net", "age": 45, "address": "USS Mcknight\\nFPO AE 94126"}'}]

In [9]:
# Warning: Uncommenting and executing this cell will overwrite data.json
# with open("data.json", "w") as stream:
#     stream.write(json.dumps(data))