# Data Generator For Medium Articles

In [1]:
# Base libraries
import os
import time
import datetime as dt
import json

# Scientific libraries
import numpy as np
import pandas as pd

# Visual libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Helper libraries
from tqdm.notebook import tqdm, trange
from colorama import Fore, Back, Style
import warnings
warnings.filterwarnings('ignore')

## Fake First Names And Last Names

In [2]:
# Pool of given names used when sampling fake people.
# Some names appear more than once in the pool (e.g. 'Mary', 'Blake'), so
# they are proportionally more likely to be drawn.
first_names = """
Lane Ivor Roary Shannon Abdul Mary Cole Desirae
Jon Alex Mary Leo Bob Teresa Martha Fridge
Spencer Natasha Gideon Mason Jackson Evelyn Ella
Avery Scarlett Jack Micheal Mike Tom Alby Bruce
Minho Madison Eleanor Lisa Wyatt Julian Hazel
Gunther Ellie Vivian Merlin Arthur Harry Lincoln
Jaxon Mark Lucy Emma Andrey Marvin Hudson Christian
Christina Colton Landon Zoe Hunter Ivy Kinsley
Easton Hailey Miles Robert Jameson Piper Austin
Everett Madeline Peyton Parker Wesley Bryson Steve
Weston Emmet Sawyer Silas Bennet Everleigh Brooks
Hadley Waylon Kingston Cole Faith Ashton Braxton
Tyler Bryce Bentley Cahrlie Taylor Ashley Brandon
Andrea Parker Myles Legend Eloise Josie King Daisy
Karter Rhett Alyssa Dean Graham Blakely Blake
Hayden Lilly Ron Edward Kimberly Tucker Steven
Lauren Presley Avery Georgia Oscar Journee Archer
Brooke Olive River Payton Beckett Jeremy Preston
Gracie Blake Paige Remington Hope Walker Paul
Marley Alexis Millie Holden Nash Garett Jonathan
Jonny Bradley Beckham Lena Vanessa
""".split()

# Pool of family names used when sampling fake people.
# Some names appear more than once (e.g. 'Cooper'), so they are
# proportionally more likely to be drawn.
# NOTE(review): a few entries look misspelled ('Sancherz', 'Youmg',
# 'Guiterrez'); they are kept as-is so generated data does not change.
last_names = [
    'Reese', 'Pierce', 'Gibson', 'Little', 'Fry', 'Colon', 'Palmer', 'Smith',
    'Cooper', 'Bloom', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia',
    'Miller', 'Davis', 'Rodriguez', 'Martinez', 'Hernandez', 'Lopez',
    'Gonzalez', 'Wilson', 'Anderson', 'Thomas', 'Taylor', 'Moore', 'Jackson',
    'Martin', 'Lee', 'Perez', 'Thompson', 'White', 'Harris', 'Sancherz',
    # The comma after 'Allen' matters: without it Python concatenates the
    # adjacent literals into the single entry 'AllenKing'.
    'Clark', 'Ramirez', 'Lewis', 'Robinson', 'Walker', 'Youmg', 'Allen',
    'King', 'Wright', 'Scott', 'Torres', 'Nguyen', 'Hill', 'Flores', 'Green',
    'Adams', 'Nelson', 'Baker', 'Hall', 'Rivera', 'Campbell', 'Mitchell',
    'Carter', 'Roberts', 'Gomez', 'Phillips', 'Evans', 'Turner', 'Diaz',
    'Parker', 'Cruz', 'Edwards', 'Collins', 'Reyes', 'Stewart', 'Morris',
    'Morales', 'Murphy', 'Cook', 'Rogers', 'Guiterrez', 'Ortiz', 'Morgan',
    'Cooper', 'Peterson', 'Bailey', 'Reed', 'Kelly', 'Howard', 'Ramos', 'Kim',
    'Cox', 'Ward', 'Richardson', 'Watson', 'Brooks', 'Chavez', 'Wood', 'James',
    'Bennet', 'Gray', 'Mendoza', 'Ruiz', 'Hughes', 'Price', 'Alvarez',
    'Castillo', 'Sanders', 'Patel', 'Myers', 'Long', 'Ross', 'Foster',
    'Jimenez', 'Powell', 'Jenkins', 'Perry', 'Russel', 'Sullivan', 'Bell',
    'Coleman'
]

# 17. Data Uniformity Article

#### 17.1 Unit uniformity

Create a fake dataset of dates and average temperatures:

In [3]:
# Candidate temperature values: 10..19 plus three much larger readings
# (59, 58, 60) that stand out from the rest.
# NOTE(review): presumably the large values mimic a different unit - confirm.
temps = list(np.arange(10, 20, 1)) + [59, 58, 60]
# One row per calendar day in the range (both endpoints included)
dates = pd.date_range('2020-09-01', '2020-12-01')
# Selection weight for each entry of `temps`, in the same order; the three
# large values get 0.02 each.
temp_porbs = [.09, .09, .09, .1, .09, .1, .08, .1, .1, .1, 0.02, 0.02, 0.02]
data_dict = {
    # Draw exactly one temperature per date so both columns always have the
    # same length, whatever the date range is.
    'avg_temperature': np.random.choice(temps, size=len(dates), p=temp_porbs),
    'date': dates
}

data = pd.DataFrame(data_dict)
# data.to_csv('november_2020/uniformity/data/unit_uniformity.csv', index=False)

#### 17.2 Date uniformity

Create a fake dataset of 1000 people with random birthdays between 1960 and 2020. Then replace some of the birthdays with incorrectly formatted date strings for later practice in the main article.

In [4]:
# Number of fake people to generate
size = 1000
# Every calendar day from 1960-01-01 through 2020-01-01
birthdays = pd.date_range('1960-01-01', '2020-01-01')
# Deliberately malformed dates: read as YYYY-MM-DD, the middle field is not a
# valid month in any of them.
birthdays_2 = ['2000-31-21', '1960-25-12', '1960-24-01']
data_dict = {
    'first_name': np.random.choice(first_names, size),
    'last_name': np.random.choice(last_names, size),
    'birthday': np.random.choice(birthdays, size)
}

data = pd.DataFrame(data_dict)
# Switch the column to object dtype *before* injecting the malformed strings.
# Writing a string into a datetime64 column makes pandas try to interpret it
# as a timestamp, and otherwise relies on an implicit dtype upcast that recent
# pandas versions deprecate; with an object column each string is stored
# verbatim.
data['birthday'] = data['birthday'].astype(object)
# 19 injections into rows 100..899; the same row may be hit more than once,
# so the number of malformed birthdays is at most 19.
for _ in range(1, 20):
    data.loc[np.random.randint(100, 900),
             'birthday'] = np.random.choice(birthdays_2)
# Store every birthday as a plain string
data['birthday'] = data['birthday'].astype('str')
# data.to_csv('november_2020/uniformity/data/date_uniformity.csv', index=False)

# 22. Cross Field Validation

Create a fake dataset with 10,000 rows. Each row describes one fake person: first name, last name, birthday, weight, and height. After generating the dataset, add a Body Mass Index column computed with the BMI formula, then add an age column as well.

Next, draw 89 random rows to overwrite with an incorrect BMI and 77 random rows to overwrite with an incorrect age, to serve as examples in the main article.

In [5]:
# Number of fake people to generate
size = 10000

# Value pools the columns are sampled from
birthday = pd.date_range(start='01-01-1980', end='01-01-2010')
weight = np.arange(50, 100)
height = np.arange(160, 195)

# Each column is drawn independently from its pool, `size` values per column
data_dict = {
    column: np.random.choice(pool, size)
    for column, pool in (
        ('first_name', first_names),
        ('last_name', last_names),
        ('birthday', birthday),
        ('weight', weight),
        ('height', height),
    )
}

data = pd.DataFrame(data_dict)

def bmi(row):
    """
    Compute the Body Mass Index for one person.

    `row` must support lookup by the keys 'weight' and 'height'. The height
    is divided by 100 before being squared (presumably centimetres to
    metres), and the weight is divided by that square.
    """
    scaled_height = row['height'] / 100
    return row['weight'] / scaled_height ** 2


def age(row):
    """
    Return a person's age in completed years as of today.

    `row` must support lookup by the key 'birthday', and that value must
    expose `year`, `month` and `day` attributes (e.g. a `datetime.date` or a
    pandas Timestamp).
    """
    today = dt.date.today()
    born = row['birthday']
    years = today.year - born.year
    # The bare year difference is one too high until this year's birthday
    # has actually been reached.
    if (today.month, today.day) < (born.month, born.day):
        years -= 1
    return years

# Derived columns. The BMI is rounded to one decimal and then cut down to a
# whole number by the integer cast.
data['bmi'] = data.apply(bmi, axis=1).round(1).astype('int')
data['age'] = data.apply(age, axis=1).round(1).astype('int')

# Pick the rows to corrupt from the frame's own index (rather than a
# hard-coded row count) and without replacement, so exactly 89 / 77 distinct
# rows are affected (fewer only if the frame is smaller than that).
bmi_incor_indices = np.random.choice(
    data.index, min(89, len(data)), replace=False)
age_incor_indices = np.random.choice(
    data.index, min(77, len(data)), replace=False)

# Generate incorrect bmi: one independent random value per selected row
# (a bare randint call would give every row the same number). A drawn value
# can still coincide with the true one by chance.
data.loc[bmi_incor_indices, 'bmi'] = np.random.randint(
    10, 50, size=len(bmi_incor_indices))
# Generate incorrect age, likewise one value per selected row
data.loc[age_incor_indices, 'age'] = np.random.randint(
    20, 80, size=len(age_incor_indices))
# Save it to the necessary folder
# data.to_csv('november_2020/cross_field_validation/people.csv', index=False)