
<h1 align="center"><font size="5">How to Create Fake Data with Python</font></h1>

<h5 align="center"><font size="5">By:<br><br> Fatima, Sayeda </font></h5>

In [1]:
from IPython.display import HTML
def display_alert_color_1(message, color ,alert_type = "info"):
    display(HTML(f"<div class= 'alert alert-block alert-{alert_type}'> <H2> <span style='color:{color}'> {message} </span> </H2></div>"))
def display_alert_color_2(message, color ,alert_type = "info"):
    display(HTML(f"<div class= 'alert alert-block alert-{alert_type}'> <H3> <span style='color:{color}'> {message} </span> </H3></div>"))
from termcolor import colored     # To Display Colored Terminal Output
from datetime import datetime
script_start_time = (datetime.now()).strftime("%H:%M:%S")
display_alert_color_1("What the Process does?", "darkblue","success")
print(colored("The following process uses 'random' and 'barnum' python packages to create fake data ", 'green', attrs=['bold']))
display_alert_color_2("Get Script Start Time", "darkblue","info")
print(colored("Script Start Time: ", 'blue', attrs=['bold']) + colored(script_start_time, 'magenta', attrs=['bold']))
display_alert_color_2("Generate Fake Data", "darkmagenta","warning")

[1m[32mThe following process uses 'random' and 'barnum' python packages to create fake data [0m


[1m[34mScript Start Time: [0m[1m[35m19:19:30[0m


In [2]:
import random
import barnum
import pandas as pd
import re
import string
import datetime

# First create lists so that our function can restrict randomly generate data to assigned parameters
cost_centre_list = ["A001", "AB02", "CC28", "D456"]# contains strings representing cost centre codes.
profit_centre_list = ["00125", "23098", "23451", "00023", "87621"]# contains strings representing profit centre codes
currency_list = ["EUR", "USD", "INR", "KDD"]# contains strings representing different currency codes

# Define Function create_start_date that takes two optional parameters min_age and max_age with default values of 0.
def create_start_date(min_age=0, max_age=0):
    age = random.randint(min_age, max_age)# generates a random integer age between min_age and max_age inclusive
    start = datetime.date.today() - datetime.timedelta(days=random.randint(0, 365))# creates a random date within the past year by subtracting a random number of days from today's date. Finally, it subtracts age multiplied by 365 days from the randomly generated date to get a start date that is a random number of years in the past, between min_age and max_age inclusive.
    return start - datetime.timedelta(days=age * 365) 
start_date = create_start_date(min_age=1, max_age=5)# calls the function with arguments min_age=1 and max_age=5, and stores the returned value in a variable start_date
end_date = datetime.date.today()

def extract_data():
    # Use the barnum package to generate company data
    company_fields = [] # Create empty list to collect data
    email_suff = ["com", "org", "net", "edu"]
    for i in range(4): # creates 4 companies, increase the number as desired
        # Create a random company code with 2 capital letters + 2 numbers
        company_code = ''.join(random.choices(string.ascii_uppercase, k=2)) + ''.join(random.choices(string.digits, k=2))
        # Create Company name
        company_name = barnum.create_company_name(biz_type="Generic")
        # Create Company email
        ## Split the company name into words
        words = re.findall(r'\w+', company_name)
        ## Create email address using full company name without spaces + @ + first two words of company name + "email_suff"
        company_email = (words[0] + words[1].lower()) + "@" + words[0][0] + words[1][1] + "." + random.choice(email_suff)#suff
        data = {
            "company_name": company_name,
            "company_code": company_code,
            "city": barnum.create_city_state_zip(),
            "phone_number": barnum.create_phone(),
            "company_email": company_email
        }
        company_fields.append(data) # append created data to empty list "company_fields"
        company_data = pd.DataFrame(company_fields) # convert to pandas dataframe
        
        #### Now Generate Fake Invoice Data ####
        invoice_fields = [] # Create empty list to collect data
        for j in range(random.randint(100,200)): # Define minimum and maximum range
            first_name, last_name = barnum.create_name()
            name = first_name + " " + last_name
            invoice_value = random.randint(1000, 25000) # Define minimum and maximum values
            data = {
            "date": random.choice(pd.date_range(start=start_date, end=end_date).strftime("%Y-%m-%d")),
            "invoice": barnum.create_pw(),
            "text": barnum.create_sentence()[:10],
            "employee": name,
            "employee_email": last_name.lower() + "." + first_name[0].lower(),
            "cost_centre": random.choice(cost_centre_list),
            "profit_centre": random.choice(profit_centre_list),
            "currency": random.choice(currency_list),
            "value": invoice_value
            }
            invoice_fields.append(data) # append created data to empty list "invoice_fields"
        invoice_data = pd.DataFrame(invoice_fields) # convert to pandas dataframe
        invoice_data["date"] = pd.to_datetime(invoice_data["date"]) # date string to date object
        invoice_data["month"] = invoice_data["date"].dt.strftime("%b") # extract month from date object
        invoice_data["year"] = invoice_data["date"].dt.year # extract year from date object

        # Combine the company and invoice data
        complete_records = pd.concat([company_data] * len(invoice_data), ignore_index=True).merge(invoice_data, left_index=True, right_index=True, how='outer').sort_values(by=['company_code', 'date']).fillna("X", inplace=False)
        complete_records['employee_email'] = complete_records['employee_email']+"@"+(complete_records['company_email'].str.split('@').str[1])
        #print(list(complete_records.columns))# you may uncomment this line to get all column names as list and then rearrange or delete the names you don't want in the "final_columns" list below
        final_columns = ['company_code','company_name', 'city', 'company_email', 'phone_number', 'year', 'month', 'date', 'invoice', 'text', 'employee', 'employee_email', 'cost_centre', 'profit_centre', 'currency', 'value']
        complete_records = complete_records[complete_records["invoice"] != "X"].reset_index(drop=True).reindex(columns=final_columns)
    return complete_records

complete_records = extract_data() # run the function above and return dataframe with fake data
complete_records
#print(complete_records)

Unnamed: 0,company_code,company_name,city,company_email,phone_number,year,month,date,invoice,text,employee,employee_email,cost_centre,profit_centre,currency,value
0,MW96,Design Pacific Vision Organization,"(85554, Young, AZ)",Designpacific@Da.net,(801)670-5405,2020,Jul,2020-07-10 00:00:00,Eoi8eBT1,Commodo ni,Lidia Stallworth,stallworth.l@Da.net,CC28,23098,EUR,3204
1,MW96,Design Pacific Vision Organization,"(85554, Young, AZ)",Designpacific@Da.net,(801)670-5405,2020,Aug,2020-08-09 00:00:00,KJ0DrL6r,Tincidunt,Andres Asbury,asbury.a@Da.net,A001,00125,USD,23587
2,MW96,Design Pacific Vision Organization,"(85554, Young, AZ)",Designpacific@Da.net,(801)670-5405,2020,Sep,2020-09-21 00:00:00,Z83naMQQ,Ea iriure,Adell Pritchett,pritchett.a@Da.net,A001,00125,INR,15343
3,MW96,Design Pacific Vision Organization,"(85554, Young, AZ)",Designpacific@Da.net,(801)670-5405,2020,Oct,2020-10-03 00:00:00,Ob2PGr9x,Minim susc,Connie Mclean,mclean.c@Da.net,CC28,23451,USD,22570
4,MW96,Design Pacific Vision Organization,"(85554, Young, AZ)",Designpacific@Da.net,(801)670-5405,2020,Nov,2020-11-29 00:00:00,ZH2bpBC0,Dolore nul,Mckinley Posey,posey.m@Da.net,CC28,00023,KDD,3124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,VS23,Net Solutions Agency,"(93556, Ridgecrest, CA)",Netsolutions@No.net,(443)307-7290,2022,Sep,2022-09-30 00:00:00,ALyqm7A8,Sit illum,Shonda Milburn,milburn.s@No.net,CC28,00125,EUR,23484
107,VS23,Net Solutions Agency,"(93556, Ridgecrest, CA)",Netsolutions@No.net,(443)307-7290,2022,Nov,2022-11-07 00:00:00,D1MX8Tds,Duis feugi,Myrtis Martins,martins.m@No.net,AB02,23098,EUR,7712
108,VS23,Net Solutions Agency,"(93556, Ridgecrest, CA)",Netsolutions@No.net,(443)307-7290,2022,Nov,2022-11-11 00:00:00,mscAw44Q,Et diam pr,Vera Coffin,coffin.v@No.net,CC28,00125,EUR,24384
109,VS23,Net Solutions Agency,"(93556, Ridgecrest, CA)",Netsolutions@No.net,(443)307-7290,2022,Nov,2022-11-13 00:00:00,PH79uvqT,Quis iusto,Gena Stevens,stevens.g@No.net,CC28,23451,INR,11553


In [3]:
display_alert_color_2("Summarize Data in Pivot Tables", "darkmagenta","warning")
print(colored("We will create a simple pivot table to summarize invoice count by company.", 'green', attrs=['bold']))

[1m[32mWe will create a simple pivot table to summarize invoice count by company.[0m


In [4]:
pivot_table_1 = complete_records.pivot_table(index='company_name', values=['invoice', 'value'], aggfunc={'invoice':'count'})# groups data by "company_name" column and aggregates "invoice" column by counting the number of occurrences and the "value" column by summing up the values
pivot_table_1.reset_index(inplace=True) #moves the index to a new column named company_name using the reset_index method and 
pivot_table_1.index += 1 # sets the index to start from 1 by adding 1 to each index value.
pivot_table_1

Unnamed: 0,company_name,invoice
1,Design Pacific Vision Organization,28
2,East People Resource International,28
3,Net Solutions Agency,28
4,Technology Group,27


In [5]:
print(colored("We will create a second pivot table to summarize invoice count, total invoice value by currency and company.", 'magenta', attrs=['bold']))

[1m[35mWe will create a second pivot table to summarize invoice count, total invoice value by currency and company.[0m


In [6]:
pivot_table_2 = complete_records.pivot_table(index=['company_name', 'currency'], values=['invoice', 'value'], aggfunc={'invoice':'count', 'value':'sum'})# groups data pivot_table_1.reset_index(inplace=True) #moves the index to a new column named company_name using the reset_index method and 
pivot_table_2.reset_index(level=[0,1], inplace=True)# moves the index to new columns named 'company_name' and 'currency' using the reset_index method and 
pivot_table_2

Unnamed: 0,company_name,currency,invoice,value
0,Design Pacific Vision Organization,EUR,7,79889.0
1,Design Pacific Vision Organization,INR,4,63221.0
2,Design Pacific Vision Organization,KDD,4,34996.0
3,Design Pacific Vision Organization,USD,13,177804.0
4,East People Resource International,EUR,10,120844.0
5,East People Resource International,INR,8,110428.0
6,East People Resource International,KDD,5,79306.0
7,East People Resource International,USD,5,63787.0
8,Net Solutions Agency,EUR,9,130543.0
9,Net Solutions Agency,INR,9,78887.0


In [7]:
from datetime import datetime # we need to import this module again since it was overwritten when we "import datetime" 
script_end_time = (datetime.now()).strftime("%H:%M:%S")
# convert time string to datetime
t1 = datetime.strptime(script_start_time, "%H:%M:%S")
t2 = datetime.strptime(script_end_time, "%H:%M:%S")
delta = t2- t1
sec = delta.total_seconds()# get difference in seconds
min = (round(sec/60))# get difference in minutes
hours = round(sec / (60 * 60)) # get difference in hours
display_alert_color_2("Get Script End Time", "darkblue","info")
print(colored("Script End Time: ", 'blue', attrs=['bold']) + colored((script_end_time), 'magenta', attrs=['bold'])
     + colored("\n\nTotal Time Taken: ", 'green', attrs=['bold']) + colored((t2-t1), 'red', attrs=['bold'])
      + colored("\nSeconds: ", 'green', attrs=['bold']) + colored(sec, 'red', attrs=['bold'])
      + colored("\nMinutes: ", 'green', attrs=['bold']) + colored(min, 'red', attrs=['bold'])
      + colored("\nHours: ", 'green', attrs=['bold']) + colored(hours, 'red', attrs=['bold']))

[1m[34mScript End Time: [0m[1m[35m19:19:34[0m[1m[32m

Total Time Taken: [0m[1m[31m0:00:04[0m[1m[32m
Seconds: [0m[1m[31m4.0[0m[1m[32m
Minutes: [0m[1m[31m0[0m[1m[32m
Hours: [0m[1m[31m0[0m
