<a href="https://colab.research.google.com/github/ChloeReads/Software-Engineering-Pipeline/blob/main/Synthetic%20Data%20Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Synthetic Data Generation

## Setup Instructions

This notebook assumes you have only loaded `Synthetic Data Generation.ipynb` into Google Colab. If you have already uploaded the file listed below, you can skip the upload step and move straight to installing `requirements.txt`.

Run the following code block and upload the following file from the zip archive:

*   requirements.txt





In [None]:
# Upload step: lets the user send requirements.txt from their machine to the Colab session.
# `files` is also the name the final cell uses for files.download(...).
from google.colab import files
# `uploaded` keeps whatever files.upload() returns; no other cell in this notebook reads it.
uploaded = files.upload()

In [None]:
!pip install -r requirements.txt

In [None]:
# As random is built into Python it does not need to be installed like the other libraries
import random

# Importing installed libraries
import pandas as pd
from faker import Faker

# Imported here as well as in the upload cell: the setup instructions allow the
# upload cell to be skipped, and the final cell still needs `files` to download the CSVs.
from google.colab import files

# Setting Faker to use British names/locations
fake = Faker('en_GB')

# Seed both random sources so that Restart -> Run All regenerates the same datasets.
# Change SEED to obtain a different (but still repeatable) sample.
SEED = 42
random.seed(SEED)   # drives random.random / random.choice / random.randint below
Faker.seed(SEED)    # drives every value produced through `fake`

In [None]:
# Fixed pools of cities and departments. Faker could draw from a far larger set,
# but restricting the choices keeps the synthetic data closer to the production source.

# NOTE(review): ' Aberdeen' carries a leading space - confirm whether this is a
# deliberate data-quality defect or a typo before changing it.
locations = [
    'Edinburgh',
    'Glasgow',
    ' Aberdeen',
    'Leeds',
    'Manchester',
    'London',
    'Bristol',
    'Cardiff',
    'Belfast',
    'Birmingham',
    'Brighton',
]

# NOTE(review): 'Adminstration' looks like a misspelling of 'Administration' -
# confirm against the production values before changing it.
departments = [
    'Human Resources',
    'Risk Management',
    'Sales',
    'Marketing',
    'Adminstration',
    'Engineering',
    'Investment',
    'Sciences',
]

In [None]:
# Creating the Staff dataset

# `fake.unique` remembers every value it has already returned for as long as the
# `fake` object exists. Clearing it first makes this cell safe to re-run: without
# it, each re-run uses up more of the 90,000 possible IDs and Faker eventually
# raises UniquenessException.
fake.unique.clear()

data = []
# Sets the number of rows; this roughly matches the production dataset size
for i in range(10000):
    new_row = {
        # Unique ID number within an upper and lower limit
        'ID': fake.unique.random_int(min=10000, max=99999),
        # Full name. The locale set on `fake` determines roughly what kind of names
        # are generated; fine for this assessment, but in real-world scenarios it
        # could cause bias issues.
        'Full Name': fake.name(),
        # Drawn from the predefined list of locations (major UK cities)
        'Location': fake.random_element(elements=locations),
        # Drawn from the predefined list of departments
        'Department': fake.random_element(elements=departments),
    }
    data.append(new_row)

    # Roughly 5% of the time, append a copy of a randomly chosen existing row
    # (to mimic a real-world data quality issue). `data` is never empty here
    # because a row was appended just above, so no emptiness check is needed.
    if random.random() < 0.05:
        data.append(random.choice(data))


df_staff = pd.DataFrame(data)
display(df_staff.head())

In [None]:
# Start the Teams table from the distinct (ID, Department) pairs in the staff data;
# the duplicated staff rows collapse to a single row each.
df_teams_data = df_staff[['ID', 'Department']].drop_duplicates().copy()

# Invent between 1 and 4 numbered teams for every department that actually occurs.
teams_per_department = {}
for department in df_teams_data['Department'].unique():
    team_count = random.randint(1, 4)
    teams_per_department[department] = [
        f'{department} Team {n+1}' for n in range(team_count)
    ]


def pick_team(department_name):
    """Return one randomly chosen team belonging to the given department."""
    return random.choice(teams_per_department[department_name])


# Give every ID-Department pair one of its own department's teams
df_teams_data['Team'] = df_teams_data['Department'].map(pick_team)

df_teams = df_teams_data
display(df_teams.head())

In [None]:
# One pay-rate record per distinct staff ID (duplicated staff rows share one entry)
ids = df_staff['ID'].unique().tolist()

# Generate a random pay rate for each ID.
# The loop variable is `staff_id` rather than `id`: notebook cells run at module
# scope, so `for id in ids` would leave the builtin id() replaced by an int in
# every later cell. A comprehension also keeps the loop variable out of the
# notebook namespace altogether.
pay_rates_data = [
    {
        'ID': staff_id,
        # Whole number of up to two digits plus a positive decimal with two digits
        # either side of the point, rounded to 2 decimal places.
        'Pay Rate': round(fake.random_number(digits=2) + fake.pydecimal(left_digits=2, right_digits=2, positive=True), 2),
    }
    for staff_id in ids
]

df_pay_rates = pd.DataFrame(pay_rates_data)
display(df_pay_rates.head())

In [None]:
# Output file name -> DataFrame to export, in the order they are written and downloaded
exports = {
    'Staff.csv': df_staff,
    'Teams.csv': df_teams,
    'PayRates.csv': df_pay_rates,
}

# Write every CSV first. to_csv is called with its defaults, so each file also
# contains the DataFrame index as an unnamed first column.
for csv_name, frame in exports.items():
    frame.to_csv(csv_name)

# Then hand each file to the browser as a download
for csv_name in exports:
    files.download(csv_name)