NOTE:
The purpose of this notebook is to simulate an initial dataset, because the original one cannot be shared with third parties.
The business team gathered around 1,000 records of data, and the simulation below generates the same number of records.
This small dataset was the seed for generating a larger dataset that is big enough to meet the requirements of the project.
Particular values and probabilities were chosen to reflect some aspects of a real-life scenario.
To mirror the real-life scenario, the "fill_rate" feature is not included in the small dataset (this feature was unavailable); it will be simulated later in the large dataset.

In [2]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [3]:
#import os
#import sys

#os.chdir('C:/Projects/Metric Calculator')
#sys.path.append('C:/Projects')

In [4]:
#Importing the value listings of the qualitative features

from constants import (
    VOIVODSHIP_LIST,
    INDUSTRY_LIST,
    JOB_TYPE_LIST,
    CONTRACT_TYPE_DICT,
    BRAND_RECOGNITION_DICT,
    RECRUITMENT_DIFFICULTY_DICT,
    SHIFTS_DICT,
    OVERTIME_DICT,
    EMPLOYEE_SATISFACTION_DICT
    )

In [4]:
#Declaring the size of the simulated dataset and fixing the random seed

#Number of records to simulate
n_samples = 1000

#Seeding NumPy's global random generator, which every np.random call in this notebook draws from,
#so that "Restart Kernel -> Run All" reproduces exactly the same dataset
SEED = 42
np.random.seed(SEED)

In [5]:
#Generating 'Voivodship' feature's values
#Every voivodship is equally likely; the probability vector is derived from the length of
#VOIVODSHIP_LIST so that it always matches the number of entries

n_voivodships = len(VOIVODSHIP_LIST)
voivodship_data = np.random.choice(VOIVODSHIP_LIST, size = n_samples, p = [1 / n_voivodships] * n_voivodships)

In [6]:
#Generating 'Industry' data
#The weights are matched by position with the entries of INDUSTRY_LIST

industry_weights = [0.6, 0.3, 0.1]
industry_data = np.random.choice(a = INDUSTRY_LIST, size = n_samples, p = industry_weights)

In [7]:
#Generating 'Job Type' data
#Every job type is equally likely; the probability vector is derived from the length of
#JOB_TYPE_LIST so that it always matches the number of entries

n_job_types = len(JOB_TYPE_LIST)
job_type_data = np.random.choice(JOB_TYPE_LIST, size = n_samples, p = [1 / n_job_types] * n_job_types)

In [8]:
#Generating 'Salary' data
#It is assumed that each job type has a specific average salary; individual salaries are drawn
#from a normal distribution centred on that average with a standard deviation of 20% of it

#Average salary per job type
salary = {
    'Forklift operators, internal logistics operators, etc.' : 8480,
    'Machine operators, fitters, etc.' : 7420,
    'Packers, pickers, sorters, etc.' : 6360,
    'Production workers, production operator, etc.' : 7030,
    'Specialists, e.g. turners, welders, electricians etc.' : 10650,
    'Warehousment, warehouse workers, etc.' : 7200
}

#Standard deviation of a salary, expressed as a fraction of the job type's average
salary_std_factor = 0.2

#One draw per simulated record, in the same order as job_type_data.
#A job type missing from the 'salary' dictionary raises a KeyError, so salary_data can never
#end up shorter than the other simulated columns
salary_data = [
    np.random.normal(loc = salary[job_type], scale = salary[job_type] * salary_std_factor)
    for job_type in job_type_data
]

salary_data = np.round(salary_data, 2)

In [9]:
#Generating 'Bonus' data
#It is assumed that a bonus can amount to up to 30% of the salary

#Lower and upper bound of the bonus expressed as a fraction of the salary
bonus_factor_range = [0, 0.3]

#One bonus factor is drawn per salary. The draw is vectorised, so no loop variable
#overwrites the 'salary' dictionary defined in the salary cell
bonus_factors = np.random.uniform(bonus_factor_range[0], bonus_factor_range[1], size = len(salary_data))

bonus_data = np.round(np.asarray(salary_data) * bonus_factors, 2)

In [11]:
#Generating 'Benefit' data
#Benefits is a multichoice option (there are 3 options)

#Output categorical labels, indexed by the number of benefits granted (0, 1, 2 or 3)
benefit_labels = ['No benefits', 'One benefit', 'Two benefits', 'All benefits']

benefit_data = []

for _ in range(n_samples):
    #Each option corresponds to one binary variable - they are the inputs
    paid_meals = np.random.choice([0, 1])
    sport_card = np.random.choice([0, 1])
    medical_care = np.random.choice([0, 1])

    #The label depends only on how many benefits are granted; counting them covers all eight
    #combinations of the three flags, so every record receives a label of its own
    benefit = benefit_labels[paid_meals + sport_card + medical_care]

    benefit_data.append(benefit)

In [12]:
#Generating 'Contract length' data
#Possible values: the integers 1-24 or 'Indefinite', all equally likely
#NOTE(review): NumPy turns this mixed int/str list into an array of strings, so the numeric
#lengths are returned as text - confirm that this is intended

options = [*range(1, 25), 'Indefinite']

contract_length_data = np.random.choice(a = options, size = n_samples)

In [13]:
#Generating 'Contract Type' data
#np.random.choice needs a 1-D sequence, so the dictionary keys are materialised as a list;
#the probabilities are matched by position with the keys of CONTRACT_TYPE_DICT

contract_type_data = np.random.choice(list(CONTRACT_TYPE_DICT.keys()), size = n_samples, p = [0.15, 0.25, 0.60])

In [14]:
#Generating 'Brand recognition' data
#np.random.choice needs a 1-D sequence, so the dictionary keys are materialised as a list;
#the probabilities are matched by position with the keys of BRAND_RECOGNITION_DICT

brand_recognition_data = np.random.choice(list(BRAND_RECOGNITION_DICT.keys()), size = n_samples, p = [0.1, 0.15, 0.2, 0.35, 0.2])

In [15]:
#Generating 'Recruitment difficulty' data
#np.random.choice needs a 1-D sequence, so the dictionary keys are materialised as a list;
#the probabilities are matched by position with the keys of RECRUITMENT_DIFFICULTY_DICT

recruitment_difficulty_data = np.random.choice(list(RECRUITMENT_DIFFICULTY_DICT.keys()), size = n_samples, p = [0.1, 0.2, 0.4, 0.2, 0.1])

In [16]:
#Generating 'Shifts' data
#np.random.choice needs a 1-D sequence, so the dictionary keys are materialised as a list;
#the probabilities are matched by position with the keys of SHIFTS_DICT

shifts_data = np.random.choice(list(SHIFTS_DICT.keys()), size = n_samples, p = [0.3, 0.2, 0.2, 0.3])

In [17]:
#Generating 'Overtime' data
#np.random.choice needs a 1-D sequence, so the dictionary keys are materialised as a list;
#the probabilities are matched by position with the keys of OVERTIME_DICT

overtime_data = np.random.choice(list(OVERTIME_DICT.keys()), size = n_samples, p = [0.2, 0.5, 0.3])

In [18]:
#Generating 'Employee Satisfaction' data
#np.random.choice needs a 1-D sequence, so the dictionary keys are materialised as a list;
#the probabilities are matched by position with the keys of EMPLOYEE_SATISFACTION_DICT

employee_satisfaction_data = np.random.choice(list(EMPLOYEE_SATISFACTION_DICT.keys()), size = n_samples, p = [0.1, 0.15, 0.25, 0.35, 0.15])

In [20]:
#Creating the DataFrame that combines all simulated features (one column per feature)

feature_columns = {
    'Voivodship': voivodship_data,
    'Industry': industry_data,
    'Job Type': job_type_data,
    'Salary': salary_data,
    'Bonus': bonus_data,
    'Benefits': benefit_data,
    'Contract Length': contract_length_data,
    'Contract Type': contract_type_data,
    'Brand Recognition': brand_recognition_data,
    'Recruitment Difficulty': recruitment_difficulty_data,
    'Shifts': shifts_data,
    'Overtime': overtime_data,
    'Employee Satisfaction': employee_satisfaction_data,
}

df = pd.DataFrame(data = feature_columns)

In [21]:
#Previewing the first 10 rows of the simulated dataset
df.head(10)

Unnamed: 0,Voivodship,Industry,Job Type,Salary,Bonus,Benefits,Contract Length,Contract Type,Brand Recognition,Recruitment Difficulty,Shifts,Overtime,Employee Satisfaction
0,Małopolskie,Warehousing,"Machine operators, fitters, etc.",5462.52,978.73,One benefit,2,Contract of employment,Weak,Normal,Two 8hrs shifts,No overtime,Rather high
1,Świętokrzyskie,Manufacturing,"Specialists, e.g. turners, welders, electricia...",10197.17,1738.25,Two benefits,12,Mandate contract,Weak,Easy,One 8hrs shift,No overtime,Rather low
2,Zachodniopomorskie,Manufacturing,"Production workers, production operator, etc.",7568.14,818.59,One benefit,19,Contract of employment,Moderate,Normal,Two 8hrs shifts,No overtime,Rather high
3,Warmińsko-Mazurskie,Others,"Warehousment, warehouse workers, etc.",8825.89,2502.22,Two benefits,10,Contract work,Very weak,Very easy,Two 12hrs shifts,Ocassional overtime,Very low
4,Śląskie,Warehousing,"Packers, pickers, sorters, etc.",6782.31,1786.56,Two benefits,5,Contract of employment,Strong,Very difficult,Three 8hrs shifts,Ocassional overtime,Moderate
5,Warmińsko-Mazurskie,Manufacturing,"Forklift operators, internal logistics operato...",10622.78,120.89,One benefit,22,Mandate contract,Very strong,Difficult,One 8hrs shift,Regular overtime,Very low
6,Małopolskie,Manufacturing,"Specialists, e.g. turners, welders, electricia...",12258.43,3473.15,Two benefits,20,Contract of employment,Strong,Normal,Three 8hrs shifts,Ocassional overtime,Very low
7,Podlaskie,Others,"Production workers, production operator, etc.",5136.13,742.91,Two benefits,16,Contract of employment,Strong,Very difficult,Two 12hrs shifts,Ocassional overtime,Rather high
8,Dolnośląskie,Manufacturing,"Packers, pickers, sorters, etc.",6188.93,606.54,Two benefits,1,Contract of employment,Moderate,Easy,Three 8hrs shifts,Ocassional overtime,Rather high
9,Łódzkie,Manufacturing,"Machine operators, fitters, etc.",9337.03,944.49,Two benefits,17,Contract of employment,Strong,Normal,Two 12hrs shifts,No overtime,Moderate


In [23]:
#Saving the simulated dataset to an Excel file without the index column;
#the path is relative to the current working directory
output_path = 'Datasets/small_dataset.xlsx'
df.to_excel(output_path, index = False)