# Normalising data to store them in MS SQL Server

For efficient database storage in MS SQL Server, we need to normalise the data to reduce redundancy and composite values. I will normalise the data with star schema. 

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('cleanData/main_data.csv')
data.head()

Unnamed: 0,EmployeeID,First_Name,Surname,StreetAddress,City,StateFull,ZipCode,Country,CountryFull,Age,...,DOB,level,Salary,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years,State_code
0,100001,Patrice,Moore,1427 Buckhannan Avenue,North Syracuse,New York,13212,US,United States,37.0,...,1986-01-05,CSuite,500000,1,JP_1000,2009,0,171.960659,14.133753,NY
1,100002,David,Rickards,4265 Graystone Lakes,Macon,Georgia,31206,US,United States,52.0,...,1971-07-13,Manager,70000,1,JP_1001,2009,0,171.960659,14.133753,GA
2,100003,Grace,Maldonado,1680 Hudson Street,Weehawken,New Jersey,7087,US,United States,34.0,...,1989-01-25,Individual Contributor,77000,0,JP_1027,2009,2013,49.3,4.052055,NJ
3,100004,Justin,Edgin,1262 Limer Street,Rome,Georgia,30165,US,United States,27.0,...,1996-05-01,CSuite,400000,0,JP_1041,2009,2013,52.566667,4.320548,GA
4,100005,Benjamin,Vargas,2431 Rainbow Road,Santa Ana,California,92705,US,United States,51.0,...,1972-05-05,Manager,51000,0,JP_1018,2009,2011,18.2,1.49589,CA


In [3]:
data.columns

Index(['EmployeeID', 'First_Name', 'Surname', 'StreetAddress', 'City',
       'StateFull', 'ZipCode', 'Country', 'CountryFull', 'Age', 'Office',
       'Start_Date', 'Termination_Date', 'Office_Type', 'Department',
       'Currency', 'Bonus_pct', 'Job_title', 'DOB', 'level', 'Salary',
       'Active Status', 'Job_Profile', 'start_year', 'termination_year',
       'tenure_months', 'tenure_years', 'State_code'],
      dtype='object')

## Finding dependency

### 1. Company/Office details

First, we are going to identify dependency with office.

In [4]:
company_details = pd.read_csv('cleanData/company_details.csv')
company_details.head()

Unnamed: 0,Office,Currency
0,NYC,USD
1,Boulder,USD
2,Oslo,NOK
3,SanJose,USD
4,London,GBP


In [5]:
data.head().loc[:,'Office':]

Unnamed: 0,Office,Start_Date,Termination_Date,Office_Type,Department,Currency,Bonus_pct,Job_title,DOB,level,Salary,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years,State_code
0,NYC,2009-05-04,,Corporate,Corporate,USD,1.0,CEO,1986-01-05,CSuite,500000,1,JP_1000,2009,0,171.960659,14.133753,NY
1,NYC,2009-05-04,,Corporate,Corporate,USD,0.2,HR Manager,1971-07-13,Manager,70000,1,JP_1001,2009,0,171.960659,14.133753,GA
2,NYC,2009-05-18,2013-06-05 00:00:00.000000,Corporate,Marketing,USD,0.15,Graphic Designer,1989-01-25,Individual Contributor,77000,0,JP_1027,2009,2013,49.3,4.052055,NJ
3,Boulder,2009-06-22,2013-10-16 00:00:00.000000,Corporate,Technology,USD,0.5,CTO,1996-05-01,CSuite,400000,0,JP_1041,2009,2013,52.566667,4.320548,GA
4,NYC,2009-07-13,2011-01-10 00:00:00.000000,Corporate,Customer Service,USD,0.15,Associate Account Manager,1972-05-05,Manager,51000,0,JP_1018,2009,2011,18.2,1.49589,CA


In [6]:
data.loc[(data['Office_Type']=='Technology'),'Office':'Job_Profile']

Unnamed: 0,Office,Start_Date,Termination_Date,Office_Type,Department,Currency,Bonus_pct,Job_title,DOB,level,Salary,Active Status,Job_Profile
48,Boulder,2010-01-25,2010-10-21 00:00:00.000000,Technology,Technology,USD,0.15,Software Engineer,1974-11-11,Individual Contributor,93000,0,JP_1043
49,Boulder,2010-02-01,2014-05-14 00:00:00.000000,Technology,Technology,USD,0.15,Software Engineer,1976-05-04,Individual Contributor,93000,0,JP_1043
53,Boulder,2010-02-15,2014-04-24 00:00:00.000000,Technology,Technology,USD,0.20,"Director, Engineering",1983-09-16,Director,100000,0,JP_1045
54,Oslo,2010-02-22,2019-08-19 00:00:00.000000,Technology,Technology,NOK,0.15,Software Engineer,1982-02-14,Individual Contributor,537000,0,JP_1043
55,Boulder,2010-03-08,2011-07-15 00:00:00.000000,Technology,Technology,USD,0.15,Software Engineer,1987-03-23,Individual Contributor,100000,0,JP_1043
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4957,Boulder,2021-11-22,,Technology,Technology,USD,0.15,Software Engineer,1977-08-05,Individual Contributor,77000,1,JP_1043
4961,NYC,2021-09-13,,Technology,Corporate,USD,0.50,Chief Human Resources Officer,1957-02-05,CSuite,266000,1,JP_1055
4962,SanFran,2021-06-21,,Technology,Sales,USD,0.15,Sales Team Lead,1974-12-22,Senior,95000,1,JP_1039
4964,Boulder,2021-01-25,,Technology,Technology,USD,0.15,Software Engineer,1975-07-05,Individual Contributor,77000,1,JP_1043


There seems to be a partial dependency on office and office type. We are going to normalise that.

In [7]:
d = data[['Office','Office_Type']]
company_details=company_details.merge(d, on='Office', how='left').drop_duplicates()
company_details

Unnamed: 0,Office,Currency,Office_Type
0,NYC,USD,Corporate
1664,NYC,USD,Technology
1796,Boulder,USD,Corporate
1797,Boulder,USD,Technology
3429,Oslo,NOK,Technology
3579,SanJose,USD,Corporate
4434,SanJose,USD,Technology
4496,London,GBP,Corporate
4746,Tokyo,JPY,Corporate
4796,HongKong,HKD,Technology


Dropping Unused Columns

In [8]:
company_details.drop(['Currency'], axis=1, inplace=True)

Creating ID for office table

In [9]:
company_details['Office_id']=[i for i in range(company_details.shape[0])]
company_details

Unnamed: 0,Office,Office_Type,Office_id
0,NYC,Corporate,0
1664,NYC,Technology,1
1796,Boulder,Corporate,2
1797,Boulder,Technology,3
3429,Oslo,Technology,4
3579,SanJose,Corporate,5
4434,SanJose,Technology,6
4496,London,Corporate,7
4746,Tokyo,Corporate,8
4796,HongKong,Technology,9


Adding `office_id` into main data

In [10]:
data = data.merge(company_details, on=['Office','Office_Type'], how='left')
data.head(10)

Unnamed: 0,EmployeeID,First_Name,Surname,StreetAddress,City,StateFull,ZipCode,Country,CountryFull,Age,...,level,Salary,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years,State_code,Office_id
0,100001,Patrice,Moore,1427 Buckhannan Avenue,North Syracuse,New York,13212,US,United States,37.0,...,CSuite,500000,1,JP_1000,2009,0,171.960659,14.133753,NY,0
1,100002,David,Rickards,4265 Graystone Lakes,Macon,Georgia,31206,US,United States,52.0,...,Manager,70000,1,JP_1001,2009,0,171.960659,14.133753,GA,0
2,100003,Grace,Maldonado,1680 Hudson Street,Weehawken,New Jersey,7087,US,United States,34.0,...,Individual Contributor,77000,0,JP_1027,2009,2013,49.3,4.052055,NJ,0
3,100004,Justin,Edgin,1262 Limer Street,Rome,Georgia,30165,US,United States,27.0,...,CSuite,400000,0,JP_1041,2009,2013,52.566667,4.320548,GA,2
4,100005,Benjamin,Vargas,2431 Rainbow Road,Santa Ana,California,92705,US,United States,51.0,...,Manager,51000,0,JP_1018,2009,2011,18.2,1.49589,CA,0
5,100006,Nelson,Grillo,3645 Coolidge Street,North Custer,Montana,59024,US,United States,30.0,...,Individual Contributor,76000,0,JP_1034,2009,2021,147.866667,12.153425,MT,0
6,100007,Kevin,Rainey,977 Black Oak Hollow Road,Santa Clara,California,95054,US,United States,33.0,...,Individual Contributor,56000,0,JP_1020,2009,2011,19.033333,1.564384,CA,0
7,100008,Melanie,Hurst,2751 Holden Street,San Diego,California,92103,US,United States,40.0,...,Individual Contributor,72000,0,JP_1034,2009,2012,32.266667,2.652055,CA,0
8,100009,Greg,Boon,4791 Loving Acres Road,Grapevine,Texas,76051,US,United States,31.0,...,Director,74000,1,JP_1035,2009,0,169.160659,13.903616,TX,0
9,100010,Frank,Stockdale,1413 Roy Alley,Centennial,Colorado,80111,US,United States,34.0,...,Individual Contributor,52000,0,JP_1020,2009,2014,58.366667,4.79726,CO,0


Dropping dependent columns

In [11]:
data.drop(['Office','Office_Type','Currency'], axis=1, inplace=True)
data.columns

Index(['EmployeeID', 'First_Name', 'Surname', 'StreetAddress', 'City',
       'StateFull', 'ZipCode', 'Country', 'CountryFull', 'Age', 'Start_Date',
       'Termination_Date', 'Department', 'Bonus_pct', 'Job_title', 'DOB',
       'level', 'Salary', 'Active Status', 'Job_Profile', 'start_year',
       'termination_year', 'tenure_months', 'tenure_years', 'State_code',
       'Office_id'],
      dtype='object')

In [12]:
company_details.reset_index(drop=True, inplace=True)
company_details

Unnamed: 0,Office,Office_Type,Office_id
0,NYC,Corporate,0
1,NYC,Technology,1
2,Boulder,Corporate,2
3,Boulder,Technology,3
4,Oslo,Technology,4
5,SanJose,Corporate,5
6,SanJose,Technology,6
7,London,Corporate,7
8,Tokyo,Corporate,8
9,HongKong,Technology,9


### 2. Job Details table

In [13]:
job_details = pd.read_csv('cleanData/job_details.csv')
job_details.head()

Unnamed: 0,Department,Job_title,Job_Profile,Salary,level,Bonus_pct
0,Corporate,CEO,JP_1000,500000.0,CSuite,1.0
1,Corporate,HR Manager,JP_1001,100000.0,Manager,0.2
2,Corporate,AR Specialist,JP_1002,65000.0,Individual Contributor,0.15
3,Corporate,AP Specialist,JP_1003,65000.0,Individual Contributor,0.15
4,Corporate,FP&A Analyst,JP_1004,70000.0,Individual Contributor,0.15


In [14]:
data.head().iloc[:,:13]

Unnamed: 0,EmployeeID,First_Name,Surname,StreetAddress,City,StateFull,ZipCode,Country,CountryFull,Age,Start_Date,Termination_Date,Department
0,100001,Patrice,Moore,1427 Buckhannan Avenue,North Syracuse,New York,13212,US,United States,37.0,2009-05-04,,Corporate
1,100002,David,Rickards,4265 Graystone Lakes,Macon,Georgia,31206,US,United States,52.0,2009-05-04,,Corporate
2,100003,Grace,Maldonado,1680 Hudson Street,Weehawken,New Jersey,7087,US,United States,34.0,2009-05-18,2013-06-05 00:00:00.000000,Marketing
3,100004,Justin,Edgin,1262 Limer Street,Rome,Georgia,30165,US,United States,27.0,2009-06-22,2013-10-16 00:00:00.000000,Technology
4,100005,Benjamin,Vargas,2431 Rainbow Road,Santa Ana,California,92705,US,United States,51.0,2009-07-13,2011-01-10 00:00:00.000000,Customer Service


In [15]:
data.head().iloc[:,13:]

Unnamed: 0,Bonus_pct,Job_title,DOB,level,Salary,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years,State_code,Office_id
0,1.0,CEO,1986-01-05,CSuite,500000,1,JP_1000,2009,0,171.960659,14.133753,NY,0
1,0.2,HR Manager,1971-07-13,Manager,70000,1,JP_1001,2009,0,171.960659,14.133753,GA,0
2,0.15,Graphic Designer,1989-01-25,Individual Contributor,77000,0,JP_1027,2009,2013,49.3,4.052055,NJ,0
3,0.5,CTO,1996-05-01,CSuite,400000,0,JP_1041,2009,2013,52.566667,4.320548,GA,2
4,0.15,Associate Account Manager,1972-05-05,Manager,51000,0,JP_1018,2009,2011,18.2,1.49589,CA,0


Identifier for job_details is `Job_Profile`, so it is already normalised. Dropping columns that existed in job_details table in main data

In [16]:
col_to_drop = list(job_details.columns)
col_to_drop.remove('Job_Profile')
col_to_drop

['Department', 'Job_title', 'Salary', 'level', 'Bonus_pct']

In [17]:
data.drop(col_to_drop, axis=1, inplace=True)
data.head()

Unnamed: 0,EmployeeID,First_Name,Surname,StreetAddress,City,StateFull,ZipCode,Country,CountryFull,Age,...,Termination_Date,DOB,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years,State_code,Office_id
0,100001,Patrice,Moore,1427 Buckhannan Avenue,North Syracuse,New York,13212,US,United States,37.0,...,,1986-01-05,1,JP_1000,2009,0,171.960659,14.133753,NY,0
1,100002,David,Rickards,4265 Graystone Lakes,Macon,Georgia,31206,US,United States,52.0,...,,1971-07-13,1,JP_1001,2009,0,171.960659,14.133753,GA,0
2,100003,Grace,Maldonado,1680 Hudson Street,Weehawken,New Jersey,7087,US,United States,34.0,...,2013-06-05 00:00:00.000000,1989-01-25,0,JP_1027,2009,2013,49.3,4.052055,NJ,0
3,100004,Justin,Edgin,1262 Limer Street,Rome,Georgia,30165,US,United States,27.0,...,2013-10-16 00:00:00.000000,1996-05-01,0,JP_1041,2009,2013,52.566667,4.320548,GA,2
4,100005,Benjamin,Vargas,2431 Rainbow Road,Santa Ana,California,92705,US,United States,51.0,...,2011-01-10 00:00:00.000000,1972-05-05,0,JP_1018,2009,2011,18.2,1.49589,CA,0


### 3. Normalising state and country

Normalising into state table

In [18]:
statedf = data[['StateFull','State_code']].drop_duplicates().reset_index(drop=True)
print(statedf.shape)
statedf

(52, 2)


Unnamed: 0,StateFull,State_code
0,New York,NY
1,Georgia,GA
2,New Jersey,NJ
3,California,CA
4,Montana,MT
5,Texas,TX
6,Colorado,CO
7,Oklahoma,OK
8,Florida,FL
9,North Dakota,ND


In [19]:
statedf.dropna()

Unnamed: 0,StateFull,State_code
0,New York,NY
1,Georgia,GA
2,New Jersey,NJ
3,California,CA
4,Montana,MT
5,Texas,TX
6,Colorado,CO
7,Oklahoma,OK
8,Florida,FL
9,North Dakota,ND


In [20]:
# Drop StateFull in main data
data.drop('StateFull', axis=1, inplace=True)
data.head()

Unnamed: 0,EmployeeID,First_Name,Surname,StreetAddress,City,ZipCode,Country,CountryFull,Age,Start_Date,Termination_Date,DOB,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years,State_code,Office_id
0,100001,Patrice,Moore,1427 Buckhannan Avenue,North Syracuse,13212,US,United States,37.0,2009-05-04,,1986-01-05,1,JP_1000,2009,0,171.960659,14.133753,NY,0
1,100002,David,Rickards,4265 Graystone Lakes,Macon,31206,US,United States,52.0,2009-05-04,,1971-07-13,1,JP_1001,2009,0,171.960659,14.133753,GA,0
2,100003,Grace,Maldonado,1680 Hudson Street,Weehawken,7087,US,United States,34.0,2009-05-18,2013-06-05 00:00:00.000000,1989-01-25,0,JP_1027,2009,2013,49.3,4.052055,NJ,0
3,100004,Justin,Edgin,1262 Limer Street,Rome,30165,US,United States,27.0,2009-06-22,2013-10-16 00:00:00.000000,1996-05-01,0,JP_1041,2009,2013,52.566667,4.320548,GA,2
4,100005,Benjamin,Vargas,2431 Rainbow Road,Santa Ana,92705,US,United States,51.0,2009-07-13,2011-01-10 00:00:00.000000,1972-05-05,0,JP_1018,2009,2011,18.2,1.49589,CA,0


Normalising into country table

In [21]:
set(data['Country']) # Check how many countries we have

{'GB', 'HK', 'JP', 'NO', 'US'}

In [22]:
set(data['CountryFull'])

{'Hong Kong', 'Japan', 'Norway', 'United Kingdom', 'United States'}

In [23]:
country_df = data[['Country','CountryFull']].drop_duplicates().reset_index(drop=True)
country_df

Unnamed: 0,Country,CountryFull
0,US,United States
1,NO,Norway
2,GB,United Kingdom
3,JP,Japan
4,HK,Hong Kong


In [24]:
# Dropping CountryFull in main data
data.drop('CountryFull', axis=1, inplace=True)
data.head()

Unnamed: 0,EmployeeID,First_Name,Surname,StreetAddress,City,ZipCode,Country,Age,Start_Date,Termination_Date,DOB,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years,State_code,Office_id
0,100001,Patrice,Moore,1427 Buckhannan Avenue,North Syracuse,13212,US,37.0,2009-05-04,,1986-01-05,1,JP_1000,2009,0,171.960659,14.133753,NY,0
1,100002,David,Rickards,4265 Graystone Lakes,Macon,31206,US,52.0,2009-05-04,,1971-07-13,1,JP_1001,2009,0,171.960659,14.133753,GA,0
2,100003,Grace,Maldonado,1680 Hudson Street,Weehawken,7087,US,34.0,2009-05-18,2013-06-05 00:00:00.000000,1989-01-25,0,JP_1027,2009,2013,49.3,4.052055,NJ,0
3,100004,Justin,Edgin,1262 Limer Street,Rome,30165,US,27.0,2009-06-22,2013-10-16 00:00:00.000000,1996-05-01,0,JP_1041,2009,2013,52.566667,4.320548,GA,2
4,100005,Benjamin,Vargas,2431 Rainbow Road,Santa Ana,92705,US,51.0,2009-07-13,2011-01-10 00:00:00.000000,1972-05-05,0,JP_1018,2009,2011,18.2,1.49589,CA,0


In [25]:
data.rename(columns={'Country':'Country_code'}, inplace=True)
country_df.rename(columns={'Country':'Country_code'}, inplace=True)

### 4. Employee Demographics table

In [26]:
emp_demo = pd.read_csv('cleanData/employee_details.csv')
emp_demo.head()

Unnamed: 0,EmployeeID,Gender,Gender Identity,Race/Ethnicity,Veteran,Disability,Education,Sexual Orientation
0,100001,female,female,White,0,0,Undergraduate,Heterosexual
1,100002,male,male,White,0,1,Undergraduate,Heterosexual
2,100003,female,female,Asian,0,0,Undergraduate,Heterosexual
3,100004,male,male,White,0,0,Undergraduate,Heterosexual
4,100005,male,male,Hispanic or Latino,0,0,Undergraduate,Missing


We are going to create employee information table instead of appending into emp_details table. I will add `Age` and `DOB` into emp_details. The info table will contain the street address, city, zipcode, country_code, start/termination date, tenure, and active status. Meanwhile the above table will be the employee demographics table.

In [27]:
extra_data = data[['EmployeeID','Age','DOB']]
extra_data.head()

Unnamed: 0,EmployeeID,Age,DOB
0,100001,37.0,1986-01-05
1,100002,52.0,1971-07-13
2,100003,34.0,1989-01-25
3,100004,27.0,1996-05-01
4,100005,51.0,1972-05-05


In [28]:
emp_demo=emp_demo.merge(extra_data, on='EmployeeID', how='left')
emp_demo.head()

Unnamed: 0,EmployeeID,Gender,Gender Identity,Race/Ethnicity,Veteran,Disability,Education,Sexual Orientation,Age,DOB
0,100001,female,female,White,0,0,Undergraduate,Heterosexual,37.0,1986-01-05
1,100002,male,male,White,0,1,Undergraduate,Heterosexual,52.0,1971-07-13
2,100003,female,female,Asian,0,0,Undergraduate,Heterosexual,34.0,1989-01-25
3,100004,male,male,White,0,0,Undergraduate,Heterosexual,27.0,1996-05-01
4,100005,male,male,Hispanic or Latino,0,0,Undergraduate,Missing,51.0,1972-05-05


In [29]:
# Dropping extra columns
data.drop(['Age','DOB'], axis=1, inplace=True)

### Employee Information table

In [30]:
data.columns

Index(['EmployeeID', 'First_Name', 'Surname', 'StreetAddress', 'City',
       'ZipCode', 'Country_code', 'Start_Date', 'Termination_Date',
       'Active Status', 'Job_Profile', 'start_year', 'termination_year',
       'tenure_months', 'tenure_years', 'State_code', 'Office_id'],
      dtype='object')

In [31]:
col_to_ext = ['EmployeeID', 'StreetAddress', 'City',
       'ZipCode', 'State_code', 'Country_code', 'Start_Date', 'Termination_Date',
       'Active Status', 'start_year', 'termination_year',
       'tenure_months', 'tenure_years']
emp_info = data[col_to_ext]
emp_info

Unnamed: 0,EmployeeID,StreetAddress,City,ZipCode,State_code,Country_code,Start_Date,Termination_Date,Active Status,start_year,termination_year,tenure_months,tenure_years
0,100001,1427 Buckhannan Avenue,North Syracuse,13212,NY,US,2009-05-04,,1,2009,0,171.960659,14.133753
1,100002,4265 Graystone Lakes,Macon,31206,GA,US,2009-05-04,,1,2009,0,171.960659,14.133753
2,100003,1680 Hudson Street,Weehawken,07087,NJ,US,2009-05-18,2013-06-05 00:00:00.000000,0,2009,2013,49.300000,4.052055
3,100004,1262 Limer Street,Rome,30165,GA,US,2009-06-22,2013-10-16 00:00:00.000000,0,2009,2013,52.566667,4.320548
4,100005,2431 Rainbow Road,Santa Ana,92705,CA,US,2009-07-13,2011-01-10 00:00:00.000000,0,2009,2011,18.200000,1.495890
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4963,104964,1381 Neville Street,Evansville,47711,IN,US,2021-03-08,2022-11-27 00:00:00.000000,0,2021,2022,20.966667,1.723288
4964,104965,3336 Fleming Way,Richmond,23233,VA,US,2021-01-25,,1,2021,0,29.160659,2.396766
4965,104966,226 White River Way,Salt Lake City,84111,UT,US,2021-08-23,,1,2021,0,22.160659,1.821424
4966,104967,3318 Maxwell Farm Road,Chico,95926,CA,US,2021-02-01,,1,2021,0,28.927325,2.377588


In [32]:
# Dropping extra columns in main table
col_to_ext.remove('EmployeeID')
data.drop(col_to_ext, axis=1, inplace=True)

In [33]:
data.head()

Unnamed: 0,EmployeeID,First_Name,Surname,Job_Profile,Office_id
0,100001,Patrice,Moore,JP_1000,0
1,100002,David,Rickards,JP_1001,0
2,100003,Grace,Maldonado,JP_1027,0
3,100004,Justin,Edgin,JP_1041,2
4,100005,Benjamin,Vargas,JP_1018,0


So, in total we have this tables:
- `office`  
- `job_details`
- `state_details`
- `country_details`
- `emp_demo`
- `emp_info`
- `main_details`

Here is the schema.

![erd-3.png](attachment:erd-3.png)