# Normalising data to store them in MS SQL Server

For efficient storage in MS SQL Server, we need to normalise the data to reduce redundancy and remove composite values. I will normalise the data into a star schema.

In [1]:
import pandas as pd

In [2]:
# Read the main dataset from the cleaned-data folder and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='cleanData/main_data.csv')
data.head(n=5)

Unnamed: 0,EmployeeID,First_Name,Surname,StreetAddress,City,State,StateFull,ZipCode,Country,CountryFull,...,Job_title,DOB,level,Salary,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years
0,100001,Patrice,Moore,1427 Buckhannan Avenue,North Syracuse,NY,New York,13212,US,United States,...,CEO,1/5/1986,CSuite,500000,1,JP_1000,2009,0,171.932094,14.131405
1,100002,David,Rickards,4265 Graystone Lakes,Macon,GA,Georgia,31206,US,United States,...,HR Manager,7/13/1971,Manager,70000,1,JP_1001,2009,0,171.932094,14.131405
2,100003,Grace,Maldonado,1680 Hudson Street,Weehawken,NJ,New Jersey,7087,US,United States,...,Graphic Designer,1/25/1989,Individual Contributor,77000,0,JP_1022,2009,2013,49.3,4.052055
3,100004,Justin,Edgin,1262 Limer Street,Rome,GA,Georgia,30165,US,United States,...,CTO,5/1/1996,CSuite,400000,0,JP_1036,2009,2013,52.566667,4.320548
4,100005,Benjamin,Vargas,2431 Rainbow Road,Santa Ana,CA,California,92705,US,United States,...,Associate Account Manager,5/5/1972,Manager,51000,0,JP_1015,2009,2011,18.2,1.49589


In [3]:
# Display the column labels of the main data frame.
data.columns

Index(['EmployeeID', 'First_Name', 'Surname', 'StreetAddress', 'City', 'State',
       'StateFull', 'ZipCode', 'Country', 'CountryFull', 'Age', 'Office',
       'Start_Date', 'Termination_Date', 'Office_Type', 'Department',
       'Currency', 'Bonus_pct', 'Job_title', 'DOB', 'level', 'Salary',
       'Active Status', 'Job_Profile', 'start_year', 'termination_year',
       'tenure_months', 'tenure_years'],
      dtype='object')

## 1NF

### Finding dependency

#### 1. Company/Office details

First, we are going to identify which columns depend on the office.

In [4]:
# Read the per-office details file and preview the first five rows.
company_details = pd.read_csv(filepath_or_buffer='cleanData/company_details.csv')
company_details.head(n=5)

Unnamed: 0,Office,COL Amount,Currency
0,NYC,100,USD
1,Boulder,70,USD
2,Oslo,70,NOK
3,SanJose,90,USD
4,London,90,GBP


In [5]:
# Preview the first rows, restricted to the columns from 'Office' to the end.
data.loc[:, 'Office':].head()

Unnamed: 0,Office,Start_Date,Termination_Date,Office_Type,Department,Currency,Bonus_pct,Job_title,DOB,level,Salary,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years
0,NYC,2009-05-04,,Corporate,Corporate,USD,1.0,CEO,1/5/1986,CSuite,500000,1,JP_1000,2009,0,171.932094,14.131405
1,NYC,2009-05-04,,Corporate,Corporate,USD,0.2,HR Manager,7/13/1971,Manager,70000,1,JP_1001,2009,0,171.932094,14.131405
2,NYC,2009-05-18,2013-06-05 00:00:00.000000,Corporate,Marketing,USD,0.15,Graphic Designer,1/25/1989,Individual Contributor,77000,0,JP_1022,2009,2013,49.3,4.052055
3,Boulder,2009-06-22,2013-10-16 00:00:00.000000,Corporate,Technology,USD,0.5,CTO,5/1/1996,CSuite,400000,0,JP_1036,2009,2013,52.566667,4.320548
4,NYC,2009-07-13,2011-01-10 00:00:00.000000,Corporate,Customer Service,USD,0.15,Associate Account Manager,5/5/1972,Manager,51000,0,JP_1015,2009,2011,18.2,1.49589


In [6]:
# Rows whose Office_Type is 'Technology', limited to the Office..Job_Profile columns.
data.loc[data['Office_Type'].eq('Technology'), 'Office':'Job_Profile']

Unnamed: 0,Office,Start_Date,Termination_Date,Office_Type,Department,Currency,Bonus_pct,Job_title,DOB,level,Salary,Active Status,Job_Profile
48,Boulder,2010-01-25,2010-10-21 00:00:00.000000,Technology,Technology,USD,0.15,Software Engineer,11/11/1974,Individual Contributor,93000,0,JP_1038
49,Boulder,2010-02-01,2014-05-14 00:00:00.000000,Technology,Technology,USD,0.15,Software Engineer,5/4/1976,Individual Contributor,93000,0,JP_1038
53,Boulder,2010-02-15,2014-04-24 00:00:00.000000,Technology,Technology,USD,0.20,"Director, Engineering",9/16/1983,Director,100000,0,JP_1039
54,Oslo,2010-02-22,2019-08-19 00:00:00.000000,Technology,Technology,NOK,0.15,Software Engineer,2/14/1982,Individual Contributor,537000,0,JP_1038
55,Boulder,2010-03-08,2011-07-15 00:00:00.000000,Technology,Technology,USD,0.15,Software Engineer,3/23/1987,Individual Contributor,100000,0,JP_1038
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4957,Boulder,2021-11-22,,Technology,Technology,USD,0.15,Software Engineer,8/5/1977,Individual Contributor,77000,1,JP_1038
4961,NYC,2021-09-13,,Technology,Corporate,USD,0.50,Chief Human Resources Officer,2/5/1957,CSuite,266000,1,JP_1049
4962,SanFran,2021-06-21,,Technology,Sales,USD,0.15,Sales Team Lead,12/22/1974,Senior,95000,1,JP_1034
4964,Boulder,2021-01-25,,Technology,Technology,USD,0.15,Software Engineer,7/5/1975,Individual Contributor,77000,1,JP_1038


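There seems to be only a partial dependency between office and office type: the same office type occurs at several offices, so neither column fully determines the other. We are going to normalise this by moving each distinct (office, office type) combination into its own office table.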

In [7]:
# Pair every office in company_details with each office type it has in the main data:
# left-join the two columns on 'Office', then collapse the repeated rows.
d = data.loc[:, ['Office', 'Office_Type']]
company_details = pd.merge(company_details, d, how='left', on='Office')
company_details = company_details.drop_duplicates()
company_details

Unnamed: 0,Office,COL Amount,Currency,Office_Type
0,NYC,100,USD,Corporate
1664,NYC,100,USD,Technology
1796,Boulder,70,USD,Corporate
1797,Boulder,70,USD,Technology
3429,Oslo,70,NOK,Technology
3579,SanJose,90,USD,Corporate
4434,SanJose,90,USD,Technology
4496,London,90,GBP,Corporate
4746,Tokyo,85,JPY,Corporate
4796,HongKong,85,HKD,Technology


Dropping Unused Columns

In [8]:
# Keep only the office columns; 'COL Amount' and 'Currency' are removed from this table.
company_details = company_details.drop(columns=['COL Amount', 'Currency'])

Creating an ID for the office table

In [9]:
# Number the rows 0..n-1 in their current order to serve as the office table's ID.
company_details['Office_id'] = list(range(len(company_details)))
company_details

Unnamed: 0,Office,Office_Type,Office_id
0,NYC,Corporate,0
1664,NYC,Technology,1
1796,Boulder,Corporate,2
1797,Boulder,Technology,3
3429,Oslo,Technology,4
3579,SanJose,Corporate,5
4434,SanJose,Technology,6
4496,London,Corporate,7
4746,Tokyo,Corporate,8
4796,HongKong,Technology,9


Adding `Office_id` to the main data

In [10]:
# Attach the office table's columns to every employee row by matching on the
# (Office, Office_Type) pair.
# validate='many_to_one' makes pandas raise a MergeError if company_details ever holds
# a duplicated (Office, Office_Type) pair, which would otherwise silently duplicate
# employee rows in the result.
# NOTE(review): because how='left', any employee row whose pair is absent from
# company_details gets a missing Office_id. The data contains a 'SanFran' office while
# the office table printed above does not appear to — confirm no Office_id is null.
data = data.merge(
    company_details,
    on=['Office', 'Office_Type'],
    how='left',
    validate='many_to_one',
)
data.head(10)

Unnamed: 0,EmployeeID,First_Name,Surname,StreetAddress,City,State,StateFull,ZipCode,Country,CountryFull,...,DOB,level,Salary,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years,Office_id
0,100001,Patrice,Moore,1427 Buckhannan Avenue,North Syracuse,NY,New York,13212,US,United States,...,1/5/1986,CSuite,500000,1,JP_1000,2009,0,171.932094,14.131405,0
1,100002,David,Rickards,4265 Graystone Lakes,Macon,GA,Georgia,31206,US,United States,...,7/13/1971,Manager,70000,1,JP_1001,2009,0,171.932094,14.131405,0
2,100003,Grace,Maldonado,1680 Hudson Street,Weehawken,NJ,New Jersey,7087,US,United States,...,1/25/1989,Individual Contributor,77000,0,JP_1022,2009,2013,49.3,4.052055,0
3,100004,Justin,Edgin,1262 Limer Street,Rome,GA,Georgia,30165,US,United States,...,5/1/1996,CSuite,400000,0,JP_1036,2009,2013,52.566667,4.320548,2
4,100005,Benjamin,Vargas,2431 Rainbow Road,Santa Ana,CA,California,92705,US,United States,...,5/5/1972,Manager,51000,0,JP_1015,2009,2011,18.2,1.49589,0
5,100006,Nelson,Grillo,3645 Coolidge Street,North Custer,MT,Montana,59024,US,United States,...,6/21/1993,Individual Contributor,76000,0,JP_1029,2009,2021,147.866667,12.153425,0
6,100007,Kevin,Rainey,977 Black Oak Hollow Road,Santa Clara,CA,California,95054,US,United States,...,5/13/1990,Individual Contributor,56000,0,JP_1016,2009,2011,19.033333,1.564384,0
7,100008,Melanie,Hurst,2751 Holden Street,San Diego,CA,California,92103,US,United States,...,1/23/1983,Individual Contributor,72000,0,JP_1029,2009,2012,32.266667,2.652055,0
8,100009,Greg,Boon,4791 Loving Acres Road,Grapevine,TX,Texas,76051,US,United States,...,1/4/1992,Director,74000,1,JP_1030,2009,0,169.132094,13.901268,0
9,100010,Frank,Stockdale,1413 Roy Alley,Centennial,CO,Colorado,80111,US,United States,...,10/21/1989,Individual Contributor,52000,0,JP_1016,2009,2014,58.366667,4.79726,0


Dropping dependent columns

In [11]:
# Remove 'Office', 'Office_Type' and 'Currency' from the main data, then list what is left.
data = data.drop(columns=['Office', 'Office_Type', 'Currency'])
data.columns

Index(['EmployeeID', 'First_Name', 'Surname', 'StreetAddress', 'City', 'State',
       'StateFull', 'ZipCode', 'Country', 'CountryFull', 'Age', 'Start_Date',
       'Termination_Date', 'Department', 'Bonus_pct', 'Job_title', 'DOB',
       'level', 'Salary', 'Active Status', 'Job_Profile', 'start_year',
       'termination_year', 'tenure_months', 'tenure_years', 'Office_id'],
      dtype='object')

#### 4. Job Details table

In [12]:
# Read the job details file and preview the first five rows.
job_details = pd.read_csv(filepath_or_buffer='cleanData/job_details.csv')
job_details.head(n=5)

Unnamed: 0,Department,Job_title,Job_Profile,Compensation,Level,Bonus %
0,Corporate,CEO,JP_1000,500000.0,CSuite,1.0
1,Corporate,HR Manager,JP_1001,100000.0,Manager,0.2
2,Corporate,AR Specialist,JP_1002,65000.0,Individual Contributor,0.15
3,Corporate,AP Specialist,JP_1003,65000.0,Individual Contributor,0.15
4,Corporate,FP&A Analyst,JP_1004,70000.0,Individual Contributor,0.15


In [13]:
# Preview the first rows of the first 13 columns.
data.iloc[:, :13].head()

Unnamed: 0,EmployeeID,First_Name,Surname,StreetAddress,City,State,StateFull,ZipCode,Country,CountryFull,Age,Start_Date,Termination_Date
0,100001,Patrice,Moore,1427 Buckhannan Avenue,North Syracuse,NY,New York,13212,US,United States,35,2009-05-04,
1,100002,David,Rickards,4265 Graystone Lakes,Macon,GA,Georgia,31206,US,United States,49,2009-05-04,
2,100003,Grace,Maldonado,1680 Hudson Street,Weehawken,NJ,New Jersey,7087,US,United States,32,2009-05-18,2013-06-05 00:00:00.000000
3,100004,Justin,Edgin,1262 Limer Street,Rome,GA,Georgia,30165,US,United States,25,2009-06-22,2013-10-16 00:00:00.000000
4,100005,Benjamin,Vargas,2431 Rainbow Road,Santa Ana,CA,California,92705,US,United States,49,2009-07-13,2011-01-10 00:00:00.000000


In [14]:
# Preview the first rows of the columns from position 13 onward.
data.iloc[:, 13:].head()

Unnamed: 0,Department,Bonus_pct,Job_title,DOB,level,Salary,Active Status,Job_Profile,start_year,termination_year,tenure_months,tenure_years,Office_id
0,Corporate,1.0,CEO,1/5/1986,CSuite,500000,1,JP_1000,2009,0,171.932094,14.131405,0
1,Corporate,0.2,HR Manager,7/13/1971,Manager,70000,1,JP_1001,2009,0,171.932094,14.131405,0
2,Marketing,0.15,Graphic Designer,1/25/1989,Individual Contributor,77000,0,JP_1022,2009,2013,49.3,4.052055,0
3,Technology,0.5,CTO,5/1/1996,CSuite,400000,0,JP_1036,2009,2013,52.566667,4.320548,2
4,Customer Service,0.15,Associate Account Manager,5/5/1972,Manager,51000,0,JP_1015,2009,2011,18.2,1.49589,0


The identifier for `job_details` is `Job_Profile`, so that table is already normalised. We now drop from the main data the columns that already exist in the `job_details` table.