In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import math

# default='warn'
pd.options.mode.chained_assignment = None  
pd.set_option('display.max_columns', None)

%matplotlib inline

In [2]:
# import data
df = pd.read_csv('data.csv',delimiter=',')
df.head()

Unnamed: 0,Surname,Name,Age,Gender,Country,Ethnicity,Start_date,Department,Position,Salary
0,Sweetwater,Alex,51,Male,United States,White,15-08-2011,Software Engineering,Software Engineering Manager,"$56,160.00"
1,Carabbio,Judith,30,Female,United States,White,11-11-2013,Software Engineering,Software Engineer,"$1,16,480.00"
2,Saada,Adell,31,Female,United States,White,05-11-2012,Software Engineering,Software Engineer,"$1,02,440.00"
3,Szabo,Andrew,34,Male,United States,White,07-07-2014,Software Engineering,Software Engineer,"$99,840.00"
4,Andreola,Colby,38,Female,United States,White,10-11-2014,Software Engineering,Software Engineer,"$99,008.00"


In [6]:
# check duplicates
df.duplicated().all()

False

In [7]:
# check null values 
df.isna().all()

Surname       False
Name          False
Age           False
Gender        False
Country       False
Ethnicity     False
Start_date    False
Department    False
Position      False
Salary        False
dtype: bool

In [19]:
# check data type
df.dtypes

Surname               object
Name                  object
Age                    int64
Gender                object
Country               object
Ethnicity             object
Start_date    datetime64[ns]
Department            object
Position              object
Salary               float64
dtype: object

In [17]:
# rectify data types 
df['Start_date'] = pd.to_datetime(df['Start_date'])
df['Salary'] = df['Salary'].str.replace('$','')
df['Salary'] = df['Salary'].str.replace(',','')
df['Salary'] = df['Salary'].astype(float)

In [12]:
#check data
df['Start_date']

0     2011-08-15
1     2013-11-11
2     2012-05-11
3     2014-07-07
4     2014-10-11
         ...    
169   2014-09-29
170   2012-05-14
171   2015-03-30
172   2014-10-11
173   2015-02-16
Name: Start_date, Length: 174, dtype: datetime64[ns]

In [18]:
df['Salary']

0       56160.0
1      116480.0
2      102440.0
3       99840.0
4       99008.0
         ...   
169     31200.0
170     29120.0
171    114816.0
172     88920.0
173     72696.0
Name: Salary, Length: 174, dtype: float64

In [21]:
# check gender values
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [22]:
# create data for male and female -- gender factor
df_female = df[df['Gender']=='Female']
df_male = df[df['Gender']=='Male']

In [23]:
# check Ethnicity values
df['Ethnicity'].unique()

array(['White', 'Two or more races', 'Hispanic',
       'Black or African American', 'Asian'], dtype=object)

In [24]:
# create data for white and non whites -- race factor
df_white = df[df['Ethnicity']=='White']
df_non_white = df[df['Ethnicity']!='White']

In [25]:
# create data for employees above 35 years of age. -- racial discrimination
df_over35 = df[df['Age']> 35]

In [52]:
# top 50 employees - highest tenure / longest in the organisation - gender and race factor
df_top50_tenure = df.sort_values(by='Start_date',ascending=True)[:50]

In [32]:
# now we should conduct he test for each department
df['Department'].unique()

array(['Software Engineering', 'Sales', 'Production', 'IT/IS',
       'Executive Office', 'Admin Offices'], dtype=object)

In [31]:
# we see that there are issues with software engineering values
df['Department'] = df['Department'].str.replace('Software Engineering     ','Software Engineering')
df['Department'] = df['Department'].str.replace('Production       ','Production')

In [33]:
# software Engineering
df_female_sw_engg = df[(df['Gender']=='Female') & (df['Department']=='Software Engineering')]
df_male_sw_engg = df[(df['Gender']=='Male') & (df['Department']=='Software Engineering')]

In [34]:
# sales
df_female_sales = df[(df['Gender']=='Female') & (df['Department']=='Sales')]
df_male_sales = df[(df['Gender']=='Male') & (df['Department']=='Sales')]

In [35]:
# production
df_female_prod = df[(df['Gender']=='Female') & (df['Department']=='Production')]
df_male_prod = df[(df['Gender']=='Male') & (df['Department']=='Production')]

In [36]:
# IT/IS
df_female_sales = df[(df['Gender']=='Female') & (df['Department']=='IT/IS')]
df_male_sales = df[(df['Gender']=='Male') & (df['Department']=='IT/IS')]

In [37]:
# executive office
df_female_exec_office = df[(df['Gender']=='Female') & (df['Department']=='Executive Office')]
df_male_exec_office = df[(df['Gender']=='Male') & (df['Department']=='Executive Office')]

In [38]:
# admin offices
df_female_admin_office = df[(df['Gender']=='Female') & (df['Department']=='Admin Offices')]
df_male_admin_office = df[(df['Gender']=='Male') & (df['Department']=='Admin Offices')]

### Let's begin our Hypothesis Testing.

__Questions and Hypothesis.__


__Q1.__ Is there any gender wage gap in the organisation on the basis of gender ?

__Q2.__ Is the company discriminating on the basis of race/ethnicity (white vs else) ?

__Q3.__ Is there racial discrimination for the employees above the age of 35 ?

__Q4.__ Are the top 50 longest working employees facing race or gender discrimination ?

__Q5.__ We should check for each department is there gender discrimination ?

__Q6.__ We should check for each department is there racial discrimination ?