In [69]:
# import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import math
from scipy.stats import sem
from scipy.stats import t

# default='warn'
pd.options.mode.chained_assignment = None  
pd.set_option('display.max_columns', None)

%matplotlib inline

In [2]:
# import data
df = pd.read_csv('data.csv',delimiter=',')
df.head()

Unnamed: 0,Surname,Name,Age,Gender,Country,Ethnicity,Start_date,Department,Position,Salary
0,Sweetwater,Alex,51,Male,United States,White,15-08-2011,Software Engineering,Software Engineering Manager,"$56,160.00"
1,Carabbio,Judith,30,Female,United States,White,11-11-2013,Software Engineering,Software Engineer,"$1,16,480.00"
2,Saada,Adell,31,Female,United States,White,05-11-2012,Software Engineering,Software Engineer,"$1,02,440.00"
3,Szabo,Andrew,34,Male,United States,White,07-07-2014,Software Engineering,Software Engineer,"$99,840.00"
4,Andreola,Colby,38,Female,United States,White,10-11-2014,Software Engineering,Software Engineer,"$99,008.00"


In [3]:
# check duplicates
df.duplicated().all()

False

In [4]:
# check null values 
df.isna().all()

Surname       False
Name          False
Age           False
Gender        False
Country       False
Ethnicity     False
Start_date    False
Department    False
Position      False
Salary        False
dtype: bool

In [5]:
# check data type
df.dtypes

Surname       object
Name          object
Age            int64
Gender        object
Country       object
Ethnicity     object
Start_date    object
Department    object
Position      object
Salary        object
dtype: object

In [6]:
# rectify data types 
df['Start_date'] = pd.to_datetime(df['Start_date'])
df['Salary'] = df['Salary'].str.replace('$','')
df['Salary'] = df['Salary'].str.replace(',','')
df['Salary'] = df['Salary'].astype(float)

In [7]:
#check data
df['Start_date']

0     2011-08-15
1     2013-11-11
2     2012-05-11
3     2014-07-07
4     2014-10-11
         ...    
169   2014-09-29
170   2012-05-14
171   2015-03-30
172   2014-10-11
173   2015-02-16
Name: Start_date, Length: 174, dtype: datetime64[ns]

In [8]:
df['Salary']

0       56160.0
1      116480.0
2      102440.0
3       99840.0
4       99008.0
         ...   
169     31200.0
170     29120.0
171    114816.0
172     88920.0
173     72696.0
Name: Salary, Length: 174, dtype: float64

In [13]:
df.columns

Index(['Age', 'Gender', 'Ethnicity', 'Start_date', 'Position', 'Salary'], dtype='object')

In [12]:
# remove unnecessary columns
df.drop(['Surname', 'Name','Country','Department'], axis = 1, inplace=True) 

In [14]:
# check gender values
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [42]:
# create data for male and female -- gender factor
df_female = df[df['Gender']=='Female']['Salary']
df_male = df[df['Gender']=='Male']['Salary']

In [16]:
# check Ethnicity values
df['Ethnicity'].unique()

array(['White', 'Two or more races', 'Hispanic',
       'Black or African American', 'Asian'], dtype=object)

In [44]:
# create data for white and non whites -- race factor
df_white = df[df['Ethnicity']=='White']['Salary']
df_non_white = df[df['Ethnicity']!='White']['Salary']

In [45]:
# create data for employees above 35 years of age. -- racial discrimination
df_over35_white = df[(df['Age']> 35) & (df['Ethnicity']=='White')]['Salary']
df_over35_non_white = df[(df['Age']> 35) & (df['Ethnicity']!='White')]['Salary']

In [104]:
# create data for white and non whites -- race factor
df_over35_female = df[(df['Age']> 35) & (df['Gender']=='Female')]['Salary']
df_over35_male = df[(df['Age']> 35) & (df['Gender']=='Male')]['Salary']

In [22]:
# top 50 employees - highest tenure / longest in the organisation - gender and race factor
df_top50_tenure = df.sort_values(by='Start_date',ascending=True)[:50]

In [47]:
# create data for male and female -- gender factor
df_top50_female = df_top50_tenure[df_top50_tenure['Gender']=='Female']['Salary']
df_top50_male = df_top50_tenure[df_top50_tenure['Gender']=='Male']['Salary']

In [48]:
# create data for male and female -- gender factor
df_top50_white = df_top50_tenure[df_top50_tenure['Ethnicity']=='White']['Salary']
df_top50_non_white = df_top50_tenure[df_top50_tenure['Ethnicity']!='White']['Salary']

### Let's begin our Hypothesis Testing.

Before proceeding lets assess the situations. The all of the condition satisfies :
* Unknown Variance, since sample data and assumed to be equal
* dependant samples

Therefor we can perform  Student's T Test. Calculate T value , p value to infer the hypothesis.

__Process :__

1. Calucate n (sample size) for both dataset.
2. Standard Deviation , s<sub>x</sub><sup>2</sup>, s<sub>y</sub><sup>2</sup>
3. Pooled Variance

   Formula : $$s_{p}^2 = \frac {(n_{x} - 1)s_{x}^2 + (n_{y} - 1)s_{y}^2}{n_{x} + n_{y} - 2}$$

4. Standard Error 

    Formula : $$std. err = \sqrt { \frac { s_{p}^2 }{n_{x}} + \frac { s_{p}^2 }{n_{y}}}$$

5. Degree of freedom 

    Formula : $$df = n_{x} + n_{y} -2$$

6. T Statistic Value 

    Formula : $$T = \frac {\bar d - \mu_{o}}{std.err} $$

7. Calculate P Value

    * Refer T Table for P value significance.

__We choose significance value of 0.05, 95%. α = 0.05__

</br>


__*Thumb Rule*__

1. If T score greater than 2, or lower than -2 we can easily reject the null Hypothesis.
2. P value lower than 0.0000 is extreamly significant, although we consider only upto 3.
    
</br>

__Questions and Hypothesis.__

For each case we consider 2 tail test, because of our hypothesis.

__Q1.__ Is there any gender wage gap in the organisation ?

$$H_{0}  :  \mu_{male} - \mu_{female} = 0 \space , \space no \space gender \space gap$$
$$H_{1}  :  \mu_{male} - \mu_{female} \neq 0 \space , \space gender \space gap \space present$$


__Q2.__ Is the company discriminating on the basis of race/ethnicity (white vs else) ?

$$H_{0}  :  \mu_{white} - \mu_{non-white} = 0 \space , \space no \space racial \space gap$$
$$H_{1}  :  \mu_{white} - \mu_{non-white} \neq 0 \space , \space racial \space gap \space present$$


__Q3.__ Is there racial discrimination for the employees above the age of 35 ?

$$H_{0}  :  \mu_{white-above35} - \mu_{non-white-above35} = 0 \space , \space no \space racial \space gap$$
$$H_{1}  :  \mu_{white-above35} - \mu_{non-white-above35} \neq 0 \space , \space racial \space gap \space present$$

__Q5.__ Is there gender discrimination for the employees above the age of 35 ?

$$H_{0}  :  \mu_{male-above35} - \mu_{female-above35} = 0 \space , \space no \space racial \space gap$$
$$H_{1}  :  \mu_{male-above35} - \mu_{female-above35} \neq 0 \space , \space racial \space gap \space present$$

__Q6.__ Are the top 50 longest working employees facing gender discrimination ?

$$H_{0}  :  \mu_{male-top50} - \mu_{female-top50} = 0 \space , \space no \space gender \space gap$$
$$H_{1}  :  \mu_{male-top50} - \mu_{female-top50} \neq 0 \space , \space gender \space gap \space present$$

__Q7.__ Are the top 50 longest working employees facing racial discrimination ?

$$H_{0}  :  \mu_{white-top50} - \mu_{non-white-top50} = 0 \space , \space no \space racial \space gap$$
$$H_{1}  :  \mu_{white-top50} - \mu_{non-white-top50} \neq 0 \space , \space racial \space gap \space present$$

__Q8.__ We should check for each department is there gender discrimination ?

$$H_{0}  :  \mu_{male-department} - \mu_{female-department} = 0 \space , \space no \space gender \space gap$$
$$H_{1}  :  \mu_{male-department} - \mu_{female-department} \neq 0 \space , \space gender \space gap \space present$$

__Q9.__ We should check for each department is there racial discrimination ?

$$H_{0}  :  \mu_{white-department} - \mu_{non-white-department} = 0 \space , \space no \space racial \space gap$$
$$H_{1}  :  \mu_{white-department} - \mu_{non-white-department} \neq 0 \space , \space racial \space gap \space present$$

In [158]:
## create function
def hypothesis_test(df1,df2,alpha,hypo):
    m1, m2 = df1.mean(), df2.mean()
    se1, se2 = sem(df1), sem(df2)
    se = math.sqrt(se1**2.0 + se2**2.0)
    ts = ((m1 - m2)-hypo) / se
    deg_free = len(df1) + len(df2) - 2
    crit_value = t.ppf(1.0 - alpha, deg_free)
    p = (1.0 - t.cdf(abs(ts), deg_free)) * 2.0
    result = ''
    if p < alpha:
        result = 'Reject the Null Hypothesis'
    else:
        result = "Don't Reject the Null Hypothesis"
        
    if np.isnan(ts):
        print('Any of the two dataset is either null or has only one item')
    else:
        print("T Statistic : ",ts)
        print("Degree of Freedom : ",deg_free)
        print("Critical Value : ",crit_value)
        print("P Value : ",p)
        print("")
        if result == 'Reject the Null Hypothesis':
            print('###########################################')
            print("Verdict : ",result)
            print('###########################################')
        else:
            print("Verdict : ",result)
        return ts

#### Q1. Is there any gender wage gap in the organisation ?

In [159]:
hypothesis_test(df_male,df_female,0.05,0)

T Statistic :  1.2508402639016816
Degree of Freedom :  172
Critical Value :  1.6537609493607581
P Value :  0.21269158371105035

Verdict :  Don't Reject the Null Hypothesis


1.2508402639016816

#### The result implies out hypothesis was true, that there was no discrimination against gender.

#### Q2. Is the company discriminating on the basis of race/ethnicity (white vs else) ?

In [160]:
hypothesis_test(df_white,df_non_white,0.05,0)

T Statistic :  -0.6572076630862012
Degree of Freedom :  172
Critical Value :  1.6537609493607581
P Value :  0.5119258627156795

Verdict :  Don't Reject the Null Hypothesis


-0.6572076630862012

#### The result implies out hypothesis was true, that there was is no racial profiling.

#### Q3. Is there racial discrimination for the employees above the age of 35 ?

In [161]:
hypothesis_test(df_over35_white,df_over35_non_white,0.05,0)

T Statistic :  0.5469636226703869
Degree of Freedom :  90
Critical Value :  1.6619610839969403
P Value :  0.5857573869489032

Verdict :  Don't Reject the Null Hypothesis


0.5469636226703869

#### The result implies out hypothesis was true, that there was is no racial profiling while the employees are abive the age of 35.

#### Q4. Is there gender discrimination for the employees above the age of 35 ?

In [162]:
hypothesis_test(df_over35_male,df_over35_female,0.05,0)

T Statistic :  1.4087675289655615
Degree of Freedom :  90
Critical Value :  1.6619610839969403
P Value :  0.16234944888715952

Verdict :  Don't Reject the Null Hypothesis


1.4087675289655615

#### The result implies out hypothesis was true, that there is no age discrimination against sex, above the age of 35.

#### Q5. Are the top 50 longest working employees facing gender discrimination ?

In [163]:
hypothesis_test(df_top50_male,df_top50_female,0.05,0)

T Statistic :  0.3527540483112269
Degree of Freedom :  48
Critical Value :  1.6772241953450393
P Value :  0.7258171918870246

Verdict :  Don't Reject the Null Hypothesis


0.3527540483112269

#### The result implies out hypothesis was true, that there is no gender discrimination on top 50 oldest working employees.

#### Q6. Are the top 50 longest working employees facing racial discrimination ?

In [164]:
hypothesis_test(df_top50_white,df_top50_non_white,0.05,0)

T Statistic :  -0.5143007160294015
Degree of Freedom :  48
Critical Value :  1.6772241953450393
P Value :  0.6094016746584257

Verdict :  Don't Reject the Null Hypothesis


-0.5143007160294015

#### The result implies out hypothesis was true, that there is no gender discrimination on top 50 oldest working employees.

#### Q7. We should check for each position is there gender discrimination ?

In [165]:
for i in ['Production Technician I','Area Sales Manager','Production Technician II','Production Manager',
          'Network Engineer','Database Administrator','Software Engineer','IT Support']:
    df_fe = df[(df['Gender']=='Female') & (df['Position']==i)]['Salary']
    df_m = df[(df['Gender']=='Male') & (df['Position']==i)]['Salary']
    print("####################",i,"####################")
    print("")
    a = hypothesis_test(df_fe,df_m,0.05,0)
    print("")
    print("")
    print("")

#################### Production Technician I ####################

T Statistic :  0.9272126831227164
Degree of Freedom :  71
Critical Value :  1.666599658219398
P Value :  0.35695740648515684

Verdict :  Don't Reject the Null Hypothesis



#################### Area Sales Manager ####################

T Statistic :  -1.068419561537325
Degree of Freedom :  21
Critical Value :  1.7207429028118775
P Value :  0.2974597027086754

Verdict :  Don't Reject the Null Hypothesis



#################### Production Technician II ####################

T Statistic :  0.05325231189065007
Degree of Freedom :  21
Critical Value :  1.7207429028118775
P Value :  0.958034218132644

Verdict :  Don't Reject the Null Hypothesis



#################### Production Manager ####################

T Statistic :  -0.5878803211399749
Degree of Freedom :  7
Critical Value :  1.894578605061305
P Value :  0.5750793434683144

Verdict :  Don't Reject the Null Hypothesis



#################### Network Engineer ############

#### We don't encounter any bias on the basis of gender in any position.

#### Q8. We should check for each position is there racial discrimination ?

In [166]:
for i in ['Production Technician I','Area Sales Manager','Production Technician II','Production Manager',
          'Network Engineer','Database Administrator','Software Engineer','IT Support']:
    df_w = df[(df['Ethnicity']=='White') & (df['Position']==i)]['Salary']
    df_nw = df[(df['Ethnicity']!='White') & (df['Position']==i)]['Salary']
    print("####################",i,"####################")
    print("")
    a = hypothesis_test(df_w,df_nw,0.05,0)
    print("")
    print("")
    print("")

#################### Production Technician I ####################

T Statistic :  -0.3774148917203454
Degree of Freedom :  71
Critical Value :  1.666599658219398
P Value :  0.7069911330462606

Verdict :  Don't Reject the Null Hypothesis



#################### Area Sales Manager ####################

T Statistic :  0.4280565206294255
Degree of Freedom :  21
Critical Value :  1.7207429028118775
P Value :  0.6729680218006755

Verdict :  Don't Reject the Null Hypothesis



#################### Production Technician II ####################

T Statistic :  -0.21077316957806802
Degree of Freedom :  21
Critical Value :  1.7207429028118775
P Value :  0.8350960794664801

Verdict :  Don't Reject the Null Hypothesis



#################### Production Manager ####################

T Statistic :  -0.43109245871924246
Degree of Freedom :  7
Critical Value :  1.894578605061305
P Value :  0.6793652591403596

Verdict :  Don't Reject the Null Hypothesis



#################### Network Engineer #########

In [167]:
# check software engineer position
df[(df['Ethnicity']=='White') & (df['Position']=='Software Engineer')]

Unnamed: 0,Age,Gender,Ethnicity,Start_date,Position,Salary
1,30,Female,White,2013-11-11,Software Engineer,116480.0
2,31,Female,White,2012-05-11,Software Engineer,102440.0
3,34,Male,White,2014-07-07,Software Engineer,99840.0
4,38,Female,White,2014-10-11,Software Engineer,99008.0


In [168]:
df[(df['Ethnicity']!='White') & (df['Position']=='Software Engineer')]

Unnamed: 0,Age,Gender,Ethnicity,Start_date,Position,Salary
123,38,Female,Black or African American,2012-09-01,Software Engineer,118809.6
159,30,Female,Asian,2013-11-11,Software Engineer,115460.8


#### All of the positions does not racially discriminate, except Software Engineer. In this we saw a negative t-statistic, which means the non white folks were provided more than the white people. The p-value was 0.04, which means for 96% of the time it will be true.

### Conclusion :

Spark Foundation is a corporation which doesn't discriminate against any standards. It can one of the best places that an individual can work in. On the contrary, in Software Engineer profile, non white people were compensated more than the white people, it could be due to any company policies like relocation or anything, so we cannot directly jump to conclusions. __Our final verdict is Spark Foundation Inc. does not have pay gap within their organisation.__