# Simpson's Paradox
Use `admission_data.csv` for this exercise.

In [72]:
# Load the admissions dataset and display it; the notebook view elides the
# middle rows of the frame, so only the first and last few records are shown
import pandas as pd
df = pd.read_csv('admission_data.csv')
df

Unnamed: 0,student_id,gender,major,admitted
0,35377,female,Chemistry,False
1,56105,male,Physics,True
2,31441,female,Chemistry,False
3,51765,male,Physics,True
4,53714,female,Physics,True
...,...,...,...,...
495,26950,male,Physics,False
496,25775,female,Chemistry,True
497,47243,female,Chemistry,True
498,52067,male,Physics,True


### Proportion and admission rate for each gender

In [73]:
# Head-count of students per gender (the basis for the proportions computed next)
df.loc[:, 'gender'].value_counts()

female    257
male      243
Name: gender, dtype: int64

In [74]:
# Inspect only the rows belonging to female students
df.loc[df['gender'] == 'female']

Unnamed: 0,student_id,gender,major,admitted
0,35377,female,Chemistry,False
2,31441,female,Chemistry,False
4,53714,female,Physics,True
5,50693,female,Chemistry,False
7,27648,female,Chemistry,True
...,...,...,...,...
492,26737,female,Chemistry,False
493,56006,female,Physics,True
494,30192,female,Chemistry,True
496,25775,female,Chemistry,True


In [75]:
# Pull the number of female students out of the per-gender tally
number_of_female = df.loc[:, 'gender'].value_counts().at['female']
number_of_female

257

In [76]:
# Number of rows in the full dataset; used as the denominator for the gender proportions
total_number = df.shape[0]
total_number

500

In [77]:
# Proportion of students that are female: female head-count over the total row count
number_of_female / total_number

0.514

In [78]:
# Share of all students who are male: look up the male head-count, then divide by the total
number_of_male = df.loc[:, 'gender'].value_counts().at['male']
number_of_male / total_number

0.486

In [79]:
# Admission outcomes for females: keep only the female rows, then tally admitted vs. not admitted
df_female = df.loc[df['gender'] == 'female']

df_female.loc[:, 'admitted'].value_counts()

False    183
True      74
Name: admitted, dtype: int64

In [80]:
# Number of female students whose `admitted` flag is True
df_female.loc[:, 'admitted'].value_counts().loc[True]

74

In [81]:
# Female admission rate: admitted females divided by all females
df_female.loc[:, 'admitted'].value_counts().loc[True] / number_of_female

0.28793774319066145

In [82]:
# Admission rate for males: admitted males divided by all males
# `df_male` is kept as a named subset because later cells break it down by major.
df_male = df[df['gender']=='male']

# Only the last expression of a cell is displayed, so the rate is the cell's sole output.
df_male['admitted'].value_counts().loc[True] / number_of_male

0.48559670781893005

### Proportion and admission rate for physics majors of each gender

In [86]:
# Re-display the female-only subset before breaking it down by major
df_female

Unnamed: 0,student_id,gender,major,admitted
0,35377,female,Chemistry,False
2,31441,female,Chemistry,False
4,53714,female,Physics,True
5,50693,female,Chemistry,False
7,27648,female,Chemistry,True
...,...,...,...,...
492,26737,female,Chemistry,False
493,56006,female,Physics,True
494,30192,female,Chemistry,True
496,25775,female,Chemistry,True


In [89]:
# Step 1 toward the female physics proportion: count the female students majoring in Physics
df_female.loc[:, 'major'].value_counts().at['Physics']

31

In [90]:
# Proportion of female students majoring in Physics: physics females over all females
df_female.loc[:, 'major'].value_counts().at['Physics'] / number_of_female

0.12062256809338522

In [91]:
# Step 1 toward the male physics proportion: count the male students majoring in Physics
df_male.loc[:, 'major'].value_counts().at['Physics']

225

In [92]:
# Proportion of male students majoring in Physics: physics males over all males
df_male.loc[:, 'major'].value_counts().at['Physics'] / number_of_male

0.9259259259259259

In [105]:
# Admission rate for female physics majors -- start by isolating that subgroup
df_female_phy = df_female.loc[df_female['major'] == 'Physics']
df_female_phy.head(5)

Unnamed: 0,student_id,gender,major,admitted
4,53714,female,Physics,True
47,54442,female,Physics,True
59,27446,female,Physics,True
66,29216,female,Physics,False
85,30213,female,Physics,False


In [101]:
# Size of the female physics subgroup, followed by how many of them were admitted
print('Number of students:', df_female_phy.shape[0])
df_female_phy.loc[:, 'admitted'].value_counts().loc[True]

Number of students: 31


23

In [102]:
# Female physics admission rate: admitted members of the subgroup over the subgroup size
df_female_phy.loc[:, 'admitted'].value_counts().loc[True] / df_female_phy.shape[0]

0.7419354838709677

In [107]:
# Admission rate for male physics majors -- start by isolating that subgroup
df_male_phy = df_male.loc[df_male['major'] == 'Physics']
df_male_phy.head(5)

Unnamed: 0,student_id,gender,major,admitted
1,56105,male,Physics,True
3,51765,male,Physics,True
6,25946,male,Physics,True
8,55247,male,Physics,False
9,35838,male,Physics,True


In [108]:
# Size of the male physics subgroup, followed by how many of them were admitted
print('Number of students:', df_male_phy.shape[0])
df_male_phy.loc[:, 'admitted'].value_counts().loc[True]

Number of students: 225


116

In [109]:
# Male physics admission rate: admitted members of the subgroup over the subgroup size
df_male_phy.loc[:, 'admitted'].value_counts().loc[True] / df_male_phy.shape[0]

0.5155555555555555

### Proportion and admission rate for chemistry majors of each gender

In [114]:
# Proportion of female students majoring in Chemistry: chemistry females over all females
df_female.loc[:, 'major'].value_counts().at['Chemistry'] / number_of_female

0.8793774319066148

In [115]:
# Proportion of male students majoring in Chemistry: chemistry males over all males
df_male.loc[:, 'major'].value_counts().at['Chemistry'] / number_of_male

0.07407407407407407

In [127]:
# Admission rate for female chemistry majors:
# rows that are Chemistry AND admitted, divided by all Chemistry rows (female subset only).
# NOTE: the `admited` spelling of the variable name is kept as-is so that any
# other cell referring to it still resolves.
len_female_chem = df_female.loc[df_female['major'] == 'Chemistry'].shape[0]
len_female_chem_admited = df_female.loc[
    (df_female['major'] == 'Chemistry') & (df_female['admitted'] == True)
].shape[0]

len_female_chem_admited / len_female_chem

0.22566371681415928

In [130]:
# Admission rate for male chemistry majors:
# rows that are Chemistry AND admitted, divided by all Chemistry rows (male subset only).
# NOTE: the `admited` spelling of the variable name is kept as-is so that any
# other cell referring to it still resolves.
len_male_chem = df_male.loc[df_male['major'] == 'Chemistry'].shape[0]
len_male_chem_admited = df_male.loc[
    (df_male['major'] == 'Chemistry') & (df_male['admitted'] == True)
].shape[0]

len_male_chem_admited / len_male_chem

0.1111111111111111

### Admission rate for each major

In [134]:
# Overall Physics admission rate (both genders pooled):
# admitted Physics rows divided by all Physics rows
(
    df.loc[(df['major'] == 'Physics') & (df['admitted'] == True)].shape[0]
    / df.loc[df['major'] == 'Physics'].shape[0]
)

0.54296875

In [132]:
# Overall Chemistry admission rate (both genders pooled):
# admitted Chemistry rows divided by all Chemistry rows
(
    df.loc[(df['major'] == 'Chemistry') & (df['admitted'] == True)].shape[0]
    / df.loc[df['major'] == 'Chemistry'].shape[0]
)

0.21721311475409835