In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori


In [202]:
# Load the survey responses and keep only the columns used as basket items.
df = pd.read_csv("survey_results_responses.csv")
q = ['EdLevel', 'Age1stCode', 'Employment', 'YearsCode', 'Gender']
# Drop every respondent with a missing answer in any selected column, so each
# transaction below has exactly one value per question.
df = df[q].dropna()
# read_csv consumes the header line by default, so every row of the array is a
# respondent. The previous `[1:][:]` slice threw away the first respondent
# (and `[:]` was a no-op), so the full array is used here.
df_arr = df.to_numpy()

In [203]:
# Encode the transactions: fit the encoder on the answer rows, then transform
# the same rows and wrap the result in a DataFrame whose columns are the item
# labels the encoder learned (`te.columns_`).
te = TransactionEncoder()
te.fit(df_arr)
te_ary = te.transform(df_arr)
df_out = pd.DataFrame(te_ary, columns=te.columns_)

In [204]:
# Mine frequent itemsets with a minimum support of 0.01, keeping item names
# (not column indices) in the result.
out = apriori(df_out, min_support=0.01, use_colnames=True)
# Record how many items each itemset holds so it can be filtered by size.
out['length'] = out['itemsets'].map(len)
# Single items and item pairs are the only sizes used further down.
singles = out.loc[out['length'] == 1]
doubles = out.loc[out['length'] == 2]

In [205]:
# Supports of the single-item itemsets {"Woman"} and {"Man"} as a NumPy array.
# NOTE(review): the array keeps the row order of `singles`, so any code that
# indexes it positionally depends on that order.
is_woman = singles["itemsets"] == {"Woman"}
is_man = singles["itemsets"] == {"Man"}
gender = singles.loc[is_woman | is_man, "support"].to_numpy()

In [206]:
# Map each single item to its support. The `(item,)` target unpacks the sole
# member of every one-element itemset (and fails if an itemset has any other size).
single_dict = {
    item: support
    for (item,), support in zip(singles["itemsets"], singles["support"])
}


def _mentions_gender(itemset):
    """Return True when the itemset contains "Man" or "Woman"."""
    return any(label in itemset for label in ("Man", "Woman"))


# Keep only the pairs that involve one of the two gender items.
doubles = doubles[doubles["itemsets"].apply(_mentions_gender)]

In [207]:
def _gender_confidence(row):
    """Confidence of the rule gender -> item: support(pair) / support(gender).

    The gender support is looked up by name in `single_dict` instead of by
    position in the `gender` array, so the result does not depend on the row
    order of `singles`. Pairs without "Man" use the "Woman" support, as before.
    """
    gender_label = "Man" if "Man" in row["itemsets"] else "Woman"
    return row["support"] / single_dict[gender_label]


# Build a new frame with `assign` + `sort_values` rather than adding a column to,
# and sorting in place, the boolean-filtered slice (avoids SettingWithCopyWarning
# and keeps the cell safe to re-run).
doubles = (
    doubles
    .assign(confidence=doubles.apply(_gender_confidence, axis=1))
    .sort_values(by="confidence", ascending=False)
)

In [208]:
def getNonGenderItem(x):
    """Return the first item of `x` that is neither "Man" nor "Woman".

    Items are examined in the iteration order of `x`; `None` is returned when
    every item is a gender label (or `x` is empty).
    """
    gender_labels = ("Man", "Woman")
    return next((item for item in x if item not in gender_labels), None)
# Attach the non-gender member of each pair. `assign` returns a new frame, so no
# column is written into a filtered slice (avoids SettingWithCopyWarning).
doubles = doubles.assign(single=doubles["itemsets"].apply(getNonGenderItem))
# Lift of gender -> item: confidence divided by the item's own support.
# `single_dict.__getitem__` keeps the strict lookup: an item missing from
# `single_dict` raises KeyError instead of silently producing NaN.
doubles = doubles.assign(
    lift=doubles["confidence"] / doubles["single"].map(single_dict.__getitem__)
)
# Split the pairs by which gender item they contain.
men = doubles[doubles["itemsets"].apply(lambda itemset: "Man" in itemset)]
women = doubles[doubles["itemsets"].apply(lambda itemset: "Woman" in itemset)]

In [209]:
# Show the pairs that contain "Man", with their support, confidence and lift.
men

Unnamed: 0,support,itemsets,length,confidence,single,lift
203,0.597165,"(Employed full-time, Man)",2,0.655852,Employed full-time,1.010036
84,0.49172,"(11 - 17 years, Man)",2,0.540043,11 - 17 years,1.012794
199,0.389643,"(Man, Bachelor’s degree (B.A., B.S., B.Eng., e...",2,0.427935,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",1.004373
125,0.216747,"(18 - 24 years, Man)",2,0.238048,18 - 24 years,0.990032
215,0.193187,"(Master’s degree (M.A., M.S., M.Eng., MBA, etc...",2,0.212172,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",1.003116
171,0.126545,"(5 - 10 years, Man)",2,0.138981,5 - 10 years,0.991859
224,0.12182,"(Student, full-time, Man)",2,0.133792,"Student, full-time",0.970675
222,0.117158,(Some college/university study without earning...,2,0.128672,Some college/university study without earning ...,1.003106
221,0.103606,"(Secondary school (e.g. American high school, ...",2,0.113788,"Secondary school (e.g. American high school, G...",0.999555
211,0.090451,"(Man, Independent contractor, freelancer, or s...",2,0.09934,"Independent contractor, freelancer, or self-em...",1.023435


In [210]:
# Show the pairs that contain "Woman", with their support, confidence and lift.
women

Unnamed: 0,support,itemsets,length,confidence,single,lift
209,0.031829,"(Employed full-time, Woman)",2,0.638563,Employed full-time,0.983411
202,0.022902,"(Woman, Bachelor’s degree (B.A., B.S., B.Eng.,...",2,0.459466,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",1.078376
93,0.021646,"(11 - 17 years, Woman)",2,0.434273,11 - 17 years,0.814433
131,0.016934,"(Woman, 18 - 24 years)",2,0.339736,18 - 24 years,1.412945
226,0.0117,"(Master’s degree (M.A., M.S., M.Eng., MBA, etc...",2,0.234722,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",1.109727


In [None]:
"""
We ran a market basket analysis on survey results from the Stack Overflow 2021 Annual Developer Survey. 
This survey had over 80,000 participants from 180 different countries. We chose to look at information 
including gender, employment and education level. We strongly felt that we could find gender bias 
considering that women are typically not encouraged to enter the STEM field and hoped to get some 
insight through our data analysis. We chose a small support value of 0.01 for this exact reason. 
A larger support value would lead us to having almost no results for women. 

Format -> (Confidence, Lift)
Full-Time Employment: 
    Man: 0.655, 1.01
    Woman: 0.639, 0.983

Bachelors Degree: 
    Man: 0.428, 1.004
    Woman: 0.459, 1.078

Youngsters Coding: 
    Man: 0.540, 1.012
    Woman: 0.434, 0.814

Young Adults Coding: 
    Man: 0.238, 0.990
    Woman: 0.339, 1.412

Masters Degree: 
    Man: 0.212, 1.003
    Woman: 0.234, 1.109

There is a slight gender disparity in favor of women, which is quite surprising. This may largely 
be due to the lack of data on women and because we had to lower the support significantly to get any 
results for women to show up. Furthermore, there are data points for specific ages for men, but for women only 
ranges show up, indicating the disparity in the amount of data between genders. This supports the concern 
that women are not as prevalent in the STEM fields as men are, although among the respondents themselves the 
confidence and lift values above suggest there is not much of a disparity.
""" 