In [7]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('survey.csv')

# Display the first 5 rows
print(df.head(5).to_markdown(index=False, numalign="left", stralign="left"))

# Print the column names and their data types
print(df.info())

| Timestamp           | Age   | Gender   | Country        | state   | self_employed   | family_history   | treatment   | work_interfere   | no_employees   | remote_work   | tech_company   | benefits   | care_options   | wellness_program   | seek_help   | anonymity   | leave              | mental_health_consequence   | phys_health_consequence   | coworkers    | supervisor   | mental_health_interview   | phys_health_interview   | mental_vs_physical   | obs_consequence   | comments   |
|:--------------------|:------|:---------|:---------------|:--------|:----------------|:-----------------|:------------|:-----------------|:---------------|:--------------|:---------------|:-----------|:---------------|:-------------------|:------------|:------------|:-------------------|:----------------------------|:--------------------------|:-------------|:-------------|:--------------------------|:------------------------|:---------------------|:------------------|:-----------|
| 2014-08-27 11:29:31 | 

In [2]:
# List of columns to check unique values
columns_to_check = ['treatment', 'Gender', 'family_history', 'no_employees', 
                    'remote_work', 'work_interfere', 'tech_company', 'benefits', 
                    'anonymity', 'leave', 'mental_health_interview']

# Iterate through each column and print unique values
for col in columns_to_check:
    unique_values = df[col].unique()
    if (len(unique_values) > 20):
        # Sample 20 of them if there are too many unique values
        print(f"Unique values for {col} (sample of 20): {pd.Series(unique_values).sample(20, replace=False).tolist()}")
    else:
        # Otherwise print all unique values from the column
        print(f"Unique values for {col}: {unique_values.tolist()}")

Unique values for treatment: ['Yes', 'No']
Unique values for Gender (sample of 20): ['male', 'Malr', 'Female ', 'maile', 'Mail', 'male leaning androgynous', 'Female', 'cis male', 'queer/she/they', 'Enby', 'A little about you', 'Female (cis)', 'fluid', 'Mal', 'Cis Male', 'f', 'woman', 'something kinda male?', 'femail', 'Male ']
Unique values for family_history: ['No', 'Yes']
Unique values for no_employees: ['6-25', 'More than 1000', '26-100', '100-500', '1-5', '500-1000']
Unique values for remote_work: ['No', 'Yes']
Unique values for work_interfere: ['Often', 'Rarely', 'Never', 'Sometimes', nan]
Unique values for tech_company: ['Yes', 'No']
Unique values for benefits: ['Yes', "Don't know", 'No']
Unique values for anonymity: ['Yes', "Don't know", 'No']
Unique values for leave: ['Somewhat easy', "Don't know", 'Somewhat difficult', 'Very difficult', 'Very easy']
Unique values for mental_health_interview: ['No', 'Yes', 'Maybe']


In [3]:
# 1. Prevalence of Treatment
treatment_counts = df['treatment'].value_counts()
treatment_percentage = (treatment_counts / df.shape[0]) * 100

print("\n**1. Prevalence of Treatment**\n")
print(f"Number of people who sought treatment: {treatment_counts['Yes']}")
print(f"Percentage of people who sought treatment: {treatment_percentage['Yes']:.2f}%\n")



**1. Prevalence of Treatment**

Number of people who sought treatment: 637
Percentage of people who sought treatment: 50.60%



In [4]:
# 2. Treatment by Gender
# Simplify the 'Gender' column
def simplify_gender(gender):
    if gender.lower() in ['male', 'm', 'cis male', 'cis-male/trans-male', 'maile', 'male (cis)', 'msle', 'mal', 'male ', 'man', 'make']:
        return 'Male'
    elif gender.lower() in ['female', 'f', 'woman', 'femake', 'female (cis)', 'cis female', 'female ']:
        return 'Female'
    else:
        return 'Other'

df['simplified_gender'] = df['Gender'].astype(str).apply(simplify_gender)

# Compute crosstab and proportions
gender_treatment_ct = pd.crosstab(df['simplified_gender'], df['treatment'])
gender_treatment_prop = gender_treatment_ct.div(gender_treatment_ct.sum(1).astype(float), axis=0)

print("\n**2. Treatment by Gender**\n")
print("Crosstab of Gender and Treatment:\n")
print(gender_treatment_ct.to_markdown(numalign="left", stralign="left"))
print("\nProportion of each gender who sought treatment:\n")
print(gender_treatment_prop.to_markdown(numalign="left", stralign="left"))


**2. Treatment by Gender**

Crosstab of Gender and Treatment:

| simplified_gender   | No   | Yes   |
|:--------------------|:-----|:------|
| Female              | 76   | 169   |
| Male                | 540  | 447   |
| Other               | 6    | 21    |

Proportion of each gender who sought treatment:

| simplified_gender   | No       | Yes      |
|:--------------------|:---------|:---------|
| Female              | 0.310204 | 0.689796 |
| Male                | 0.547112 | 0.452888 |
| Other               | 0.222222 | 0.777778 |


In [8]:
# 3. Family History and Treatment
family_treatment_ct = pd.crosstab(df['family_history'], df['treatment'])
family_treatment_prop = family_treatment_ct.div(family_treatment_ct.sum(1).astype(float), axis=0)

print("\n**3. Family History and Treatment**\n")
print("Crosstab of Family History and Treatment:\n")
print(family_treatment_ct.to_markdown(numalign="left", stralign="left"))
print("\nProportion of individuals with and without family history who sought treatment:\n")
print(family_treatment_prop.to_markdown(numalign="left", stralign="left"))



**3. Family History and Treatment**

Crosstab of Family History and Treatment:

| family_history   | No   | Yes   |
|:-----------------|:-----|:------|
| No               | 495  | 272   |
| Yes              | 127  | 365   |

Proportion of individuals with and without family history who sought treatment:

| family_history   | No       | Yes      |
|:-----------------|:---------|:---------|
| No               | 0.645372 | 0.354628 |
| Yes              | 0.25813  | 0.74187  |


In [9]:
# 4. Company Size and Discussion
df['discussed_mental_health'] = df.apply(lambda row: 'Yes' if row['supervisor'] == 'Yes' or row['coworkers'] == 'Yes' else 'No', axis=1)
company_discussion_ct = pd.crosstab(df['no_employees'], df['discussed_mental_health'])
company_discussion_prop = company_discussion_ct.div(company_discussion_ct.sum(1).astype(float), axis=0)

print("\n**4. Company Size and Discussion**\n")
print("Crosstab of Company Size and Discussion of Mental Health:\n")
print(company_discussion_ct.to_markdown(numalign="left", stralign="left"))
print("\nProportion of individuals in each company size category who discussed mental health:\n")
print(company_discussion_prop.to_markdown(numalign="left", stralign="left"))



**4. Company Size and Discussion**

Crosstab of Company Size and Discussion of Mental Health:

| no_employees   | No   | Yes   |
|:---------------|:-----|:------|
| 1-5            | 85   | 77    |
| 100-500        | 95   | 81    |
| 26-100         | 167  | 122   |
| 500-1000       | 37   | 23    |
| 6-25           | 146  | 144   |
| More than 1000 | 183  | 99    |

Proportion of individuals in each company size category who discussed mental health:

| no_employees   | No       | Yes      |
|:---------------|:---------|:---------|
| 1-5            | 0.524691 | 0.475309 |
| 100-500        | 0.539773 | 0.460227 |
| 26-100         | 0.577855 | 0.422145 |
| 500-1000       | 0.616667 | 0.383333 |
| 6-25           | 0.503448 | 0.496552 |
| More than 1000 | 0.648936 | 0.351064 |


In [10]:
# 5. Remote Work and Work Interference
remote_work_interfere_ct = pd.crosstab(df['remote_work'], df['work_interfere'])
remote_work_interfere_prop = remote_work_interfere_ct.div(remote_work_interfere_ct.sum(1).astype(float), axis=0)

print("\n**5. Remote Work and Work Interference**\n")
print("Crosstab of Remote Work and Work Interference:\n")
print(remote_work_interfere_ct.to_markdown(numalign="left", stralign="left"))
print("\nProportion of individuals in each remote work category who reported different levels of work interference:\n")
print(remote_work_interfere_prop.to_markdown(numalign="left", stralign="left"))



**5. Remote Work and Work Interference**

Crosstab of Remote Work and Work Interference:

| remote_work   | Never   | Often   | Rarely   | Sometimes   |
|:--------------|:--------|:--------|:---------|:------------|
| No            | 157     | 93      | 119      | 322         |
| Yes           | 56      | 51      | 54       | 143         |

Proportion of individuals in each remote work category who reported different levels of work interference:

| remote_work   | Never    | Often    | Rarely   | Sometimes   |
|:--------------|:---------|:---------|:---------|:------------|
| No            | 0.227207 | 0.134588 | 0.172214 | 0.465991    |
| Yes           | 0.184211 | 0.167763 | 0.177632 | 0.470395    |


In [11]:
# 6. Tech Companies and Treatment
tech_treatment_ct = pd.crosstab(df['tech_company'], df['treatment'])
tech_treatment_prop = tech_treatment_ct.div(tech_treatment_ct.sum(1).astype(float), axis=0)

print("\n**6. Tech Companies and Treatment**\n")
print("Crosstab of Tech Company and Treatment:\n")
print(tech_treatment_ct.to_markdown(numalign="left", stralign="left"))
print("\nProportion of individuals in tech and non-tech companies who sought treatment:\n")
print(tech_treatment_prop.to_markdown(numalign="left", stralign="left"))



**6. Tech Companies and Treatment**

Crosstab of Tech Company and Treatment:

| tech_company   | No   | Yes   |
|:---------------|:-----|:------|
| No             | 105  | 123   |
| Yes            | 517  | 514   |

Proportion of individuals in tech and non-tech companies who sought treatment:

| tech_company   | No       | Yes      |
|:---------------|:---------|:---------|
| No             | 0.460526 | 0.539474 |
| Yes            | 0.501455 | 0.498545 |


In [12]:
# 7. Benefits and Treatment
benefits_treatment_ct = pd.crosstab(df['benefits'], df['treatment'])
benefits_treatment_prop = benefits_treatment_ct.div(benefits_treatment_ct.sum(1).astype(float), axis=0)

print("\n**7. Benefits and Treatment**\n")
print("Crosstab of Benefits and Treatment:\n")
print(benefits_treatment_ct.to_markdown(numalign="left", stralign="left"))
print("\nProportion of individuals with and without benefits who sought treatment:\n")
print(benefits_treatment_prop.to_markdown(numalign="left", stralign="left"))



**7. Benefits and Treatment**

Crosstab of Benefits and Treatment:

| benefits   | No   | Yes   |
|:-----------|:-----|:------|
| Don't know | 257  | 151   |
| No         | 193  | 181   |
| Yes        | 172  | 305   |

Proportion of individuals with and without benefits who sought treatment:

| benefits   | No       | Yes      |
|:-----------|:---------|:---------|
| Don't know | 0.629902 | 0.370098 |
| No         | 0.516043 | 0.483957 |
| Yes        | 0.360587 | 0.639413 |


In [13]:
# 8. Anonymity and Seeking Help
anonymity_treatment_ct = pd.crosstab(df['anonymity'], df['treatment'])
anonymity_treatment_prop = anonymity_treatment_ct.div(anonymity_treatment_ct.sum(1).astype(float), axis=0)

print("\n**8. Anonymity and Seeking Help**\n")
print("Crosstab of Anonymity and Treatment:\n")
print(anonymity_treatment_ct.to_markdown(numalign="left", stralign="left"))
print("\nProportion of individuals with different perceived levels of anonymity who sought treatment:\n")
print(anonymity_treatment_prop.to_markdown(numalign="left", stralign="left"))



**8. Anonymity and Seeking Help**

Crosstab of Anonymity and Treatment:

| anonymity   | No   | Yes   |
|:------------|:-----|:------|
| Don't know  | 448  | 371   |
| No          | 27   | 38    |
| Yes         | 147  | 228   |

Proportion of individuals with different perceived levels of anonymity who sought treatment:

| anonymity   | No       | Yes      |
|:------------|:---------|:---------|
| Don't know  | 0.547009 | 0.452991 |
| No          | 0.415385 | 0.584615 |
| Yes         | 0.392    | 0.608    |


In [18]:
# 9. Leave and Treatment
leave_treatment_ct = pd.crosstab(df['leave'], df['treatment'])
leave_treatment_prop = leave_treatment_ct.div(leave_treatment_ct.sum(1).astype(float), axis=0)

print("\n**9. Leave and Treatment**\n")
print("Crosstab of Leave and Treatment:\n")
print(leave_treatment_ct.to_markdown(numalign="left", stralign="left"))
print("\nProportion of individuals with different perceived ease of taking leave who sought treatment:\n")
print(leave_treatment_prop.to_markdown(numalign="left", stralign="left"))



**9. Leave and Treatment**

Crosstab of Leave and Treatment:

| leave              | No   | Yes   |
|:-------------------|:-----|:------|
| Don't know         | 309  | 254   |
| Somewhat difficult | 44   | 82    |
| Somewhat easy      | 135  | 131   |
| Very difficult     | 31   | 67    |
| Very easy          | 103  | 103   |

Proportion of individuals with different perceived ease of taking leave who sought treatment:

| leave              | No       | Yes      |
|:-------------------|:---------|:---------|
| Don't know         | 0.548845 | 0.451155 |
| Somewhat difficult | 0.349206 | 0.650794 |
| Somewhat easy      | 0.507519 | 0.492481 |
| Very difficult     | 0.316327 | 0.683673 |
| Very easy          | 0.5      | 0.5      |


In [15]:
# 10. Interview Comfort and Treatment
interview_treatment_ct = pd.crosstab(df['mental_health_interview'], df['treatment'])
interview_treatment_prop = interview_treatment_ct.div(interview_treatment_ct.sum(1).astype(float), axis=0)

print("\n**10. Interview Comfort and Treatment**\n")
print("Crosstab of Mental Health Interview Comfort and Treatment:\n")
print(interview_treatment_ct.to_markdown(numalign="left", stralign="left"))
print("\nProportion of individuals with different comfort levels discussing mental health in interviews who sought treatment:\n")
print(interview_treatment_prop.to_markdown(numalign="left", stralign="left"))


**10. Interview Comfort and Treatment**

Crosstab of Mental Health Interview Comfort and Treatment:

| mental_health_interview   | No   | Yes   |
|:--------------------------|:-----|:------|
| Maybe                     | 125  | 82    |
| No                        | 479  | 529   |
| Yes                       | 18   | 26    |

Proportion of individuals with different comfort levels discussing mental health in interviews who sought treatment:

| mental_health_interview   | No       | Yes      |
|:--------------------------|:---------|:---------|
| Maybe                     | 0.603865 | 0.396135 |
| No                        | 0.475198 | 0.524802 |
| Yes                       | 0.409091 | 0.590909 |
