In [1]:
import requests
import pandas as pd
import re
import numpy as np
import os

import math
import statistics

from bs4 import BeautifulSoup

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
matplotlib.rcParams['pdf.fonttype'] = 42

%matplotlib inline

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'}

# Download the scraped data

In [2]:
df = pd.read_csv('nyc_therapists_complete.csv')
df.head(3)

Unnamed: 0,age_focus,community_focus,description,grad_yr,issues,lang,license_state,mental_health,name,phone,race_focus,school,sexuality,specialities,therapy_types,title,url,yrs_in_practice,zipcode
0,Adults,,"""Welcome to Manhattan Mental Health Counseling...",,ADHD\n ...,,004643 New York,Mood Disorders,Manhattan Mental Health Counseling,(212) 960-8626,,,,Depression\n ...,,"Counselor,LMHC",https://www.psychologytoday.com/us/therapists/...,,10018
1,Adolescents / Teenagers (14 to 19)\n ...,Bisexual Allied\n ...,"""Why do people seek therapy? Usually it's beca...",1981.0,Anger Management\n ...,,026039 New York,,Ellen S. Daniels,(646) 797-5689,,Fordham Graduate School of Social Services,Bisexual\n ...,Relationship Issues\n ...,Eclectic\n\n\nInterpersonal\n\n\nPsychodynamic...,"Clinical Social Work/Therapist,LCSW-R",https://www.psychologytoday.com/us/therapists/...,20+ Years,10001
2,,,"""The American Institute for Cognitive Therapy ...",,,,007334-1 New York,,American Institute for Cognitive Therapy,(646) 762-1256,,,,Anxiety\n ...,Acceptance and Commitment Therapy (ACT)\n\n\n ...,"Psychologist,PhD",https://www.psychologytoday.com/us/therapists/...,,10022


# Clean the data

## Clean years in practice

In [3]:
df.yrs_in_practice.value_counts()

None         1917
20+ Years     613
10+ Years     391
15+ Years     342
30+ Years     340
8 Years       206
6 Years       201
10 Years      194
7 Years       179
9 Years       160
5 Years       147
11 Years      117
4 Years       109
12 Years       91
14 Years       75
13 Years       69
3 Years        69
15 Years       64
16 Years       61
19 Years       59
17 Years       52
20 Years       48
2 Years        47
18 Years       46
21 Years       45
24 Years       39
22 Years       33
25 Years       32
23 Years       32
< 2 Years      32
29 Years       28
26 Years       28
27 Years       27
32 Years       22
28 Years       22
34 Years       19
31 Years       17
30 Years       17
37 Years       16
39 Years       16
33 Years       16
36 Years       15
38 Years       11
35 Years        8
40 Years        7
43 Years        6
44 Years        5
< 1 Year        5
41 Years        5
42 Years        4
47 Years        4
1 Year          3
48 Years        3
45 Years        2
46 Years        1
57 Years  

### Turn years of experience into ranges
```
0-4 years
5-9
10-14
15-29
20-24
25-29
30-34
35-39
40+
```
* I counted 10+ to be in the 10-14 range, and 20+ to be in the 20-24 range, etc.

In [4]:
# First, get rid of 'Years'

df.yrs_in_practice = df.yrs_in_practice.str[:-5]
df.yrs_in_practice.value_counts()

        1917
20+      613
10+      391
15+      342
30+      340
8        206
6        201
10       194
7        179
9        160
5        147
11       117
4        109
12        91
14        75
3         69
13        69
15        64
16        61
19        59
17        52
20        48
2         47
18        46
21        45
24        39
22        33
23        32
25        32
< 2       32
26        28
29        28
27        27
28        22
32        22
34        19
31        17
30        17
37        16
33        16
39        16
36        15
38        11
35         8
40         7
43         6
44         5
< 1        5
41         5
47         4
42         4
48         3
1          3
45         2
46         1
57         1
49         1
Name: yrs_in_practice, dtype: int64

In [5]:
yrs_cleaned = []
for year in df.yrs_in_practice:
    if year == '':
        yrs_cleaned.append(np.nan)
    elif '+' in year:
        yr = year.split('+')[0]
        yrs_cleaned.append(int(yr))
    elif '< ' in year:
        yr = year.split('< ')[1]
        yrs_cleaned.append(int(yr))
    else:
        yrs_cleaned.append(int(year))
# yrs_cleaned

In [6]:
yrs_range = []
for yr in yrs_cleaned:
    if 0 <= yr <= 4:
        yrs_range.append('0-4')
    elif 5 <= yr <= 9:
        yrs_range.append('05-9')
    elif 10 <= yr <= 14:
        yrs_range.append('10-14')
    elif 15 <= yr <= 19:
        yrs_range.append('15-19')
    elif 20 <= yr <= 24:
        yrs_range.append('20-24')
    elif 25 <= yr <= 29:
        yrs_range.append('25-29')
    elif 30 <= yr <= 34:
        yrs_range.append('30-34')
    elif 35 <= yr <= 39:
        yrs_range.append('35-39')
    elif yr >= 40:
        yrs_range.append('40+')
    else:
        yrs_range.append('None')
# yrs_range

In [7]:
df['yrs_in_practice_range'] = yrs_range

In [8]:
df.yrs_in_practice_range.value_counts(dropna=False)

None     1917
10-14     937
05-9      893
20-24     810
15-19     624
30-34     431
0-4       265
25-29     137
35-39      66
40+        39
Name: yrs_in_practice_range, dtype: int64

## Clean graduation year & turn them into integers

* Set "None" as 0, then into NaN

In [9]:
# df.grad_yr.value_counts(dropna=False)

In [10]:
df['grad_yr'] = df.grad_yr.replace('None', 0)
df['grad_yr'] = df.grad_yr.astype(int)
df['grad_yr'] = df.grad_yr.replace({0:np.nan})

## Tag what age groups therapists focus on by assigning 0 or 1 to therapists that focus on different ages

In [11]:
df['age_focus'] = df.age_focus.str.replace('  ','')
df['age_focus'] = df.age_focus.str.replace('\n\n','')
# df.age_focus.value_counts()

In [12]:
df['agefocus_toddlers'] = df.age_focus.str.contains("Toddlers").astype(int)
df['agefocus_children'] = df.age_focus.str.contains("Children").astype(int)
df['agefocus_preteen-tweens'] = df.age_focus.str.contains("Preteens").astype(int)
df['agefocus_teenagers'] = df.age_focus.str.contains("Teenagers").astype(int)
df['agefocus_adults'] = df.age_focus.str.contains("Adults").astype(int)
df['agefocus_elders'] = df.age_focus.str.contains("Elders").astype(int)

# Do the same for sexuality focus

In [13]:
df['sexuality'] = df.sexuality.str.replace('  ','')
df['sexuality'] = df.sexuality.str.replace('\n\n','')
# df.sexuality.value_counts()

In [14]:
df['sexualityfocus_bisexual'] = df.sexuality.str.contains("Bisexual").astype(int)
df['sexualityfocus_gay'] = df.sexuality.str.contains("Gay").astype(int)
df['sexualityfocus_lesbian'] = df.sexuality.str.contains("Lesbian").astype(int)

# And race focus

In [15]:
df.race_focus.value_counts(dropna=False)

None                                                         5049
African-American and Hispanic and Latino                      222
Other Racial or Ethnic Background                             167
African-American and Other Racial or Ethnic Background        145
Hispanic and Latino and Other Racial or Ethnic Background     109
Hispanic and Latino                                           100
Asian and Other Racial or Ethnic Background                    93
African-American and Asian                                     62
Asian                                                          58
Asian and Hispanic and Latino                                  41
African-American                                               38
Asian and Pacific Islander                                     14
African-American and Native American                            6
Hispanic and Latino and Native American                         5
Native American and Other Racial or Ethnic Background           3
Other Raci

In [16]:
df['racefocus_afam'] = df.race_focus.str.contains("African-American").astype(int)
df['racefocus_latino_hispanic'] = df.race_focus.str.contains("Hispanic and Latino").astype(int)
df['racefocus_asian'] = df.race_focus.str.contains("Asian").astype(int)
df['racefocus_pacific_islander'] = df.race_focus.str.contains("Pacific Islander").astype(int)
df['racefocus_native_am'] = df.race_focus.str.contains("Native American").astype(int)
df['racefocus_other_races'] = df.race_focus.str.contains("Other").astype(int)

# And community focus

In [17]:
df['community_focus'] = df.community_focus.str.replace('  ','')
df['community_focus'] = df.community_focus.str.replace('\n\n','')
# df.community_focus.value_counts()

In [18]:
# Get the list of community focuses
list(set(df.community_focus.str.split("\n").sum()))

['Queer Allied',
 'HIV / AIDS Allied',
 'Lesbian Allied',
 'Body Positivity',
 'Transgender Allied',
 'Gay Allied',
 'Veterans',
 'Open Relationships Non-Monogamy',
 'Intersex Allied',
 'Vegan Allied',
 'Sex-Positive, Kink Allied',
 'Non-Binary Allied',
 'Sex Worker Allied',
 'Bisexual Allied',
 'Racial Justice Allied',
 'Aviation Professionals',
 'None',
 'Cancer']

In [19]:
df['community_bisexual'] = df.community_focus.str.contains("Bisexual Allied").astype(int)
df['community_gay'] = df.community_focus.str.contains("Gay Allied").astype(int)
df['community_trans'] = df.community_focus.str.contains("Transgender Allied").astype(int)
df['community_lesbian'] = df.community_focus.str.contains("Lesbian Allied").astype(int)
df['community_HIV'] = df.community_focus.str.contains("HIV / AIDS Allied").astype(int)
df['community_vets'] = df.community_focus.str.contains("Veterans").astype(int)
df['community_cancer'] = df.community_focus.str.contains("HIV / AIDS Allied").astype(int)
df['community_aviation'] = df.community_focus.str.contains("Aviation Professionals").astype(int)
df['community_bodypos'] = df.community_focus.str.contains("Body Positivity").astype(int)
df['community_intersex'] = df.community_focus.str.contains("Intersex Allied").astype(int)
df['community_nonbinary'] = df.community_focus.str.contains("Non-Binary Allied").astype(int)
df['community_openrelationship'] = df.community_focus.str.contains("Open Relationships").astype(int)
df['community_queer'] = df.community_focus.str.contains("Queer Allied").astype(int)
df['community_racialjustice'] = df.community_focus.str.contains("Racial Justice Allied").astype(int)
df['community_sexworker'] = df.community_focus.str.contains("Sex Worker").astype(int)
df['community_sexpos'] = df.community_focus.str.contains("Sex-Positive, Kink Allied").astype(int)
df['community_vegan'] = df.community_focus.str.contains("Vegan Allied").astype(int)

# And for issues

In [20]:
df['issues'] = df.issues.str.replace('  ','')
df['issues'] = df.issues.str.replace('\n\n','')
df.issues.value_counts()

None                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    172
Anger Management\nBehavioral Issues\nCoping Skills\nFamily Conflict\nGrief\nObsessive-Compulsive (OCD)\nPeer Relationships\nRelationship Issues\nSchool Issues\nSelf Esteem\nStress                                                                                                                                                         

In [21]:
# Get the list of issue focuses
issues_list = list(set(df.issues.str.split("\n").sum()))

In [22]:
issues_list

['Bariatric Surgery Psychological Evaluations',
 'Borderline Personality',
 'gender id/transition, kink friendly',
 'problem siblings',
 'migration,dealing with money',
 'Unresolved grief',
 'Neurological testing',
 'Oncology, Bereavement',
 'Sexuality including Queer, Questioning, Asexual',
 'immigration issues, psychological evaluation',
 'Workplace Relationships',
 'Sexual Abuse',
 'Ethnic Minority/Race Issues',
 'Immigration Evaluations',
 'education and learning issues',
 'Sexual Development and Sexual Identity',
 "Couple's Communication & Relationship Enhancement",
 'Relational Trauma',
 'Premarital counseling',
 'Body Image, Eating, Perfectionism, Shame',
 'Life transitions/Unresolved pain from the past',
 'marriage couseling',
 'Family therapy',
 'Compulsive shopping',
 'artistic challenges',
 'teach couples how to communicate/infidelity issues',
 'Adolescence and Young Adulthood',
 'Relationship Breakups/Divorce',
 'Family of Origin Conflict',
 'Resistances to getting to top o

## Too many issues. I'm going to select:

* family
* anxiety
* alcohol
* drug
* work/career
* relationship
* stress
* trauma
* divorce
* sexual abuse

In [23]:
df['issues_family'] = df.issues.str.contains("family", case=False).astype(int)
df['issues_anxiety'] = df.issues.str.contains("anxiety", case=False).astype(int)
df['issues_alcohol'] = df.issues.str.contains("alcohol", case=False).astype(int)
df['issues_drug'] = df.issues.str.contains("drug", case=False).astype(int)
df['issues_career'] = df.issues.str.contains("career|work|job", case=False).astype(int)
df['issues_relationship'] = df.issues.str.contains("relationship", case=False).astype(int)
df['issues_stress'] = df.issues.str.contains("stress", case=False).astype(int)
df['issues_trauma'] = df.issues.str.contains("trauma", case=False).astype(int)
df['issues_divorce'] = df.issues.str.contains("divorce", case=False).astype(int)
df['issues_sexabuse'] = df.issues.str.contains("sexual abuse", case=False).astype(int)
df['issues_domesticabuse'] = df.issues.str.contains("domestic abuse", case=False).astype(int)

# Do the same for speciality

In [24]:
df['specialities'] = df.specialities.str.replace('  ','')
df['specialities'] = df.specialities.str.replace('\n\n','')
# df.specialities.value_counts()

In [25]:
df['specialities'] = df.specialities.fillna('None')

In [26]:
# Get the list of speciality focuses
specialities_list = list(set(df.specialities.str.split("\n").sum())) 
specialities_list

['Drug Abuse',
 'Cultural and bicultural identity',
 'Emotional Disturbance',
 'Leadership Coaching',
 "Women's Issues, Self-image, sexual",
 'Elderly Persons Disorders',
 'Borderline Personality',
 'Cultural/Cross-Cultural Issues',
 'performance anxiety and enhancement',
 'Identity Issues',
 'Social Anxiety, Dating Anxiety',
 'Life Transitions',
 'Narcissistic Personality',
 'Eating Disorders',
 'Career/ Worklife issues, Resumes',
 'Ethnic/Cultural Issues',
 'Sexual Disorders & Concerns',
 'Self-defeating behavior',
 'Interfaith, Premarital',
 'Medical Detox',
 'Peer Relationships',
 'Sports Performance',
 'Relationship Issues',
 'Traumatic Brain Injury',
 'Developmental Disorders',
 'Decison Making, Moving Forward',
 "Men's Issues",
 'Trichotillomania',
 'Sexual Abuse',
 'Teacher Stress, Early Childhood Issues',
 'Medication Management',
 'Couples/ Sexual Therapy',
 'Women',
 'Working with Yoga, Chakras/Energy Work.',
 'Racial Identity',
 'Creative Blocks & Performance Anx.',
 'Stres

## Also too many in the list, so I'm going to choose:

* pregnancy
* addiction
* stress
* anxiety
* sexual abuse
* relationship
* women
* trauma

In [27]:
df['specialities_pregnancy'] = df.specialities.str.contains("pregnancy", case=False).astype(int)
df['specialities_addiction'] = df.specialities.str.contains("addiction", case=False).astype(int)
df['specialities_stress'] = df.specialities.str.contains("stress", case=False).astype(int)
df['specialities_sexabuse'] = df.specialities.str.contains("sexual abuse", case=False).astype(int)
df['specialities_relationship'] = df.specialities.str.contains("relationship", case=False).astype(int)
df['specialities_women'] = df.specialities.str.contains("women", case=False).astype(int)
df['specialities_trauma'] = df.specialities.str.contains("pregnancy", case=False).astype(int)

# Tag therapy types

I chose:
* CBT
* Psychoanalytic
* Psychodynamic
* Marital
* Family
* Interpersonal

In [28]:
df['therapy_types'] = df.therapy_types.str.replace('  ','')
df['therapy_types'] = df.therapy_types.str.replace('\n\n','')
df.therapy_types.value_counts()

None                                                                                                                                                                                                                                                                                                                                                                                                                                                                    243
Psychoanalytic\nPsychodynamic                                                                                                                                                                                                                                                                                                                                                                                                                                            87
Eclectic\nPsychodynamic                                                         

In [29]:
df['therapytype_cbt'] = df.therapy_types.str.contains("cbt", case=False).astype(int)
df['therapytype_psychoanalytic'] = df.therapy_types.str.contains("psychoanalytic", case=False).astype(int)
df['therapytype_psychodynamic'] = df.therapy_types.str.contains("psychodynamic", case=False).astype(int)
df['therapytype_marital'] = df.therapy_types.str.contains("marital", case=False).astype(int)
df['therapytype_family'] = df.therapy_types.str.contains("family", case=False).astype(int)
df['therapytype_interpersonal'] = df.therapy_types.str.contains("interpersonal", case=False).astype(int)

# Tag titles

I chose:

* PhD
* LCSW
* PsyD
* Psychologist
* LMFT
* LMHC

In [30]:
df.title.value_counts(dropna=False)

Clinical Social Work/Therapist,LCSW                          1262
Psychologist,PhD                                              954
Psychologist,PsyD                                             395
Clinical Social Work/Therapist,LCSW-R                         265
Clinical Social Work/Therapist,LMSW                           258
Counselor,LMHC                                                124
Counselor,MA,LMHC                                              70
Clinical Social Work/Therapist,PhD,LCSW                        65
Marriage & Family Therapist,LMFT                               46
Clinical Social Work/Therapist,MSW,LCSW                        45
Clinical Social Work/Therapist,LCSW,BCD                        41
Psychologist,Ph.D.                                             38
Clinical Social Work/Therapist                                 29
Clinical Social Work/Therapist,MA,LCSW                         29
Clinical Social Work/Therapist,LCSW,CASAC                      26
Marriage &

In [31]:
df['title'] = df.title.fillna('None')

In [32]:
sum(df.title.str.contains("phd|ph.d", case=False).astype(int))

1471

In [33]:
df['title_phd'] = df.title.str.contains("phd|ph.d", case=False).astype(int)
df['title_LCSW'] = df.title.str.contains("LCSW", case=False).astype(int)
df['title_PsyD'] = df.title.str.contains("PsyD", case=False).astype(int)
df['title_psychologist'] = df.title.str.contains("psychologist", case=False).astype(int)
df['title_LMFT'] = df.title.str.contains("LMFT", case=False).astype(int)
df['title_LMHC'] = df.title.str.contains("LMHC", case=False).astype(int)

In [34]:
df

Unnamed: 0,age_focus,community_focus,description,grad_yr,issues,lang,license_state,mental_health,name,phone,...,therapytype_psychodynamic,therapytype_marital,therapytype_family,therapytype_interpersonal,title_phd,title_LCSW,title_PsyD,title_psychologist,title_LMFT,title_LMHC
0,Adults,,"""Welcome to Manhattan Mental Health Counseling...",,ADHD\nBehavioral Issues\nCareer Counseling\nCo...,,004643 New York,Mood Disorders,Manhattan Mental Health Counseling,(212) 960-8626,...,0,0,0,0,0,0,0,0,0,1
1,Adolescents / Teenagers (14 to 19)\nAdults\nEl...,Bisexual Allied\nCancer\nGay Allied\nLesbian A...,"""Why do people seek therapy? Usually it's beca...",1981.0,Anger Management\nAnxiety\nBeing a Mother- or ...,,026039 New York,,Ellen S. Daniels,(646) 797-5689,...,1,0,0,1,0,1,0,0,0,0
2,,,"""The American Institute for Cognitive Therapy ...",,,,007334-1 New York,,American Institute for Cognitive Therapy,(646) 762-1256,...,0,0,0,0,1,0,0,1,0,0
3,Adults,,"""I work with Young Adults, Career Focused Indi...",,A way to stop OBSESSION after BREAKUP\nAlcohol...,,079426 New York,,Katy Goodman Psychotherapy - Relationship Expert,(917) 791-1107,...,1,0,0,1,0,1,0,0,0,0
4,Adolescents / Teenagers (14 to 19)\nAdults\nEl...,Bisexual Allied\nBody Positivity\nGay Allied\n...,"""We’re in this together! As a clinical social...",,Bipolar Disorder\nCareer Counseling\nCoping Sk...,,079136 New York,,Amy Gottheimer,(516) 701-2206,...,1,0,0,1,0,1,0,0,0,0
5,Adults\nElders (65+),Cancer\nVeterans,"""Are you unhappy or dissatisfied in your relat...",1983.0,Anger Management\nCareer Counseling\nChronic I...,,008224 New York,Personality Disorders,Linda Larkin,(646) 351-1736,...,1,1,1,0,1,0,0,1,0,0
6,Adults,,"""Feeling down? Anxious? Stuck? Starting someth...",2014.0,Addiction\nAlcohol Abuse\nAnger Management\nCo...,,021582 New York,Mood Disorders,Yuliya Bulba,(844) 257-5734,...,0,0,0,0,0,0,1,1,0,0
7,Adults,,"""I believe that therapy is an art. My patients...",1991.0,Addiction\nCoping Skills\nDivorce\nEating Diso...,,044135 New York,Dissociative Disorders\n ...,Diane M Barnes,(212) 587-7618,...,1,0,0,0,0,1,0,0,0,0
8,Preteens / Tweens (11 to 13)\nAdolescents / Te...,Bisexual Allied\nGay Allied\nLesbian Allied\nT...,"""Whether you are seeking to improve your relat...",2016.0,Addiction\nAlcohol Abuse\nAnger Management\nBe...,,001548 New York,Impulse Control Disorders\n ...,Lauren Alexandra Eavarone,(631) 483-2034,...,1,1,1,0,0,0,0,0,1,0
9,Adults\nElders (65+),,"""Therapy is a collaborative, creative, and re-...",,Antisocial Personality\nAnxiety\nCareer Counse...,,008046 New York,Mood Disorders\n ...,Robert Buck,(646) 351-1988,...,1,1,1,1,0,1,0,0,0,0


# Save the cleaned dataframe as csv

In [35]:
df.to_csv('nyc_therapists_cleaned.csv', index=False)