# Download the scraped data

In [9]:
df = pd.read_csv('nyc_therapists_updated_complete.csv')
df.head(3)

Unnamed: 0,age_focus,community_focus,cost,description,grad_yr,issues,lang,license_state,mental_health,name,...,race_focus,school,sexuality,sliding,specialities,therapy_types,title,url,yrs_in_practice,zipcode
0,Adolescents / Teenagers (14 to 19)\n ...,Veterans,$250,"""My goal is to help you achieve a sense of con...",2013.0,Adoption\n ...,Russian,021021 New York,Mood Disorders\n ...,Maria Shifrin,...,,Adelphi University,Bisexual\n ...,,Infertility\n ...,Emotionally Focused\n\n\nInterpersonal\n\n\nPs...,"Psychologist,PhD",https://www.psychologytoday.com/us/therapists/...,5 Years,10019
1,Adults,,$135 - $195,"""Change IS possible. Good therapeutic work can...",2013.0,Career Counseling\n ...,,085775 New York,,Nicole Smith,...,,New York University,,1.0,Anxiety\n ...,Attachment-based\n\n\nCoaching\n\n\nCognitive ...,"Clinical Social Work/Therapist,MBA,LCSW",https://www.psychologytoday.com/us/therapists/...,,10018
2,Preteens / Tweens (11 to 13)\n ...,Aviation Professionals\n ...,$190 - $200,"""A board certified medical support hypnosis pr...",2016.0,Child or Adolescent\n ...,"French, French, Spanich and Spanish",,,Sylvain Coulon Hypnosis,...,,Hypnotherapy Academy of america,,1.0,Anxiety\n ...,Hypnotherapy\n\n\n ...,"CMS,CHT,FIBH",https://www.psychologytoday.com/us/therapists/...,3 Years,10019


# Clean the data

## Clean years in practice

In [3]:
df.yrs_in_practice.value_counts()

None         1930
20+ Years     614
10+ Years     390
15+ Years     341
30+ Years     339
8 Years       204
6 Years       201
10 Years      194
7 Years       180
9 Years       160
5 Years       146
11 Years      117
4 Years       109
12 Years       91
14 Years       75
3 Years        70
13 Years       69
15 Years       64
16 Years       61
19 Years       59
17 Years       52
20 Years       48
2 Years        47
18 Years       46
21 Years       45
24 Years       39
22 Years       33
< 2 Years      32
25 Years       32
23 Years       32
27 Years       28
29 Years       28
26 Years       28
28 Years       23
32 Years       22
34 Years       19
30 Years       17
31 Years       17
33 Years       16
39 Years       16
37 Years       16
36 Years       15
38 Years       11
35 Years        8
40 Years        7
43 Years        6
41 Years        5
< 1 Year        5
44 Years        5
47 Years        4
42 Years        4
1 Year          3
48 Years        3
45 Years        2
49 Years        1
57 Years  

### Turn years of experience into ranges
```
0-4 years
5-9
10-14
15-29
20-24
25-29
30-34
35-39
40+
```
* I counted 10+ to be in the 10-14 range, and 20+ to be in the 20-24 range, etc.

In [4]:
# First, get rid of 'Years'

df.yrs_in_practice = df.yrs_in_practice.str[:-5]
df.yrs_in_practice.value_counts()

        1930
20+      614
10+      390
15+      341
30+      339
8        204
6        201
10       194
7        180
9        160
5        146
11       117
4        109
12        91
14        75
3         70
13        69
15        64
16        61
19        59
17        52
20        48
2         47
18        46
21        45
24        39
22        33
23        32
< 2       32
25        32
27        28
26        28
29        28
28        23
32        22
34        19
31        17
30        17
37        16
33        16
39        16
36        15
38        11
35         8
40         7
43         6
< 1        5
44         5
41         5
42         4
47         4
1          3
48         3
45         2
46         1
49         1
57         1
Name: yrs_in_practice, dtype: int64

In [5]:
yrs_cleaned = []
for year in df.yrs_in_practice:
    if year == '':
        yrs_cleaned.append(np.nan)
    elif '+' in year:
        yr = year.split('+')[0]
        yrs_cleaned.append(int(yr))
    elif '< ' in year:
        yr = year.split('< ')[1]
        yrs_cleaned.append(int(yr))
    else:
        yrs_cleaned.append(int(year))
# yrs_cleaned

In [6]:
yrs_range = []
for yr in yrs_cleaned:
    if 0 <= yr <= 4:
        yrs_range.append('0-4')
    elif 5 <= yr <= 9:
        yrs_range.append('05-9')
    elif 10 <= yr <= 14:
        yrs_range.append('10-14')
    elif 15 <= yr <= 19:
        yrs_range.append('15-19')
    elif 20 <= yr <= 24:
        yrs_range.append('20-24')
    elif 25 <= yr <= 29:
        yrs_range.append('25-29')
    elif 30 <= yr <= 34:
        yrs_range.append('30-34')
    elif 35 <= yr <= 39:
        yrs_range.append('35-39')
    elif yr >= 40:
        yrs_range.append('40+')
    else:
        yrs_range.append('None')
# yrs_range

In [7]:
df['yrs_in_practice_range'] = yrs_range

In [8]:
df.yrs_in_practice_range.value_counts(dropna=False)

None     1930
10-14     936
05-9      891
20-24     811
15-19     623
30-34     430
0-4       266
25-29     139
35-39      66
40+        39
Name: yrs_in_practice_range, dtype: int64

## Clean graduation year & turn them into integers

* Set "None" as 0, then into NaN

In [9]:
df.grad_yr.value_counts(dropna=False)

NaN       1941
2011.0     183
2012.0     164
2008.0     161
2005.0     159
2013.0     156
2009.0     151
2007.0     147
2004.0     147
2006.0     145
2010.0     144
2003.0     130
2000.0     119
2014.0     118
1995.0     115
2002.0     115
2001.0     111
1998.0     103
2015.0     103
1999.0     101
1996.0      96
1997.0      95
1994.0      92
1993.0      85
2017.0      82
2016.0      75
1992.0      73
1987.0      67
1990.0      64
1991.0      60
1983.0      59
1989.0      56
1986.0      54
1980.0      54
1988.0      52
1981.0      50
2018.0      48
1985.0      48
1982.0      48
1979.0      47
1978.0      46
1984.0      40
1977.0      33
1975.0      32
1976.0      32
1974.0      22
1972.0      20
1973.0      17
1968.0      13
1971.0      12
2019.0       9
1969.0       9
1970.0       7
1967.0       6
1964.0       5
1960.0       3
1966.0       3
1962.0       2
1961.0       1
1957.0       1
Name: grad_yr, dtype: int64

In [10]:
df['grad_yr'] = df.grad_yr.astype(int, errors='ignore')

In [11]:
df.dtypes

age_focus                 object
community_focus           object
cost                      object
description               object
grad_yr                  float64
issues                    object
lang                      object
license_state             object
mental_health             object
name                      object
phone                     object
race_focus                object
school                    object
sexuality                 object
sliding                  float64
specialities              object
therapy_types             object
title                     object
url                       object
yrs_in_practice           object
zipcode                    int64
yrs_in_practice_range     object
dtype: object

# Clean cost and sliding scale

In [12]:
# No cleaning needed!

df.sliding.value_counts(dropna=False)

1.0    4670
NaN    1461
Name: sliding, dtype: int64

## When there's a range for the cost per session, I am going to replace it with an average of the range

In [10]:
df.cost.value_counts(dropna=False)

NaN            1481
$150 - $200     377
$150 - $250     241
$200 - $250     206
$100 - $150     152
$100 - $200     129
$200 - $300     125
$150 - $300      84
$100 - $250      81
$150             77
$120 - $200      75
$200             73
$120 - $150      68
$250 - $300      65
$180 - $250      65
$80 - $150       64
$200+            61
$150+            60
$150 - $180      50
$120 - $180      50
$250 - $350      50
$250             49
$170 - $250      47
$100 - $180      39
$170 - $200      38
$80 - $200       34
$90 - $150       32
$200 - $350      30
$80 - $120       27
$180 - $200      27
               ... 
$210 - $310       1
$185 - $225       1
$100 - $210       1
$450 - $500       1
$240 - $310       1
$150 - $480       1
$230 - $280       1
$240 - $500       1
$500+             1
$125 - $160       1
$50 - $90         1
$120 - $350       1
$120 - $270       1
$250 - $330       1
$125 - $400       1
$210 - $300       1
$40 - $300        1
$160 - $170       1
$100 - $275       1


In [11]:
df['cost'] = df.cost.str.replace('$','')
df['cost'] = df.cost.str.replace('+','')

In [12]:
df.cost

0             250
1       135 - 195
2       190 - 200
3        70 - 140
4       150 - 200
5       190 - 240
6       100 - 200
7             NaN
8             NaN
9             NaN
10            NaN
11      120 - 160
12            NaN
13      200 - 250
14            350
15      220 - 250
16      150 - 250
17      190 - 400
18      150 - 250
19      100 - 200
20      250 - 150
21       70 - 270
22       60 - 100
23      100 - 170
24            NaN
25            150
26      110 - 120
27      180 - 270
28      150 - 240
29      100 - 130
          ...    
6101          NaN
6102    100 - 250
6103          NaN
6104     80 - 125
6105          NaN
6106    100 - 275
6107          NaN
6108          NaN
6109          NaN
6110          NaN
6111          NaN
6112    180 - 225
6113    100 - 270
6114          NaN
6115          NaN
6116          NaN
6117          NaN
6118          NaN
6119          NaN
6120          NaN
6121          NaN
6122          NaN
6123          NaN
6124          NaN
6125      

In [13]:
cost_cleaned = []
for price in df.cost:
    try:
        if '-' in price:
            price = price.split(' - ')

            # Convert list of string into integers
            price = list(map(int, price))
            price = statistics.mean(price)
        else:
            price = int(price)
    except:
        price = np.nan
    cost_cleaned.append(price)

In [14]:
df['cost_cleaned'] = cost_cleaned

In [19]:
df.cost_cleaned.value_counts(dropna=False)

statistics.median(df.cost_cleaned.dropna())
df.cost_cleaned.describe()

count     4555.000000
mean       197.677607
std        553.196204
min         30.000000
25%        150.000000
50%        175.000000
75%        222.500000
max      33750.000000
Name: cost_cleaned, dtype: float64

## Tag what age groups therapists focus on by assigning 0 or 1 to therapists that focus on different ages

In [19]:
df['age_focus'] = df.age_focus.str.replace('  ','')
df['age_focus'] = df.age_focus.str.replace('\n\n','')
# df.age_focus.value_counts()

In [20]:
df['agefocus_toddlers'] = df.age_focus.str.contains("Toddlers").astype(int)
df['agefocus_children'] = df.age_focus.str.contains("Children").astype(int)
df['agefocus_preteen-tweens'] = df.age_focus.str.contains("Preteens").astype(int)
df['agefocus_teenagers'] = df.age_focus.str.contains("Teenagers").astype(int)
df['agefocus_adults'] = df.age_focus.str.contains("Adults").astype(int)
df['agefocus_elders'] = df.age_focus.str.contains("Elders").astype(int)

# Do the same for sexuality focus

In [21]:
df['sexuality'] = df.sexuality.str.replace('  ','')
df['sexuality'] = df.sexuality.str.replace('\n\n','')
# df.sexuality.value_counts()

In [22]:
df['sexualityfocus_bisexual'] = df.sexuality.str.contains("Bisexual").astype(int)
df['sexualityfocus_gay'] = df.sexuality.str.contains("Gay").astype(int)
df['sexualityfocus_lesbian'] = df.sexuality.str.contains("Lesbian").astype(int)

# And race focus

In [23]:
df.race_focus.value_counts(dropna=False)

None                                                         5053
African-American and Hispanic and Latino                      222
Other Racial or Ethnic Background                             168
African-American and Other Racial or Ethnic Background        145
Hispanic and Latino and Other Racial or Ethnic Background     108
Hispanic and Latino                                           101
Asian and Other Racial or Ethnic Background                    93
Asian                                                          64
African-American and Asian                                     62
Asian and Hispanic and Latino                                  42
African-American                                               38
Asian and Pacific Islander                                     14
African-American and Native American                            6
Hispanic and Latino and Native American                         5
Native American and Other Racial or Ethnic Background           3
Other Raci

In [24]:
df['racefocus_afam'] = df.race_focus.str.contains("African-American").astype(int)
df['racefocus_latino_hispanic'] = df.race_focus.str.contains("Hispanic and Latino").astype(int)
df['racefocus_asian'] = df.race_focus.str.contains("Asian").astype(int)
df['racefocus_pacific_islander'] = df.race_focus.str.contains("Pacific Islander").astype(int)
df['racefocus_native_am'] = df.race_focus.str.contains("Native American").astype(int)
df['racefocus_other_races'] = df.race_focus.str.contains("Other").astype(int)

# And community focus

In [25]:
df['community_focus'] = df.community_focus.str.replace('  ','')
df['community_focus'] = df.community_focus.str.replace('\n\n','')
# df.community_focus.value_counts()

In [26]:
# Get the list of community focuses
list(set(df.community_focus.str.split("\n").sum()))

['Veterans',
 'Vegan Allied',
 'Transgender Allied',
 'Aviation Professionals',
 'Non-Binary Allied',
 'Sex-Positive, Kink Allied',
 'Open Relationships Non-Monogamy',
 'Queer Allied',
 'Lesbian Allied',
 'Intersex Allied',
 'Cancer',
 'Gay Allied',
 'Body Positivity',
 'Sex Worker Allied',
 'HIV / AIDS Allied',
 'Racial Justice Allied',
 'Bisexual Allied',
 'None']

In [27]:
df['community_bisexual'] = df.community_focus.str.contains("Bisexual Allied").astype(int)
df['community_gay'] = df.community_focus.str.contains("Gay Allied").astype(int)
df['community_trans'] = df.community_focus.str.contains("Transgender Allied").astype(int)
df['community_lesbian'] = df.community_focus.str.contains("Lesbian Allied").astype(int)
df['community_HIV'] = df.community_focus.str.contains("HIV / AIDS Allied").astype(int)
df['community_vets'] = df.community_focus.str.contains("Veterans").astype(int)
df['community_cancer'] = df.community_focus.str.contains("HIV / AIDS Allied").astype(int)
df['community_aviation'] = df.community_focus.str.contains("Aviation Professionals").astype(int)
df['community_bodypos'] = df.community_focus.str.contains("Body Positivity").astype(int)
df['community_intersex'] = df.community_focus.str.contains("Intersex Allied").astype(int)
df['community_nonbinary'] = df.community_focus.str.contains("Non-Binary Allied").astype(int)
df['community_openrelationship'] = df.community_focus.str.contains("Open Relationships").astype(int)
df['community_queer'] = df.community_focus.str.contains("Queer Allied").astype(int)
df['community_racialjustice'] = df.community_focus.str.contains("Racial Justice Allied").astype(int)
df['community_sexworker'] = df.community_focus.str.contains("Sex Worker").astype(int)
df['community_sexpos'] = df.community_focus.str.contains("Sex-Positive, Kink Allied").astype(int)
df['community_vegan'] = df.community_focus.str.contains("Vegan Allied").astype(int)

# And for issues

In [3]:
df['issues'] = df.issues.str.replace('  ','')
df['issues'] = df.issues.str.replace('\n\n','')
df.issues.value_counts()

None                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        173
Anger Management\nBehavioral Issues\nCoping Skills\nFamily Conflict\nGrief\nObsessive-Compulsive (OCD)\nPeer Relationships\nRelationship Issues\nSchool Issues\nSelf Esteem\nStress                                                                                                                     

In [4]:
# Get the list of issue focuses
issues_list = list(set(df.issues.str.split("\n").sum()))

In [7]:
issues_list

['Developmental Trauma',
 'job stress management',
 'Performance and Social Anxiety',
 'Technology Overload',
 'OTD (off the derech "path") ex religious Jews ',
 'Asian and Asian-American',
 'Vulnerability avoidance',
 'Adult ADD',
 'I work with complicated situations ',
 'Self-harm/cutting, Rape, Crisis Mgmt.',
 'Existential issues',
 'Couple Therapy, Family Therapy',
 'Establishing a fulfilling Life',
 'Career Issues',
 'Somatic & Autoimmune Disorders',
 'Habit Disorders',
 'Race-based trauma and stressors',
 'Cofounders Counseling/Coaching',
 'creative, artistic blocks and impasses',
 'Cognitive Behavioral Therapy for Insomnia',
 "Crohn's Disease and Ulcerative Colitis(IBD)",
 'Issues Specific to Creative Professions',
 'Self Esteem Issues',
 'Cancer Diagnosis, Life After Cancer',
 'Body Dysmorphia',
 'Alternative lifestyle, Kink & Poly',
 'Motivation',
 'Fear and phobias of any types',
 'HYPNOSIS',
 'online dating',
 'Chronic Procrastination ',
 'cultural issues',
 'Chronic Illness

## Too many issues. I'm going to select:

* family
* anxiety
* alcohol
* drug
* work/career
* relationship
* stress
* trauma
* divorce
* sexual abuse

In [31]:
df['issues_family'] = df.issues.str.contains("family", case=False).astype(int)
df['issues_anxiety'] = df.issues.str.contains("anxiety", case=False).astype(int)
df['issues_alcohol'] = df.issues.str.contains("alcohol", case=False).astype(int)
df['issues_drug'] = df.issues.str.contains("drug", case=False).astype(int)
df['issues_career'] = df.issues.str.contains("career|work|job", case=False).astype(int)
df['issues_relationship'] = df.issues.str.contains("relationship", case=False).astype(int)
df['issues_stress'] = df.issues.str.contains("stress", case=False).astype(int)
df['issues_trauma'] = df.issues.str.contains("trauma", case=False).astype(int)
df['issues_divorce'] = df.issues.str.contains("divorce", case=False).astype(int)
df['issues_sexabuse'] = df.issues.str.contains("sexual abuse", case=False).astype(int)
df['issues_domesticabuse'] = df.issues.str.contains("domestic abuse", case=False).astype(int)

# Do the same for speciality

In [32]:
df['specialities'] = df.specialities.str.replace('  ','')
df['specialities'] = df.specialities.str.replace('\n\n','')
# df.specialities.value_counts()

In [33]:
df['specialities'] = df.specialities.fillna('None')

In [34]:
# Get the list of speciality focuses
specialities_list = list(set(df.specialities.str.split("\n").sum())) 
specialities_list

['Behavioral Issues',
 'Borderline Personality',
 'Dissociative Disorders',
 'Creativity',
 'Domestic Abuse',
 'Immigration and identity/cultural',
 'emotional eating',
 'MARRIAGE AND COUPLES COUNSELING',
 'Self-esteem issues',
 'Adult Psychotherapy',
 'Self-Harming',
 'job related stress',
 'Developmental Disorders',
 'Integration of Sexuality & Spirituality',
 'Quality of Life issues',
 'Couples and relationship concerns',
 "Men's Midlife Issues",
 'Transgender',
 'Indo-American, Asian, body image',
 'Gender issues',
 'Cross-cultural issues',
 "Asperger's Syndrome",
 'ACOA, Performing Arts',
 'Cultural/Cross-Cultural Issues',
 'Acculturation Issues',
 'Identity Issues',
 'living with cancer',
 'Coping Skills',
 'creative blocks',
 'Diabetes',
 'Creative Blocks',
 'Mindfulness Stress Reduction',
 'Thinking Disorders',
 'Hypnotherapy',
 'Gay',
 'Anger Management',
 'Neuropsychological Evaluation',
 'Couples/Marriage Counseling',
 'Sleep management',
 'Communication',
 'creative blocks,

## Also too many in the list, so I'm going to choose:

* pregnancy
* addiction
* stress
* anxiety
* sexual abuse
* relationship
* women
* trauma

In [35]:
df['specialities_pregnancy'] = df.specialities.str.contains("pregnancy", case=False).astype(int)
df['specialities_addiction'] = df.specialities.str.contains("addiction", case=False).astype(int)
df['specialities_stress'] = df.specialities.str.contains("stress", case=False).astype(int)
df['specialities_sexabuse'] = df.specialities.str.contains("sexual abuse", case=False).astype(int)
df['specialities_relationship'] = df.specialities.str.contains("relationship", case=False).astype(int)
df['specialities_women'] = df.specialities.str.contains("women", case=False).astype(int)
df['specialities_trauma'] = df.specialities.str.contains("pregnancy", case=False).astype(int)

# Tag therapy types

I chose:
* CBT
* Psychoanalytic
* Psychodynamic
* Marital
* Family
* Interpersonal

In [36]:
df['therapy_types'] = df.therapy_types.str.replace('  ','')
df['therapy_types'] = df.therapy_types.str.replace('\n\n','')
df.therapy_types.value_counts()

None                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     246
Psychoanalytic\nPsychodynamic                                                                                                                                                                                                                                                                                                                                                                                                                                                             87
Eclectic\nPsychodynamic                       

In [37]:
df['therapytype_cbt'] = df.therapy_types.str.contains("cbt", case=False).astype(int)
df['therapytype_psychoanalytic'] = df.therapy_types.str.contains("psychoanalytic", case=False).astype(int)
df['therapytype_psychodynamic'] = df.therapy_types.str.contains("psychodynamic", case=False).astype(int)
df['therapytype_marital'] = df.therapy_types.str.contains("marital", case=False).astype(int)
df['therapytype_family'] = df.therapy_types.str.contains("family", case=False).astype(int)
df['therapytype_interpersonal'] = df.therapy_types.str.contains("interpersonal", case=False).astype(int)

# Tag titles

I chose:

* PhD
* LCSW
* PsyD
* Psychologist
* LMFT
* LMHC

In [38]:
df.title.value_counts(dropna=False)

Clinical Social Work/Therapist,LCSW                    1264
Psychologist,PhD                                        956
Psychologist,PsyD                                       396
Clinical Social Work/Therapist,LCSW-R                   266
Clinical Social Work/Therapist,LMSW                     260
Counselor,LMHC                                          123
Counselor,MA,LMHC                                        71
Clinical Social Work/Therapist,PhD,LCSW                  65
Marriage & Family Therapist,LMFT                         46
Clinical Social Work/Therapist,MSW,LCSW                  45
Clinical Social Work/Therapist,LCSW,BCD                  40
Psychologist,Ph.D.                                       38
Clinical Social Work/Therapist                           29
Clinical Social Work/Therapist,MA,LCSW                   29
Psychologist                                             26
Clinical Social Work/Therapist,LCSW,CASAC                26
Marriage & Family Therapist,MA,LMFT     

In [39]:
df['title'] = df.title.fillna('None')

In [40]:
sum(df.title.str.contains("phd|ph.d", case=False).astype(int))

1473

In [41]:
df['title_phd'] = df.title.str.contains("phd|ph.d", case=False).astype(int)
df['title_LCSW'] = df.title.str.contains("LCSW", case=False).astype(int)
df['title_PsyD'] = df.title.str.contains("PsyD", case=False).astype(int)
df['title_psychologist'] = df.title.str.contains("psychologist", case=False).astype(int)
df['title_LMFT'] = df.title.str.contains("LMFT", case=False).astype(int)
df['title_LMHC'] = df.title.str.contains("LMHC", case=False).astype(int)

# Save the cleaned dataframe as csv

In [42]:
df.to_csv('nyc_therapists_updated_cleaned.csv', index=False)