# Collection

In [None]:
import requests
import pandas as pd
import time

In [None]:
# Lists of medical specialties.
# Each list below holds 31 entries, and the entries are written in the same
# order in all four lists (index k refers to the same specialty in each one).
# The fr/it/de entries are lower-case hyphenated slugs; CollectData() inserts
# such a slug into the search URL path as its `spec` argument.
# French slugs (presumably for doctolib.fr -- confirm).
specs_fr = ["dentiste", "medecin-generaliste", "pediatre", "gynecologue", "ophtalmologue", "dermatologue", "osteopathe", "pedicure-podologue", "sage-femme", "orl-oto-rhino-laryngologie", "chirurgien-urologue", "rhumatologue", "endocrinologue", "chirurgien-orthopediste", "dieteticien", "psychologue", "neurologue", "psychiatre", "radiologue", "cardiologue", "gastro-enterologue",'chirurgien-plastique','acupuncteur','chiropracteur','geriatre','hematologue','homeopathe','medecin-du-sport','pneumologue','medecine-nucleaire','chirurgien']
# Italian slugs (passed to CollectData with ct="it", country="italia").
specs_it = ["odontoiatra", "medico-di-medicina-generale", 'pediatra','ginecologo-ostetrico','oftalmologo','dermatologo-venereologo','osteopata','podologo','ostetrico','otorinolaringoiatra','urologo','reumatologo','endocrinologo','ortopedico-traumatologo','nutrizionista','psicologo','neurologo','psichiatra','radiologo-diagnostico','cardiologo','gastroenterologo','chirurgo-plastico','agopuntore','chiropratico','geriatra','ematologo','omeopata','medico-dello-sport','pneumologo','medico-nucleare','chirurgo-generale']
# German slugs (presumably for doctolib.de -- confirm).
# NOTE(review): the first entry sits in the "dentist" position but reads as a
# prophylaxis assistant -- confirm this is the intended equivalent.
specs_de = ["prophylaxeassistentin", "allgemeinmedizin",'kinderheilkunde-kinder-und-jugendmedizin','frauenarzt','augenheilkunde','hautarzt','osteopath','podologe-fusspflege','hebamme','facharzt-fur-hno','urologie','rheumatologie','endokrinologe','orthopadie','diatassistent','psychologischer-psychotherapeut-psychotherapeutin','neurologie','psychiatrie-und-psychotherapie','radiologe','kardiologie','gastroenterologie','plastische-und-asthetische-chirurgie','akupunktur','chiropraktik','geriater','hamatologe-onkologe','homoopathie','sportwissenschaftler','pneumologe','nuklearmedizin','allgemeiner-chirurg']
# English display labels for the same 31 positions (not used in the code shown
# here; presumably consumed by later analysis -- confirm).
# NOTE(review): position 5 is labelled 'Optician' while the fr/it/de slugs in
# that position name an ophthalmologist -- confirm before relying on the label.
specs_sub = ['Dentist','Generalist Physician','Pediatrician','Gynecologist','Optician','Dermatologist-Venereologist','Osteopath','Pedicure-Podologist','Obstetrician','Otorhinolaryngologist','Urologist','Rheumatologist','Endocrinologist','Orthopedist','Nutritionist','Psychologist','Neurologist','Psychiatrist','Radiologist','Cardiologist','Gastroenterologist','Plastic Surgeon','Acupuncturist','Chiropractor','Geriatrician','Hematologist','Homeopath','Sports Physician','Pulmonologist','Nuclear Physician','General Surgeon']

In [None]:
# Collect search data from doctolib. spec: medical specialty slug, ct: two-letter
# country code (site suffix), country: country slug, npages: (optional) max number
# of result pages to collect, rt: (optional) return the dataframe.
def _get_with_retry(url, headers, params=None, retries=2, wait=15, tag="slp"):
    """GET `url`, sleeping `wait` seconds and retrying after a connection failure.

    At most `retries` extra attempts are made; the last failure is re-raised.
    `tag` is printed each time a retry is scheduled. A 0.75 s pause follows
    every successful request to throttle the crawl.
    """
    for attempt in range(retries + 1):
        try:
            response = requests.get(url, params=params, headers=headers)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            if attempt == retries:
                raise
            time.sleep(wait)
            print(tag)
        else:
            time.sleep(0.75)
            return response


def CollectData(spec,ct,country, npages=None, rt=False):
    """Scrape the doctor list, availabilities and profile details for one specialty.

    The work is done in three stages, each followed by a CSV checkpoint written
    to the current directory:

    1. search pages      -> ``<spec>_<ct>_v1.csv``
    2. availabilities    -> ``<spec>_<ct>_v2.csv`` (adds ``n_avail``, ``avail``)
    3. profile details   -> ``<spec>_<ct>.csv`` (adds ``fees``, ``details``,
       ``places``, ``languages``)

    Cells that have not been filled yet hold the placeholder string ``"Here"``.

    Parameters
    ----------
    spec : str
        Specialty slug used in the search URL path.
    ct : str
        Site suffix appended to ``www.doctolib.`` (e.g. ``"it"``).
    country : str
        Country slug used in the search URL path.
    npages : int, optional
        Maximum number of search pages to fetch; ``None`` means no limit.
    rt : bool, optional
        When true, return the final DataFrame.

    Returns
    -------
    pandas.DataFrame or None
        The collected data when ``rt`` is true, otherwise ``None``.

    Raises
    ------
    ValueError, KeyError, TypeError
        When an availability or profile response cannot be decoded. Before the
        exception propagates the current row position is printed and the partial
        data is checkpointed, so the run can be resumed from that position.
    """
    page_size = 20  # a page with fewer results than this is treated as the last one
    base_url = 'https://www.doctolib.' + ct
    search_path = '/' + spec + '/' + country
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'

    # Doctors IDs
    headers = {
        'authority': 'www.doctolib.'+ct,
        'accept': 'application/json',
        'content-type': 'application/json; charset=utf-8',
        'referer': base_url + search_path,
        'user-agent': user_agent,
    }
    spec_id = 0
    docs = []
    pages_done = 0
    while True:
        url = base_url + search_path
        if pages_done:
            url += "?page=" + str(pages_done + 1)
        try:
            response = requests.get(url, headers=headers)
            time.sleep(0.75)
            data = response.json()["data"]  # parse the payload once per page
            docs = docs + data["doctors"]
            if spec_id == 0:
                spec_id = data["speciality"]["id"]
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
            # An unreadable page ends pagination; whatever was collected is kept.
            break
        pages_done += 1
        if len(docs) < page_size * pages_done:
            break
        if pages_done == npages:
            break
    print(spec, ct, len(docs))

    if not docs:
        # Nothing was found: there is no "id" column to de-duplicate on.
        return pd.DataFrame() if rt else None

    df = pd.DataFrame(docs).drop_duplicates("id")
    # Saving Intermediate version to avoid repeating previous step
    df.to_csv(spec+'_'+ct+'_v1.csv')

    N = df.shape[0]

    def write_columns(columns):
        # Whole-column assignment avoids chained indexing (df[col].iloc[i] = ...),
        # which may write to a temporary copy instead of `df`.
        for name, values in columns.items():
            df[name] = pd.Series(values, index=df.index, dtype=object)

    # Availabilities
    params = {
        'speciality_id': str(spec_id),
        'search_result_format': 'json',
    }
    avail_cols = {"n_avail": ["Here"]*N, "avail": ["Here"]*N}
    for count, profileId in enumerate(df["id"]):
        response = _get_with_retry(
            base_url + "/search_results/" + str(profileId) + ".json",
            headers, params=params, retries=2, wait=15, tag="slp")
        try:
            result = response.json()
            total = result["total"]
        except (ValueError, KeyError, TypeError):
            print("STOP AT AVAILABILITIES, TAKE NOTE OF count")
            print("count = ",count)
            write_columns(avail_cols)
            df.to_csv(spec+'_'+ct+'_v1.csv')
            raise
        avail_cols["n_avail"][count] = total
        if total > 0:
            avail_cols["avail"][count] = result["availabilities"]
        else:
            avail_cols["avail"][count] = result.get("next_slot")
    write_columns(avail_cols)
    # Saving Intermediate version to avoid repeating previous step
    df.to_csv(spec+'_'+ct+'_v2.csv')

    # Profile Info (same headers, without the search-page referer)
    headers = {
        'authority': 'www.doctolib.'+ct,
        'accept': 'application/json',
        'content-type': 'application/json; charset=utf-8',
        'user-agent': user_agent,
    }
    profile_cols = {
        "fees": ["Here"]*N,
        "details": ["Here"]*N,
        "places": ["Here"]*N,
        "languages": ["Here"]*N,
    }

    def report_profile_problem(position, resp, doc_url):
        # Print what is needed to resume by hand, then checkpoint the partial data.
        print("STOP AT PROFILE INFO, TAKE NOTE OF i")
        print("i = ",position)
        print(resp.status_code)
        print(doc_url)
        write_columns(profile_cols)
        df.to_csv(spec+'_'+ct+'_v2.csv')

    for i, docURL in enumerate(df["link"]):
        response = _get_with_retry(base_url + docURL, headers,
                                   retries=1, wait=15, tag="slp2")
        try:
            data = response.json()["data"]
            data_keys = data.keys()
        except (ValueError, KeyError, TypeError, AttributeError):
            report_profile_problem(i, response, docURL)
            raise

        if "fees" in data:
            profile_cols["fees"][i] = data["fees"]
        else:
            # The profile is readable but has no fees: log it and carry on.
            report_profile_problem(i, response, docURL)
            print(data_keys)
            profile_cols["fees"][i] = None

        profile_cols["details"][i] = data.get("details")

        try:
            profile_cols["languages"][i] = data["profile"]["languages"]
        except (KeyError, TypeError):
            # No language list: fall back to the site's own country code.
            profile_cols["languages"][i] = [{'iso_code': ct}]

        if "place" in data:
            profile_cols["places"][i] = data["place"]
        else:
            profile_cols["places"][i] = data.get("places")

    write_columns(profile_cols)
    df.to_csv(spec+'_'+ct+'.csv')
    if rt:
        return df

In [None]:
# Run the complete collection once for every Italian specialty slug.
ct, country = "it", "italia"
for spec in specs_it:
    CollectData(spec, ct, country)

In [None]:
# Redo the availability search for one specialty (needed after an error during collection).
spec = "allgemeinmedizin"
ct = "de"
country = "deutschland"

# Row position to resume from: set it to the "count" printed when the previous run stopped.
count = 0

print(spec,"stt")
df = pd.read_csv(spec+"_"+ct+"_v1"+".csv")

headers = {
    'authority': 'www.doctolib.'+ct,
    'accept': 'application/json',
    'content-type': 'application/json; charset=utf-8',
    'referer': 'https://www.doctolib.'+ct+'/'+spec+'/'+country,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
}
# The first search page is fetched again only to recover the specialty id.
response = requests.get('https://www.doctolib.'+ct+'/'+spec+'/'+country, headers=headers)
print(response.status_code)
spec_id = response.json()["data"]["speciality"]["id"]
params = {
    'speciality_id': spec_id,
    'search_result_format': 'json',
}

# Reuse the columns of a previous partial run when present, otherwise create them.
if "avail" in df.columns:
    print(len(df.avail))
N = df.shape[0]
for col in ("n_avail", "avail"):
    if col not in df.columns:
        df[col] = [None]*N

# Results are gathered in plain lists and written back as whole columns;
# chained indexing (df[col].iloc[i] = ...) may write to a temporary copy.
n_avail_values = df["n_avail"].astype(object).tolist()
avail_values = df["avail"].astype(object).tolist()

for profileId in df["id"].iloc[count:]:
    url = "https://www.doctolib."+ct+"/search_results/"+str(profileId)+".json"
    # Up to three attempts, pausing 5 s after each connection failure.
    for attempt in range(3):
        try:
            response = requests.get(url, params=params, headers=headers)
            break
        except requests.exceptions.ConnectionError:
            if attempt == 2:
                raise
            time.sleep(5)
            print("slp")
    time.sleep(0.75)

    try:
        result = response.json()
        total = result["total"]
    except (ValueError, KeyError, TypeError):
        print("STOP AT AVAILABILITIES, TAKE NOTE OF count")
        print("count = ",count)
        df["n_avail"] = pd.Series(n_avail_values, index=df.index, dtype=object)
        df["avail"] = pd.Series(avail_values, index=df.index, dtype=object)
        df.to_csv(spec+'_'+ct+'_v1.csv')
        raise

    n_avail_values[count] = total
    if total > 0:
        avail_values[count] = result["availabilities"]
    else:
        avail_values[count] = result.get("next_slot")
    count = count+1

df["n_avail"] = pd.Series(n_avail_values, index=df.index, dtype=object)
df["avail"] = pd.Series(avail_values, index=df.index, dtype=object)
# Saving Intermediate version to avoid repeating previous step
df.to_csv(spec+'_'+ct+'_v2.csv')

allgemeinmedizin stt
200


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


STOP AT AVAILABILITIES, TAKE NOTE OF count
count =  210


JSONDecodeError: ignored

In [None]:
# Redo the profile-info search, continuing from row position "i_val".
spec = 'frauenarzt'
ct = "de"
country = "deutschland"
i_val = 149

print(spec,"stt")
df = pd.read_csv(spec+"_"+ct+"_v2"+".csv")
i = i_val
headers = {
    'authority': 'www.doctolib.'+ct,
    'accept': 'application/json',
    'content-type': 'application/json; charset=utf-8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
}

# Reuse the columns of a previous partial run when present, otherwise create them.
if "fees" in df.columns:
    print(len(df.fees))
N = df.shape[0]
profile_cols = {}
for col in ("fees", "details", "places", "languages"):
    if col not in df.columns:
        df[col] = [None]*N
    # Results are gathered in plain lists and written back as whole columns;
    # chained indexing (df.col.iloc[i] = ...) may write to a temporary copy.
    profile_cols[col] = df[col].astype(object).tolist()

for docURL in df["link"].iloc[i_val:]:
    try:
        response = requests.get('https://www.doctolib.'+ct+docURL, headers=headers)
        time.sleep(0.75)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        time.sleep(5)
        print("slp2")
        response = requests.get('https://www.doctolib.'+ct+docURL, headers=headers)

    try:
        data = response.json()["data"]
        data_keys = data.keys()
        has_fees = "fees" in data
    except (ValueError, KeyError, TypeError, AttributeError):
        # Unreadable profile: report where we are, checkpoint, and stop.
        print(response.status_code)
        print(docURL)
        print(i)
        for col, values in profile_cols.items():
            df[col] = pd.Series(values, index=df.index, dtype=object)
        df.to_csv(spec+'_'+ct+'_v2.csv')
        raise

    if has_fees:
        profile_cols["fees"][i] = data["fees"]
    else:
        # Readable profile without fees: log it, checkpoint, and carry on.
        print(response.status_code)
        print(docURL)
        print(i)
        for col, values in profile_cols.items():
            df[col] = pd.Series(values, index=df.index, dtype=object)
        df.to_csv(spec+'_'+ct+'_v2.csv')
        print(data_keys)
        profile_cols["fees"][i] = None

    profile_cols["details"][i] = data.get("details")

    try:
        profile_cols["languages"][i] = data["profile"]["languages"]
    except (KeyError, TypeError):
        # No language list: fall back to the site's own country code.
        profile_cols["languages"][i] = [{'iso_code': ct}]

    if "place" in data:
        profile_cols["places"][i] = data["place"]
    else:
        profile_cols["places"][i] = data.get("places")
    i=i+1

for col, values in profile_cols.items():
    df[col] = pd.Series(values, index=df.index, dtype=object)
df.to_csv(spec+'_'+ct+'.csv')

# Preprocessing

In [None]:
import re

In [None]:
#### file_name as "pediatra_it" with specialty and country, path_from and path_to must include '/' at the end
def pre_proc(file_name, path_from, path_to, day=4):
    """Turn a raw collection CSV into a flat table of boolean features.

    Reads ``<path_from><file_name>.csv``, derives one boolean column per
    language, payment method, home visit, availability window and
    accessibility flag from the stringified ``languages``, ``details``,
    ``avail`` and ``places`` columns, drops the raw columns and writes
    ``<path_to><file_name>_proc.csv``.

    Parameters
    ----------
    file_name : str
        File stem; its last two characters are taken as the country code.
    path_from, path_to : str
        Input and output directories, each ending with a path separator.
    day : int, optional
        Day of January 2023 on which the data was collected. "This week" is
        ``[day, day + 7]`` and "next week" is ``[day + 7, day + 14]``, both
        inclusive.

    Raises
    ------
    ValueError
        If ``day`` is not a valid day of January.
    """
    import datetime  # local import: only this function needs date arithmetic

    # Real date arithmetic keeps ISO strings zero-padded and valid across month
    # ends, which the lexicographic comparisons below depend on.
    collected = datetime.date(2023, 1, day)
    today = collected.isoformat()
    this_week = (collected + datetime.timedelta(days=7)).isoformat()
    next_week = (collected + datetime.timedelta(days=14)).isoformat()

    ct = file_name[-2:]
    df = pd.read_csv("{}{}.csv".format(path_from,file_name))

    # Output column name / code pairs, kept in the same order.
    languages = ["English","Spanish","Italian","German","Portuguese","Japanese","Chinese","Arabic","Hebrew","Swedish","Korean","Persian","Romanian","Polish","Russian","Greek","Kabyle","Berber","Hungarian","Catalan","Dutch","Iranian","Armenian","Vietnamite","French","Turkish","Kurdish","Czech","Ukrainien","Slovak","Bulgarian","French Sign Language"]
    lang_codes = ['gb','es','it','de','pt','jp','cn','ar','il','se','kr','fa','ro','pl','ru','gr','dz','ma','hu','ca','nl','ir','hy','vn','fr','tr','ku','cz','uk','sk','bg','sgn']

    def as_text(value):
        # Empty CSV cells are read back as NaN, which re.findall cannot search.
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else str(value)

    def field_values(key, text):
        # Every word that follows "'<key>': " in a stringified dict/list.
        return re.findall(r"(?<='{}': )\w+".format(key), text)

    spoken = []
    check = []
    cash = []
    card = []
    home_visit = []
    avail_this_week = []
    avail_next_week = []
    accessible = []
    handicap = []

    rows = zip(df["languages"], df["details"], df["n_avail"], df["avail"], df["places"])
    for raw_langs, raw_details, n_avail, raw_avail, raw_places in rows:
        # Languages: fall back to the country's own language when none is listed,
        # or when only 'fr' is listed on a non-French site.
        langs_doc = re.findall(r"(?<=')[a-z][a-z][a-z]?(?=')", as_text(raw_langs))
        if langs_doc == [] or (langs_doc == ['fr'] and ct != 'fr'):
            langs_doc = [ct]
        spoken.append(langs_doc)

        # Payment Methods and Home Visit
        details = as_text(raw_details)
        check.append(field_values("check", details) == ['True'])
        cash.append(field_values("cash", details) == ['True'])
        card.append(field_values("credit_card", details) == ['True'])
        home_visit.append(field_values("home_visit", details) == ['True'])

        # Availability
        slots_avail = []
        if n_avail != 0:
            slots_avail = re.findall(r"(?<='date': ')[\w\d\s-]+(?=')", as_text(raw_avail))
        if slots_avail:
            avail_this_week.append(today <= slots_avail[0] <= this_week)
            avail_next_week.append(any(this_week <= slot <= next_week for slot in slots_avail))
        else:
            # No slots reported, or none could be parsed.
            avail_this_week.append(False)
            avail_next_week.append(False)

        # Accessibility: ground floor or an elevator in any listed place.
        places = as_text(raw_places)
        floor0 = '0' in field_values("floor", places)
        elevator = 'True' in field_values("elevator", places)
        accessible.append(floor0 or elevator)
        handicap.append('True' in field_values("handicap", places))

    ## UPDATING DATAFRAME
    for name, code in zip(languages, lang_codes):
        df[name] = [code in langs_doc for langs_doc in spoken]
    df["accept_check"] = check
    df["accept_cash"] = cash
    df["accept_credit_card"] = card
    df["home_visit"] = home_visit
    df["avail_this_week"] = avail_this_week
    df["avail_next_week"] = avail_next_week
    df["accessible"] = accessible
    df["handicap"] = handicap

    cols_to_drop = ['cloudinary_public_id','position','place_id','fees','details','places','languages','Unnamed: 0','is_directory', 'kind','name_with_title', 'displayed_regulation_sector','priority_speciality','organization_status','exact_match' ]
    # Not every collected file carries every raw column; drop the ones present.
    df = df.drop(columns=cols_to_drop, errors="ignore")

    df.to_csv("{}{}_proc.csv".format(path_to, file_name))