In [63]:
import os
import time
import pandas as pd
from getpass import getuser
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from datetime import datetime

# Login name of the current user; interpolated into the per-user paths below.
user = getuser()

# Path to the ChromeDriver
chrome_driver_path = f"C:/Users/{user}/Downloads/chromedriver.exe"
# Folder that receives the generated Excel workbooks.
output_folder = f"C:/Users/{user}/Documents/GitHub/tennis-homophily/data/atp"
# ATP doubles rankings page to scrape; the query string asks for ranks 1-5000,
# all regions, for the week of 2024-05-20.
url = "https://www.atptour.com/en/rankings/doubles?RankRange=1-5000&Region=all&DateWeek=2024-05-20"

def configure_driver():
    """Start a Chrome session driven by the local ChromeDriver binary.

    ``--verbose`` and ``--log-path`` are ChromeDriver command-line flags, not
    Chrome browser switches, so they are passed to the driver ``Service``
    (where they take effect) rather than to the browser ``Options``.

    Returns:
        A ``webdriver.Chrome`` instance. The caller must call ``quit()`` on it.
    """
    service = Service(
        chrome_driver_path,
        service_args=["--verbose", "--log-path=chromedriver.log"],
    )
    return webdriver.Chrome(service=service, options=Options())

def _parse_ranking_row(row):
    """Extract the ranking fields from one ``<tr class="lower-row">`` element.

    Any field whose cell (or anchor) is not present is reported as "N/A".

    Returns:
        ``[rank, player_name, points, tournaments_played, player_profile_link]``
    """
    def cell_text(css_classes):
        cell = row.find("td", class_=css_classes)
        return cell.get_text(strip=True) if cell else "N/A"

    player_name = "N/A"
    player_profile_link = "N/A"
    player_td = row.find("td", class_="player bold heavy large-cell")
    anchor = player_td.find("a") if player_td else None
    if anchor:
        player_name = anchor.get_text(strip=True)
        # .get() keeps the row usable even when the anchor carries no href.
        player_profile_link = anchor.get("href", "N/A")

    return [
        cell_text("rank bold heavy tiny-cell"),
        player_name,
        cell_text("points center bold extrabold small-cell"),
        cell_text("tourns center small-cell"),
        player_profile_link,
    ]


def scrape_data(url):
    """Scrape the ATP rankings table at *url*.

    Args:
        url: Address of the rankings page to load in Chrome.

    Returns:
        A list of ``[rank, player, points, tournaments played, profile link]``
        rows, restricted to rows that expose a "tournaments played" cell.
        The list is empty (never undefined) when the page fails to load.
    """
    # Bound before the try block so the final return is always valid, even
    # when loading or waiting for the page raises.
    processed_rows = []
    driver = configure_driver()

    try:
        # Navigation happens inside the try so the browser is always closed
        # by the finally clause, including when the page cannot be opened.
        driver.get(url)
        driver.minimize_window()

        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, "mega-table")))
        time.sleep(5)  # Allow additional time for the page to load completely

        soup = BeautifulSoup(driver.page_source, "html.parser")

        rows = soup.find_all("tr", class_="lower-row")  # Locate rows with class "lower-row"
        if not rows:
            print("No rows found with class 'lower-row'.")

        for index, row in enumerate(rows, start=1):
            try:
                rank, player_name, points, tournaments_played, player_profile_link = _parse_ranking_row(row)

                print(f"Processing row {index}: Rank {rank}, Player {player_name}, Points {points}, Tournaments Played {tournaments_played}, Profile Link {player_profile_link}")  # Print the current row being processed

                # Only add row if tournaments_played is not "N/A".
                # NOTE(review): in the captured run every player appears twice,
                # first with an abbreviated name and no tournaments cell, so
                # this filter appears to double as de-duplication - confirm.
                if tournaments_played != "N/A":
                    processed_rows.append([rank, player_name, points, tournaments_played, player_profile_link])
            except Exception as e:
                # One malformed row must not abort the whole scrape.
                print(f"Error processing row {index}: {e}")
    except Exception as e:
        print(f"Error loading page: {e}")
    finally:
        driver.quit()

    return processed_rows

def save_to_excel(data):
    """Write the scraped ranking rows to a timestamped Excel workbook.

    Args:
        data: Rows of ``[rank, player, points, tournaments played,
            profile link]``.

    The workbook is written to ``output_folder`` as
    ``atp_doubles_<YYYYmmdd_HHMMSS>.xlsx``.
    """
    df = pd.DataFrame(data, columns=["Rank", "Player", "Points", "Tournaments Played", "Player Profile Link"])
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Create the target folder if needed so a finished scrape is not lost
    # to a missing directory.
    os.makedirs(output_folder, exist_ok=True)
    output_excel_filename = os.path.join(output_folder, f"atp_doubles_{timestamp}.xlsx")
    df.to_excel(output_excel_filename, index=False)
    print(f"All data saved to {output_excel_filename}")

def main():
    """Scrape the rankings page and save the rows, if any, to Excel."""
    data = scrape_data(url)
    if data:
        save_to_excel(data)
    else:
        # Say so explicitly instead of exiting silently when nothing was scraped.
        print("No data to save.")


if __name__ == "__main__":
    main()


Processing row 1: Rank 1T, Player M. Granollers, Points 8,130, Tournaments Played N/A, Profile Link /en/players/marcel-granollers/g710/overview
Processing row 2: Rank 1T, Player H. Zeballos, Points 8,130, Tournaments Played N/A, Profile Link /en/players/horacio-zeballos/z184/overview
Processing row 3: Rank 3, Player M. Ebden, Points 7,390, Tournaments Played N/A, Profile Link /en/players/matthew-ebden/e690/overview
Processing row 4: Rank 4, Player R. Bopanna, Points 7,210, Tournaments Played N/A, Profile Link /en/players/rohan-bopanna/b757/overview
Processing row 5: Rank 5, Player J. Salisbury, Points 6,630, Tournaments Played N/A, Profile Link /en/players/joe-salisbury/so70/overview
Processing row 6: Rank 6, Player R. Ram, Points 6,585, Tournaments Played N/A, Profile Link /en/players/rajeev-ram/r548/overview
Processing row 7: Rank 7, Player I. Dodig, Points 6,230, Tournaments Played N/A, Profile Link /en/players/ivan-dodig/d646/overview
Processing row 8: Rank 8, Player A. Krajicek, P

Processing row 386: Rank 386, Player B. Pujol Navarro, Points 157, Tournaments Played N/A, Profile Link /en/players/bruno-pujol-navarro/p0cu/overview
Processing row 387: Rank 387, Player J. Hrazdil, Points 156, Tournaments Played N/A, Profile Link /en/players/jan-hrazdil/h0hx/overview
Processing row 388: Rank 388, Player A. Andrade, Points 156, Tournaments Played N/A, Profile Link /en/players/andres-andrade/ag08/overview
Processing row 389: Rank 389, Player V. Durasovic, Points 155, Tournaments Played N/A, Profile Link /en/players/viktor-durasovic/dc76/overview
Processing row 390: Rank 390, Player N. Sinha, Points 155, Tournaments Played N/A, Profile Link /en/players/nitin-kumar-sinha/sy50/overview
Processing row 391: Rank 391, Player J. Charlton, Points 154, Tournaments Played N/A, Profile Link /en/players/joshua-charlton/c09y/overview
Processing row 392: Rank 392, Player M. Poljicak, Points 154, Tournaments Played N/A, Profile Link /en/players/mili-poljicak/p0gq/overview
Processing r

Processing row 772: Rank 772, Player A. Dougaz, Points 57, Tournaments Played N/A, Profile Link /en/players/aziz-dougaz/df88/overview
Processing row 773: Rank 773, Player A. Matusevich, Points 57, Tournaments Played N/A, Profile Link /en/players/anton-matusevich/m0fa/overview
Processing row 774: Rank 774, Player A. Nefve, Points 57, Tournaments Played N/A, Profile Link /en/players/axel-nefve/n0ba/overview
Processing row 775: Rank 775, Player L. Giacomini, Points 57, Tournaments Played N/A, Profile Link /en/players/luca-giacomini/gk24/overview
Processing row 776: Rank 776, Player N. Sanchez Izquierdo, Points 57, Tournaments Played N/A, Profile Link /en/players/nikolas-sanchez-izquierdo/s0gz/overview
Processing row 777: Rank 777, Player B. Kozlov, Points 56, Tournaments Played N/A, Profile Link /en/players/boris-kozlov/k0af/overview
Processing row 778: Rank 778, Player T. de Bakker, Points 56, Tournaments Played N/A, Profile Link /en/players/thiemo-de-bakker/d776/overview
Processing row 

Processing row 1037: Rank 1037, Player N. Perfetti, Points 33, Tournaments Played N/A, Profile Link /en/players/noah-perfetti/p0jk/overview
Processing row 1038: Rank 1038, Player E. Bogo, Points 33, Tournaments Played N/A, Profile Link /en/players/enrique-bogo/bk43/overview
Processing row 1039: Rank 1039, Player A. Kuznetsov, Points 32, Tournaments Played N/A, Profile Link /en/players/andrey-kuznetsov/kb54/overview
Processing row 1040: Rank 1040, Player A. Rogers, Points 32, Tournaments Played N/A, Profile Link /en/players/andrew-rogers/r0d3/overview
Processing row 1041: Rank 1041, Player M. Mazza, Points 32, Tournaments Played N/A, Profile Link /en/players/manuel-mazza/m0en/overview
Processing row 1042: Rank 1042, Player I. Gakhov, Points 32, Tournaments Played N/A, Profile Link /en/players/ivan-gakhov/ge28/overview
Processing row 1043: Rank 1043, Player M. Howse, Points 32, Tournaments Played N/A, Profile Link /en/players/matthew-howse/h0it/overview
Processing row 1044: Rank 1044, Pl

Processing row 1334: Rank 1334T, Player K. Maamoun, Points 18, Tournaments Played N/A, Profile Link /en/players/karim-mohamed-maamoun/mj70/overview
Processing row 1335: Rank 1334T, Player V. Radjenovic, Points 18, Tournaments Played N/A, Profile Link /en/players/vuk-radjenovic/r0io/overview
Processing row 1336: Rank 1336, Player A. Kotzen, Points 18, Tournaments Played N/A, Profile Link /en/players/alexander-kotzen/k0el/overview
Processing row 1337: Rank 1337, Player N. Compagnucci, Points 18, Tournaments Played N/A, Profile Link /en/players/nicolas-compagnucci/cf13/overview
Processing row 1338: Rank 1338, Player J. Roberts, Points 18, Tournaments Played N/A, Profile Link /en/players/justin-roberts/rh17/overview
Processing row 1339: Rank 1339, Player S. Santibanez, Points 18, Tournaments Played N/A, Profile Link /en/players/sebastian-santibanez/sp83/overview
Processing row 1340: Rank 1340, Player M. Coman, Points 18, Tournaments Played N/A, Profile Link /en/players/mihai-alexandru-coma

Processing row 1609: Rank 1609, Player V. Ahti, Points 10, Tournaments Played N/A, Profile Link /en/players/vesa-ahti/a0g9/overview
Processing row 1610: Rank 1610T, Player A. Bancalari, Points 10, Tournaments Played N/A, Profile Link /en/players/alejandro-bancalari/b0lt/overview
Processing row 1611: Rank 1610T, Player N. Jadoun, Points 10, Tournaments Played N/A, Profile Link /en/players/nicolas-jadoun/j0dl/overview
Processing row 1612: Rank 1612, Player A. Burdet, Points 10, Tournaments Played N/A, Profile Link /en/players/adrien-burdet/b0ln/overview
Processing row 1613: Rank 1613, Player B. Kisantal, Points 10, Tournaments Played N/A, Profile Link /en/players/botond-kisantal/k0i9/overview
Processing row 1614: Rank 1614T, Player K. Collignon, Points 10, Tournaments Played N/A, Profile Link /en/players/kylian-collignon/c0pl/overview
Processing row 1615: Rank 1614T, Player A. Colombo, Points 10, Tournaments Played N/A, Profile Link /en/players/andrea-colombo/c0pg/overview
Processing row

Processing row 1893: Rank 1891T, Player D. Yoshimura, Points 5, Tournaments Played N/A, Profile Link /en/players/daiki-yoshimura/y0al/overview
Processing row 1894: Rank 1891T, Player T. Zeuch, Points 5, Tournaments Played N/A, Profile Link /en/players/tom-zeuch/z09g/overview
Processing row 1895: Rank 1895T, Player L. Carraro, Points 5, Tournaments Played N/A, Profile Link /en/players/luciano-carraro/c0c5/overview
Processing row 1896: Rank 1895T, Player L. Renard, Points 5, Tournaments Played N/A, Profile Link /en/players/lucas-renard/rb49/overview
Processing row 1897: Rank 1897T, Player A. Dorofeev, Points 5, Tournaments Played N/A, Profile Link /en/players/artiom-dorofeev/d0kb/overview
Processing row 1898: Rank 1897T, Player V. Manukyan, Points 5, Tournaments Played N/A, Profile Link /en/players/vardan-manukyan/m0tn/overview
Processing row 1899: Rank 1897T, Player C. Schuetze, Points 5, Tournaments Played N/A, Profile Link /en/players/caspar-schuetze/s0ch/overview
Processing row 1900:

Processing row 2200: Rank 2200T, Player B. Alarcon, Points 3, Tournaments Played N/A, Profile Link /en/players/benjamin-denis-alarcon/a0g1/overview
Processing row 2201: Rank 2200T, Player L. Borg, Points 3, Tournaments Played N/A, Profile Link /en/players/leo-borg/b0jg/overview
Processing row 2202: Rank 2200T, Player P. Iamachkine, Points 3, Tournaments Played N/A, Profile Link /en/players/petr-iamachkine/i371/overview
Processing row 2203: Rank 2200T, Player Y. Su, Points 3, Tournaments Played N/A, Profile Link /en/players/yu-hsiang-su/s0t9/overview
Processing row 2204: Rank 2200T, Player M. Sudzum, Points 3, Tournaments Played N/A, Profile Link /en/players/marcel-marlon-sudzum/s0db/overview
Processing row 2205: Rank 2205T, Player D. Javia, Points 3, Tournaments Played N/A, Profile Link /en/players/dev-javia/j0cl/overview
Processing row 2206: Rank 2205T, Player M. Zayid, Points 3, Tournaments Played N/A, Profile Link /en/players/mubarak-shannan-zayid/ac39/overview
Processing row 2207: 

Processing row 2470: Rank 2467T, Player G. Di Natale, Points 2, Tournaments Played N/A, Profile Link /en/players/gian-matias-di-natale/d0en/overview
Processing row 2471: Rank 2467T, Player M. Figl, Points 2, Tournaments Played N/A, Profile Link /en/players/maximilian-figl/f0fb/overview
Processing row 2472: Rank 2467T, Player J. Jin, Points 2, Tournaments Played N/A, Profile Link /en/players/jeremy-jin/j0d4/overview
Processing row 2473: Rank 2467T, Player A. Laborde, Points 2, Tournaments Played N/A, Profile Link /en/players/arthur-laborde/l0kj/overview
Processing row 2474: Rank 2467T, Player E. Roux, Points 2, Tournaments Played N/A, Profile Link /en/players/eloi-roux/r0kl/overview
Processing row 2475: Rank 2467T, Player A. Van Baal, Points 2, Tournaments Played N/A, Profile Link /en/players/aloys-j-willem-van-baal/v0cv/overview
Processing row 2476: Rank 2467T, Player M. Woerndle, Points 2, Tournaments Played N/A, Profile Link /en/players/matthew-woerndle/w0ar/overview
Processing row 2

Processing row 2597: Rank 107, Player Manuel Guinard, Points 801, Tournaments Played 22, Profile Link /en/players/manuel-guinard/gh33/overview
Processing row 2598: Rank 108, Player Marcus Willis, Points 800, Tournaments Played 34, Profile Link /en/players/marcus-willis/w521/overview
Processing row 2599: Rank 109, Player Philipp Oswald, Points 798, Tournaments Played 35, Profile Link /en/players/philipp-oswald/o305/overview
Processing row 2600: Rank 110, Player Piotr Matuszewski, Points 798, Tournaments Played 39, Profile Link /en/players/piotr-matuszewski/m09u/overview
Processing row 2601: Rank 111, Player Yannick Hanfmann, Points 790, Tournaments Played 6, Profile Link /en/players/yannick-hanfmann/h997/overview
Processing row 2602: Rank 112, Player Marco Bortolotti, Points 788, Tournaments Played 33, Profile Link /en/players/marco-bortolotti/bh08/overview
Processing row 2603: Rank 113, Player JiSung Nam, Points 786, Tournaments Played 32, Profile Link /en/players/jisung-nam/n652/overv

Processing row 2820: Rank 330, Player Sander Jong, Points 191, Tournaments Played 16, Profile Link /en/players/sander-jong/j0cu/overview
Processing row 2821: Rank 331, Player Quentin Halys, Points 190, Tournaments Played 5, Profile Link /en/players/quentin-halys/hb64/overview
Processing row 2822: Rank 332, Player Alexander Merino, Points 190, Tournaments Played 24, Profile Link /en/players/alexander-merino/mm31/overview
Processing row 2823: Rank 333, Player Carlos Sanchez Jover, Points 188, Tournaments Played 19, Profile Link /en/players/carlos-sanchez-jover/s0cg/overview
Processing row 2824: Rank 334, Player Zvonimir Babic, Points 188, Tournaments Played 19, Profile Link /en/players/zvonimir-babic/bm54/overview
Processing row 2825: Rank 335, Player Roman Safiullin, Points 187, Tournaments Played 8, Profile Link /en/players/roman-safiullin/sx50/overview
Processing row 2826: Rank 336, Player Antoine Bellier, Points 186, Tournaments Played 5, Profile Link /en/players/antoine-bellier/bp41

Processing row 3066: Rank 573T, Player Luca Van Assche, Points 90, Tournaments Played 3, Profile Link /en/players/luca-van-assche/v0dz/overview
Processing row 3067: Rank 577, Player Marton Fucsovics, Points 90, Tournaments Played 4, Profile Link /en/players/marton-fucsovics/f724/overview
Processing row 3068: Rank 578, Player Casper Ruud, Points 90, Tournaments Played 4, Profile Link /en/players/casper-ruud/rh16/overview
Processing row 3069: Rank 579, Player Jannik Sinner, Points 90, Tournaments Played 5, Profile Link /en/players/jannik-sinner/s0ag/overview
Processing row 3070: Rank 580, Player Felix Auger-Aliassime, Points 90, Tournaments Played 6, Profile Link /en/players/felix-auger-aliassime/ag37/overview
Processing row 3071: Rank 581, Player Frances Tiafoe, Points 90, Tournaments Played 6, Profile Link /en/players/frances-tiafoe/td51/overview
Processing row 3072: Rank 582, Player Maxime Janvier, Points 90, Tournaments Played 7, Profile Link /en/players/maxime-janvier/j620/overview


Processing row 3248: Rank 758, Player Oliver Okonkwo, Points 58, Tournaments Played 6, Profile Link /en/players/oliver-okonkwo/o0bc/overview
Processing row 3249: Rank 759, Player Jerome Kym, Points 58, Tournaments Played 6, Profile Link /en/players/jerome-kym/k0ep/overview
Processing row 3250: Rank 760, Player Mats Rosenkranz, Points 58, Tournaments Played 8, Profile Link /en/players/mats-rosenkranz/rh62/overview
Processing row 3251: Rank 761, Player Juan Bautista Torres, Points 58, Tournaments Played 12, Profile Link /en/players/juan-bautista-torres/t0dm/overview
Processing row 3252: Rank 762, Player Alafia Ayeni, Points 58, Tournaments Played 13, Profile Link /en/players/alafia-ayeni/a09p/overview
Processing row 3253: Rank 763, Player Vadym Ursu, Points 58, Tournaments Played 14, Profile Link /en/players/vadym-ursu/u176/overview
Processing row 3254: Rank 764, Player Fabio Coelho, Points 58, Tournaments Played 15, Profile Link /en/players/fabio-coelho/c09s/overview
Processing row 3255

Processing row 3499: Rank 1009, Player Ye Cong Mo, Points 35, Tournaments Played 18, Profile Link /en/players/ye-cong-mo/m0bw/overview
Processing row 3500: Rank 1010, Player Orel Kimhi, Points 35, Tournaments Played 19, Profile Link /en/players/orel-kimhi/k0ho/overview
Processing row 3501: Rank 1011, Player Darian King, Points 34, Tournaments Played 4, Profile Link /en/players/darian-king/kc86/overview
Processing row 3502: Rank 1012, Player Stefan Latinovic, Points 34, Tournaments Played 4, Profile Link /en/players/stefan-latinovic/l0c6/overview
Processing row 3503: Rank 1013, Player Lukas Hellum-Lilleengen, Points 34, Tournaments Played 6, Profile Link /en/players/lukas-hellum-lilleengen/h0bc/overview
Processing row 3504: Rank 1014, Player Max Westphal, Points 34, Tournaments Played 6, Profile Link /en/players/max-westphal/w0bs/overview
Processing row 3505: Rank 1015, Player Edas Butvilas, Points 34, Tournaments Played 7, Profile Link /en/players/edas-butvilas/b0mm/overview
Processing

Processing row 3755: Rank 1265, Player Noah Schlagenhauf, Points 20, Tournaments Played 5, Profile Link /en/players/noah-schlagenhauf/s0sn/overview
Processing row 3756: Rank 1266, Player Anton Chekhov, Points 20, Tournaments Played 5, Profile Link /en/players/anton-chekhov/cc79/overview
Processing row 3757: Rank 1267, Player Loan Lestir, Points 20, Tournaments Played 5, Profile Link /en/players/loan-lestir/l0im/overview
Processing row 3758: Rank 1268, Player Mathys Erhard, Points 20, Tournaments Played 5, Profile Link /en/players/mathys-erhard/e0ac/overview
Processing row 3759: Rank 1269, Player Miguel Damas, Points 20, Tournaments Played 6, Profile Link /en/players/miguel-damas/d0cn/overview
Processing row 3760: Rank 1270, Player Luis Francisco, Points 20, Tournaments Played 6, Profile Link /en/players/luis-francisco/f0e3/overview
Processing row 3761: Rank 1271, Player Gonzalo Zeitune, Points 20, Tournaments Played 6, Profile Link /en/players/gonzalo-zeitune/z0cm/overview
Processing r

Processing row 4012: Rank 1522, Player Marco Berti, Points 12, Tournaments Played 7, Profile Link /en/players/marco-berti/b0oy/overview
Processing row 4013: Rank 1523T, Player Leonardo Malgaroli, Points 12, Tournaments Played 7, Profile Link /en/players/leonardo-malgaroli/m0n4/overview
Processing row 4014: Rank 1523T, Player Giannicola Misasi, Points 12, Tournaments Played 7, Profile Link /en/players/giannicola-misasi/m0ic/overview
Processing row 4015: Rank 1525, Player Perry Gregg, Points 12, Tournaments Played 7, Profile Link /en/players/perry-gregg/g0b1/overview
Processing row 4016: Rank 1526T, Player Arda Azkara, Points 12, Tournaments Played 7, Profile Link /en/players/arda-azkara/a0dv/overview
Processing row 4017: Rank 1526T, Player Oskar Grzegorzewski, Points 12, Tournaments Played 7, Profile Link /en/players/oskar-grzegorzewski/g0mb/overview
Processing row 4018: Rank 1528, Player Cash Hanzlik, Points 12, Tournaments Played 8, Profile Link /en/players/cash-hanzlik/h0ec/overview


Processing row 4273: Rank 1783, Player Laelson Rodrigues, Points 7, Tournaments Played 6, Profile Link /en/players/laelson-rodrigues/r0d6/overview
Processing row 4274: Rank 1784T, Player Gabriel Evans, Points 7, Tournaments Played 6, Profile Link /en/players/gabriel-evans/e0ck/overview
Processing row 4275: Rank 1784T, Player Alan Raul Sau Franco, Points 7, Tournaments Played 6, Profile Link /en/players/alan-raul-sau-franco/s0bb/overview
Processing row 4276: Rank 1784T, Player Georgios Valaris, Points 7, Tournaments Played 6, Profile Link /en/players/georgios-valaris/v0hy/overview
Processing row 4277: Rank 1787, Player Manuel Sanchez, Points 7, Tournaments Played 7, Profile Link /en/players/manuel-sanchez/si82/overview
Processing row 4278: Rank 1788T, Player Marco Furlanetto, Points 7, Tournaments Played 7, Profile Link /en/players/marco-furlanetto/f0dd/overview
Processing row 4279: Rank 1788T, Player Martyn Pawelski, Points 7, Tournaments Played 7, Profile Link /en/players/martyn-pawel

Processing row 4611: Rank 2088T, Player Cezar Gabriel Papoe, Points 3, Tournaments Played 2, Profile Link /en/players/cezar-gabriel-papoe/p0mt/overview
Processing row 4612: Rank 2088T, Player Christopher Patzanovsky, Points 3, Tournaments Played 2, Profile Link /en/players/christopher-patzanovsky/p0eh/overview
Processing row 4613: Rank 2088T, Player Yannick Penkner, Points 3, Tournaments Played 2, Profile Link /en/players/yannick-penkner/p0mr/overview
Processing row 4614: Rank 2088T, Player Rares Teodor Pieleanu, Points 3, Tournaments Played 2, Profile Link /en/players/rares-teodor-pieleanu/p0ms/overview
Processing row 4615: Rank 2088T, Player Alexander Richards, Points 3, Tournaments Played 2, Profile Link /en/players/alexander-richards/r0gs/overview
Processing row 4616: Rank 2088T, Player Rikard Roos, Points 3, Tournaments Played 2, Profile Link /en/players/rikard-roos/r331/overview
Processing row 4617: Rank 2088T, Player Samuel Rubell, Points 3, Tournaments Played 2, Profile Link /e

Processing row 4909: Rank 2375T, Player Mousa Shanan Zayed, Points 2, Tournaments Played 3, Profile Link /en/players/mousa-shanan-zayed/z367/overview
Processing row 4910: Rank 2375T, Player Ivan Zaytsev, Points 2, Tournaments Played 3, Profile Link /en/players/ivan-zaytsev/z0aw/overview
Processing row 4911: Rank 2421T, Player Alexey Aleshchev, Points 2, Tournaments Played 4, Profile Link /en/players/alexey-aleshchev/a0av/overview
Processing row 4912: Rank 2421T, Player Carson Baker, Points 2, Tournaments Played 4, Profile Link /en/players/carson-baker/b0ud/overview
Processing row 4913: Rank 2421T, Player Nicolas Colne, Points 2, Tournaments Played 4, Profile Link /en/players/nicolas-colne/c0q2/overview
Processing row 4914: Rank 2421T, Player Mariano Dedura-Palomero, Points 2, Tournaments Played 4, Profile Link /en/players/mariano-dedura-palomero/d0ku/overview
Processing row 4915: Rank 2421T, Player Liam Delicata, Points 2, Tournaments Played 4, Profile Link /en/players/liam-delicata/d0

All data saved to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp\atp_doubles_20240602_192735.xlsx


In [70]:
import os
import time
import pandas as pd
from getpass import getuser
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from datetime import datetime

# Login name of the current user; interpolated into the per-user paths below.
user = getuser()

# Path to the ChromeDriver
chrome_driver_path = f"C:/Users/{user}/Downloads/chromedriver.exe"
# Folder that receives the generated Excel workbooks.
output_folder = f"C:/Users/{user}/Documents/GitHub/tennis-homophily/data/atp"
# ATP doubles rankings page to scrape; the query string asks for ranks 1-5000,
# all regions, for the week of 2024-05-20.
url = "https://www.atptour.com/en/rankings/doubles?RankRange=1-5000&Region=all&DateWeek=2024-05-20"

def configure_driver():
    """Start a Chrome session driven by the local ChromeDriver binary.

    ``--verbose`` and ``--log-path`` are ChromeDriver command-line flags, not
    Chrome browser switches, so they are passed to the driver ``Service``
    (where they take effect) rather than to the browser ``Options``.

    Returns:
        A ``webdriver.Chrome`` instance. The caller must call ``quit()`` on it.
    """
    service = Service(
        chrome_driver_path,
        service_args=["--verbose", "--log-path=chromedriver.log"],
    )
    return webdriver.Chrome(service=service, options=Options())

def _parse_ranking_row(row):
    """Extract the ranking fields from one ``<tr class="lower-row">`` element.

    Any field whose cell (or anchor) is not present is reported as "N/A".

    Returns:
        ``[rank, player_name, points, tournaments_played, player_profile_link]``
    """
    def cell_text(css_classes):
        cell = row.find("td", class_=css_classes)
        return cell.get_text(strip=True) if cell else "N/A"

    player_name = "N/A"
    player_profile_link = "N/A"
    player_td = row.find("td", class_="player bold heavy large-cell")
    anchor = player_td.find("a") if player_td else None
    if anchor:
        player_name = anchor.get_text(strip=True)
        # .get() avoids a KeyError when the anchor carries no href.
        player_profile_link = anchor.get("href", "N/A")

    return [
        cell_text("rank bold heavy tiny-cell"),
        player_name,
        cell_text("points center bold extrabold small-cell"),
        cell_text("tourns center small-cell"),
        player_profile_link,
    ]


def scrape_urls(url):
    """Collect profile links and basic data for the first 50 table rows at *url*.

    Args:
        url: Address of the rankings page to load in Chrome.

    Returns:
        A list of ``[rank, player, points, tournaments played, profile link]``
        rows; only rows with both a player name and a profile link are kept.
    """
    all_data = []
    driver = configure_driver()

    try:
        # Navigation happens inside the try so the browser is always closed
        # by the finally clause, including when the page cannot be opened.
        driver.get(url)
        driver.minimize_window()

        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, "mega-table")))
        time.sleep(5)  # Allow additional time for the page to load completely

        soup = BeautifulSoup(driver.page_source, "html.parser")

        rows = soup.find_all("tr", class_="lower-row")  # Locate rows with class "lower-row"
        if not rows:
            print("No rows found with class 'lower-row'.")

        for index, row in enumerate(rows[:50], start=1):  # Limit to the first 50 rows
            try:
                rank, player_name, points, tournaments_played, player_profile_link = _parse_ranking_row(row)

                print(f"Processing row {index}: Rank {rank}, Player {player_name}, Points {points}, Tournaments Played {tournaments_played}, Profile Link {player_profile_link}")  # Print the current row being processed

                # Only add row if player_profile_link and player_name are not "N/A"
                if player_profile_link != "N/A" and player_name != "N/A":
                    all_data.append([rank, player_name, points, tournaments_played, player_profile_link])
            except Exception as e:
                # One malformed row must not abort the whole scrape.
                print(f"Error processing row {index}: {e}")
    except Exception as e:
        print(f"Error loading page: {e}")
    finally:
        driver.quit()

    return all_data

def save_urls_to_excel(data):
    """Write the scraped profile links and basic data to a timestamped workbook.

    Args:
        data: Rows of ``[rank, player, points, tournaments played,
            profile link]``.

    The workbook is written to ``output_folder`` as
    ``atp_doubles_urls_<YYYYmmdd_HHMMSS>.xlsx``.
    """
    df = pd.DataFrame(data, columns=["Rank", "Player", "Points", "Tournaments Played", "Player Profile Link"])
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Create the target folder if needed so a finished scrape is not lost
    # to a missing directory.
    os.makedirs(output_folder, exist_ok=True)
    output_excel_filename = os.path.join(output_folder, f"atp_doubles_urls_{timestamp}.xlsx")
    df.to_excel(output_excel_filename, index=False)
    print(f"URLs and basic data saved to {output_excel_filename}")

def main():
    """Scrape the profile links and save them, reporting when there are none."""
    scraped = scrape_urls(url)
    if not scraped:
        print("No data to save.")
        return
    save_urls_to_excel(scraped)


if __name__ == "__main__":
    main()


Processing row 1: Rank 1T, Player M. Granollers, Points 8,130, Tournaments Played N/A, Profile Link /en/players/marcel-granollers/g710/overview
Processing row 2: Rank 1T, Player H. Zeballos, Points 8,130, Tournaments Played N/A, Profile Link /en/players/horacio-zeballos/z184/overview
Processing row 3: Rank 3, Player M. Ebden, Points 7,390, Tournaments Played N/A, Profile Link /en/players/matthew-ebden/e690/overview
Processing row 4: Rank 4, Player R. Bopanna, Points 7,210, Tournaments Played N/A, Profile Link /en/players/rohan-bopanna/b757/overview
Processing row 5: Rank 5, Player J. Salisbury, Points 6,630, Tournaments Played N/A, Profile Link /en/players/joe-salisbury/so70/overview
Processing row 6: Rank 6, Player R. Ram, Points 6,585, Tournaments Played N/A, Profile Link /en/players/rajeev-ram/r548/overview
Processing row 7: Rank 7, Player I. Dodig, Points 6,230, Tournaments Played N/A, Profile Link /en/players/ivan-dodig/d646/overview
Processing row 8: Rank 8, Player A. Krajicek, P

In [None]:
import os
import time
import pandas as pd
from getpass import getuser
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from datetime import datetime

# Login name of the current user; interpolated into the per-user paths below.
user = getuser()

# Path to the ChromeDriver
chrome_driver_path = f"C:/Users/{user}/Downloads/chromedriver.exe"
# Folder holding the URL workbook to read and receiving the details workbook.
output_folder = f"C:/Users/{user}/Documents/GitHub/tennis-homophily/data/atp"


def find_latest_urls_workbook(folder):
    """Return the path of the newest ``atp_doubles_urls_*.xlsx`` in *folder*.

    The file names embed a ``YYYYmmdd_HHMMSS`` timestamp, so the
    lexicographically greatest name is also the most recent one.

    Returns:
        The joined path, or ``None`` when the folder is missing or holds no
        matching workbook.
    """
    try:
        names = os.listdir(folder)
    except OSError:
        return None
    workbooks = [
        name
        for name in names
        if name.startswith("atp_doubles_urls_") and name.endswith(".xlsx")
    ]
    if not workbooks:
        return None
    return os.path.join(folder, max(workbooks))


# A path stamped with the current time can never name a workbook written
# earlier, so use the most recent existing URL workbook. The original
# placeholder is kept as a fallback so the name is always a string path.
urls_excel_path = find_latest_urls_workbook(output_folder) or (
    f"{output_folder}/atp_doubles_urls_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
)

def configure_driver():
    """Start a Chrome session driven by the local ChromeDriver binary.

    ``--verbose`` and ``--log-path`` are ChromeDriver command-line flags, not
    Chrome browser switches, so they are passed to the driver ``Service``
    (where they take effect) rather than to the browser ``Options``.

    Returns:
        A ``webdriver.Chrome`` instance. The caller must call ``quit()`` on it.
    """
    service = Service(
        chrome_driver_path,
        service_args=["--verbose", "--log-path=chromedriver.log"],
    )
    return webdriver.Chrome(service=service, options=Options())

def scrape_player_details(driver, player_profile_link):
    """Load a player's profile page and read its "personal details" panel.

    Args:
        driver: Selenium WebDriver used to fetch the page.
        player_profile_link: Site-relative profile path, appended to
            ``https://www.atptour.com``.

    Returns:
        A dict mapping each detail label to its text value. Empty when the
        ``personal_details`` / ``pd_content`` containers are not on the page.
    """
    base_url = "https://www.atptour.com"
    driver.get(base_url + player_profile_link)
    time.sleep(5)  # Allow time for the page to load completely

    soup = BeautifulSoup(driver.page_source, "html.parser")

    personal_details_div = soup.find("div", class_="personal_details")
    if not personal_details_div:
        return {}

    pd_content = personal_details_div.find("div", class_="pd_content")
    if not pd_content:
        return {}

    details = {}

    # Left column: each item is a <span>label</span><span>value</span> pair.
    pd_left = pd_content.find("ul", class_="pd_left")
    if pd_left:
        for li in pd_left.find_all("li"):
            span_tags = li.find_all("span")
            if len(span_tags) == 2 and "Follow player" not in span_tags[0].text:
                details[span_tags[0].text.strip()] = span_tags[1].text.strip()

    # Right column: each item reads "label: value" as plain text.
    pd_right = pd_content.find("ul", class_="pd_right")
    if pd_right:
        for li in pd_right.find_all("li"):
            if not li.find_all("span"):
                continue
            # partition() splits on the first colon only, so a value that
            # itself contains ':' is kept whole, and an item without any
            # colon is skipped instead of raising IndexError (which used to
            # discard the entire player in the caller).
            label, colon, value = li.text.partition(":")
            if colon:
                details[label.strip()] = value.strip()

    return details

def scrape_data_from_urls(file_path):
    """Visit every profile listed in the workbook at *file_path*.

    Args:
        file_path: Excel file with the columns "Rank", "Player", "Points",
            "Tournaments Played" and "Player Profile Link".

    Returns:
        One list per successfully processed player: the five workbook values
        followed by the dict returned by ``scrape_player_details``.
    """
    players = pd.read_excel(file_path)
    collected = []
    browser = configure_driver()
    wanted_columns = ("Rank", "Player", "Points", "Tournaments Played", "Player Profile Link")

    try:
        for row_label, record in players.iterrows():
            try:
                summary = [record[column] for column in wanted_columns]
                rank, player_name = summary[0], summary[1]
                profile_link = summary[4]

                details = scrape_player_details(browser, profile_link)

                # Print the current player being processed
                print(f"Processing player {row_label + 1}: Rank {rank}, Player {player_name}, Profile Link {profile_link}, Personal Details {details}")

                collected.append(summary + [details])
            except Exception as e:
                # A failure on one player is reported and the loop moves on.
                print(f"Error processing player {row_label + 1}: {e}")
    except Exception as e:
        print(f"Error loading URLs: {e}")
    finally:
        browser.quit()

    return collected

def save_detailed_info_to_excel(data):
    """Flatten the per-player detail dicts and write everything to Excel.

    Args:
        data: Rows of ``[rank, player, points, tournaments played,
            profile link, personal_details]`` where ``personal_details`` is a
            dict (possibly empty, possibly with different keys per player).

    The workbook is written to ``output_folder`` as
    ``atp_doubles_details_<YYYYmmdd_HHMMSS>.xlsx``. Each detail key becomes
    one column; players lacking a key get an empty cell there.
    """
    base_columns = ["Rank", "Player", "Points", "Tournaments Played", "Player Profile Link"]

    # Union of all detail keys in first-seen order. dict.fromkeys gives a
    # stable column order, unlike a set.
    detail_columns = list(dict.fromkeys(key for row in data for key in row[-1]))

    # Look every value up by key so it lands under its own header. Appending
    # dict values positionally misaligned them against the header list and
    # broke the DataFrame constructor when players had different key sets.
    flat_data = [
        list(row[:-1]) + [row[-1].get(key) for key in detail_columns]
        for row in data
    ]

    df = pd.DataFrame(flat_data, columns=base_columns + detail_columns)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Create the target folder if needed so a finished scrape is not lost
    # to a missing directory.
    os.makedirs(output_folder, exist_ok=True)
    output_excel_filename = os.path.join(output_folder, f"atp_doubles_details_{timestamp}.xlsx")
    df.to_excel(output_excel_filename, index=False)
    print(f"Detailed player information saved to {output_excel_filename}")

def main():
    """Scrape details for every listed player and save them, if any."""
    detailed_rows = scrape_data_from_urls(urls_excel_path)
    if not detailed_rows:
        print("No data to save.")
        return
    save_detailed_info_to_excel(detailed_rows)


if __name__ == "__main__":
    main()
