In [1]:
import os
from random import choice
from selenium import webdriver
import pandas as pd
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import geckodriver_autoinstaller
import json
from tqdm import tqdm
from time import sleep
from random import uniform
import re

# Runs once at import time, before any Browser is created.
# NOTE(review): presumably this makes a geckodriver binary available to
# Selenium (downloading one if needed) -- confirm against the
# geckodriver_autoinstaller documentation.
geckodriver_autoinstaller.install()

class Browser():
    """Small wrapper around a Selenium Firefox session used for crawling.

    Attributes:
        options: The Firefox ``Options`` instance the driver was started with.
        driver: The running ``webdriver.Firefox`` instance.
    """

    def __init__(self, firefox_binary='/home/gabriel/Downloads/firefox/firefox') -> None:
        """
        Start a new browser and get it ready to crawl.

        Args:
            firefox_binary: Path of the Firefox executable to launch. The
                default is the path this notebook was originally written
                against; pass ``None`` (or an empty string) to leave the
                binary location unset on the options object.
        """

        self.options = Options()
        self.options.add_argument("--start-maximized")
        self.options.add_argument("--private")
        self.options.add_argument('--no-sandbox')
        self.options.add_argument("--ssl-protocol=any")
        self.options.add_argument("--ignore-certificate-errors")

        # Set user-agent
        # with open(os.path.join('src', 'user_agents.txt')) as file:
        #     user_agents = [ua.strip() for ua in file.readlines()]
        #     user_agent = choice(user_agents)
        #     # print(user_agent)
        # self.options.add_argument('user-agent={0}'.format(user_agent))

        # self.options.headless = True

        # The binary is configured on the options object: the
        # ``firefox_binary=`` keyword of ``webdriver.Firefox`` is deprecated
        # and rejected by newer Selenium releases.
        if firefox_binary:
            self.options.binary_location = firefox_binary

        # Create a webdriver instance
        self.driver = webdriver.Firefox(options=self.options)
        self.driver.maximize_window()

    def go(self, url, end_of_page, *args, timeout=200, **kwargs):
        """Open ``url`` and block until the page's marker element exists.

        Args:
            url: Address to load in the current window.
            end_of_page: XPath of an element whose presence in the DOM is
                taken as the signal that the page has loaded.
            timeout: Maximum number of seconds to wait for that element
                (keyword-only; defaults to the previously hard-coded 200).
            *args, **kwargs: Accepted for call compatibility and ignored.

        Raises:
            selenium.common.exceptions.TimeoutException: If the marker
                element does not appear within ``timeout`` seconds.
        """
        self.driver.get(url)
        # Wait until the end of the page load
        WebDriverWait(self.driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, end_of_page))
        )


In [2]:
# A single browser session is shared by every draft year below.
browser = Browser()
# Newest draft first: 2020 down to 2014 inclusive.
for draft_year in range(2020, 2013, -1):
    # Rows scraped during this year's pass; started fresh for each year.
    results = []

    print()
    print(f'Extracting {draft_year}')

    def get_players_db():
        """Load every previously saved player record into one DataFrame.

        Walks the ``data`` directory recursively and parses each ``.json``
        file found there; files with any other extension are ignored so a
        stray non-JSON file cannot abort the run.

        Returns:
            pandas.DataFrame: One row per record. A ``url`` column is always
            present, even when nothing has been saved yet, so callers can
            test membership against ``players['url']`` on a first run.
        """
        players_list = []
        for root, _folders, files in os.walk('data'):
            for file in files:
                if not file.endswith('.json'):
                    continue
                with open(os.path.join(root, file)) as f:
                    players_list.append(json.load(f))

        players = pd.DataFrame(players_list)
        if 'url' not in players.columns:
            # An empty (or url-less) record set would otherwise make the
            # caller's players['url'] lookup raise KeyError.
            players['url'] = pd.Series(dtype=object)
        return players

    def get_links():
        """Collect the profile link of every prospect listed for ``draft_year``.

        Opens the first results page to read the pagination label, then
        visits each results page in turn and gathers the ``href`` of the
        anchor inside every prospect tile header.

        Returns:
            list: ``href`` values in page order, one per tile found.

        Raises:
            ValueError: If the pagination label does not end in a number.
        """
        listing_url = f'https://www.nfl.com/draft/tracker/prospects/all-positions/all-colleges/all-statuses/{draft_year}?page='
        # Element whose presence Browser.go() waits for on every page.
        page_ready = "//button[@id='ot-sdk-btn']"

        # Get num of results pages
        browser.go(f'{listing_url}1', page_ready)
        sleep(7)
        pagination = browser.driver.find_element(By.XPATH, '//div[contains(text(), "Page ")]').text
        # Raw string: '\D' and '\d' are invalid escapes in a normal literal.
        last_number = re.search(r'\D(\d+)$', pagination)
        if last_number is None:
            raise ValueError(f'Could not read the page count from pagination label {pagination!r}')
        num_pages = int(last_number.group(1))

        # Get the links of draft_year
        links = []
        for i in range(1, num_pages + 1):
            browser.go(f'{listing_url}{i}', page_ready)
            # Randomised pause between page loads.
            sleep(uniform(3.8, 5.9))
            names_elements = browser.driver.find_elements(By.XPATH, '//div[@data-test-id="facemask-simple-tile-headerText"]')
            links.extend(
                element.find_element(By.XPATH, './/a').get_attribute('href')
                for element in names_elements
            )
        return links

    # Records saved by earlier runs; their URLs are skipped below so an
    # interrupted crawl can be resumed without re-scraping.
    players = get_players_db()
    # Built once as a set for constant-time lookups. The guard covers a first
    # run, when no record (and therefore no 'url' column) exists yet.
    known_urls = set(players['url']) if 'url' in players.columns else set()
    links = get_links()

    # Create the output folder up front so a missing directory cannot fail
    # the run after the first player page has already been scraped.
    os.makedirs('data', exist_ok=True)

    def _text(xpath):
        """Return the visible text of the first element matching ``xpath``."""
        return browser.driver.find_element(By.XPATH, xpath).text

    for link in tqdm(links):
        # Extract player data from link and save in a json data file
        if link in known_urls:
            continue

        sleep(uniform(3.8, 5.9))
        browser.go(link, '//*[@id="ot-sdk-btn"]')
        sleep(8)

        # Required fields: a missing element raises and stops the crawl.
        name = _text('//div[@data-testid="prospectHeadshotCard"]//h2')
        college = _text('//div[@data-testid="prospectInfoCard"]//div[3]')
        hometown = _text('//*[@id="main-content"]/div/div/div/div/div[1]/div/div/div/div/div[2]/div/div/div/div/div/div[5]/div[1]/div')
        className = _text('//*[@id="main-content"]/div/div/div/div/div[1]/div/div/div/div/div[2]/div/div/div/div/div/div[5]/div[2]/div')
        grade = _text('//*[@id="main-content"]/div/div/div/div/div[1]/div/div/div/div/div[3]/div/div/div/div/div[1]/div[1]')

        # Optional fields fall back to ''. `except Exception` (not a bare
        # `except`) keeps Ctrl-C / kernel interrupts from being swallowed.
        try:
            drafted_by = browser.driver.find_element(By.XPATH, '//*[@id="main-content"]/div/div/div/div/div[1]/div/div/div/div/div[4]/div/div/div/div/div[1]/img').get_attribute('alt')
        except Exception:
            drafted_by = ''

        # Named draft_round so the builtin round() is not shadowed for the
        # rest of the notebook; the saved record key is still 'round'.
        try:
            round_elem = _text('//*[@id="main-content"]/div/div/div/div/div[1]/div/div/div/div/div[4]/div/div/div/div/div[1]/div[4]')
            draft_round = re.sub(r'Round\s(\d+).*?Pick\s+.*', r'\1', round_elem)
            pick = re.sub(r'Round\s\d+.*?Pick\s+(.*)', r'\1', round_elem)
        except Exception:
            # No round/pick element: fall back to the "Draft Projection" box.
            try:
                round_elem = _text('//div[contains(text(), "Draft Projection")]/following-sibling::div')
                draft_round = re.sub(r'Round\s+(\d+)', r'\1', round_elem)
                pick = ''
            except Exception:
                draft_round = ''
                pick = ''

        production_score = _text('//*[@id="main-content"]/div/div/div/div/div[2]/div/div/div/div/div/div/div[1]/div[2]')
        athleticism_score = _text('//*[@id="main-content"]/div/div/div/div/div[2]/div/div/div/div/div/div/div[2]/div[2]')
        total_score = _text('//*[@id="main-content"]/div/div/div/div/div[2]/div/div/div/div/div/div/div[3]/div[2]')

        try:
            bio = _text('//div[contains(text(), "Player Bio")]/parent::*/following-sibling::div')
        except Exception:
            bio = ''

        try:
            overview = browser.driver.find_element(By.XPATH, '//div[contains(text(), "Overview")]/parent::div').get_attribute('innerText')
            # Drop the section heading, line breaks and apostrophes.
            overview = re.sub(r'Overview|\n', '', overview).replace("'", '')
        except Exception:
            overview = ''

        # find_elements returns an empty list when nothing matches, so these
        # two fields become '' for pages without the section.
        strengths_list = browser.driver.find_elements(By.XPATH, '//div[contains(text(), "Strengths")]/following-sibling::ul/li')
        strengths = ' '.join([e.text for e in strengths_list])
        weaknesses_list = browser.driver.find_elements(By.XPATH, '//div[contains(text(), "Weaknesses")]/following-sibling::ul/li')
        weaknesses = ' '.join([e.text for e in weaknesses_list])

        try:
            analysis_by = _text('//*[@id="main-content"]/div/div/div/div/div[6]/div/div/div/div/div/div[1]/div/div[1]/div[1]')
        except Exception:
            try:
                analysis_by = _text('//div[contains(text(), "Analysis")]/parent::button/following-sibling::div/div/div/div/div')
            except Exception:
                analysis_by = ''

        data = {
            'url': link,
            'name': name,
            'college': college,
            'hometown': hometown,
            'class': className,
            'grade': grade,
            'drafted_by': drafted_by,
            'round': draft_round,
            'pick': pick,
            'production_score': production_score,
            'athleticism_score': athleticism_score,
            'total_score': total_score,
            'bio': bio,
            'overview': overview,
            'strengths': strengths,
            'weaknesses': weaknesses,
            'analysis_by': analysis_by,
            'year': draft_year,
        }

        results.append(data)
        # Prevents a second scrape if the same link is listed twice.
        known_urls.add(link)

        # The file name comes from the profile URL, not the player name:
        # prospects who share a name would overwrite each other's record,
        # and a '/' in a name would break the path.
        record_name = re.sub(r'[^\w.-]+', '_', link.split('://', 1)[-1]).strip('_')
        with open(os.path.join('data', f'{record_name}.json'), 'w') as file:
            json.dump(data, file)

    # NOTE(review): `results` holds only the players scraped in this run, so
    # a resumed run rewrites the year's workbook without the players skipped
    # above -- confirm that is intended.
    df = pd.DataFrame(results)
    df.to_excel(f'drafts_{draft_year}.xlsx', index=False)

# Save all json data files in an excel file
final = []
for folder, _subfolders, filenames in os.walk('data'):
    for filename in filenames:
        record_path = os.path.join(folder, filename)
        with open(record_path) as data_file:
            final.append(json.load(data_file))

# One row per JSON record; the index column is left out of the workbook.
final_df = pd.DataFrame(final)
final_df.to_excel('final.xlsx', index=False)


  self.driver = webdriver.Firefox(



Extracting 2020


100%|██████████| 506/506 [45:20<00:00,  5.38s/it]



Extracting 2019


100%|██████████| 503/503 [2:01:39<00:00, 14.51s/it]  



Extracting 2018


100%|██████████| 642/642 [2:34:08<00:00, 14.41s/it]  



Extracting 2017


100%|██████████| 424/424 [1:42:29<00:00, 14.50s/it]



Extracting 2016


100%|██████████| 407/407 [1:38:26<00:00, 14.51s/it]



Extracting 2015


100%|██████████| 408/408 [1:38:33<00:00, 14.49s/it]



Extracting 2014


100%|██████████| 462/462 [1:51:58<00:00, 14.54s/it]


In [15]:
import os
# Gather every JSON record found under data/ into a single DataFrame.
results = []
for root, folders, files in os.walk('data'):
    for file in files:
        with open(os.path.join(root, file)) as data_file:
            results.append(json.load(data_file))

df = pd.DataFrame(results)

# The workbook name is derived from the draft years present in the data.
# The previous version used `draft_year`, a variable left over from the
# scraping loop, so records from every year were written over that one
# year's workbook (and the cell failed on a kernel where the loop had not
# run).
# NOTE(review): confirm the intended file name for this export.
years = df['year'].dropna() if 'year' in df.columns else pd.Series(dtype='int64')
if years.empty:
    workbook_name = 'drafts_all.xlsx'
elif years.min() == years.max():
    workbook_name = f'drafts_{int(years.min())}.xlsx'
else:
    workbook_name = f'drafts_{int(years.min())}_{int(years.max())}.xlsx'
df.to_excel(workbook_name, index=False)

In [3]:
# quit() ends the whole WebDriver session and closes every window it opened;
# close() only closes the current window and can leave the driver process
# running.
browser.driver.quit()

In [4]:
# Rebuild the combined frame from every JSON record stored under data/.
record_paths = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk('data')
    for filename in filenames
]
all_players = []
for record_path in record_paths:
    with open(record_path) as f:
        all_players.append(json.load(f))
final_df = pd.DataFrame(all_players)

In [10]:
# Keep only the records whose 'year' field equals 2021.
df_2021 = final_df.loc[final_df['year'] == 2021]

In [11]:
# Write the 2021 subset to its own workbook, leaving out the index column.
df_2021.to_excel('drafts_2021.xlsx', index=False)

In [4]:
# Sanity check: number of loaded records per draft year.
final_df['year'].value_counts()

2019    967
2018    636
2020    500
2021    463
2014    462
2017    421
2016    407
2015    406
2022      1
Name: year, dtype: int64

In [8]:
# Load the combined workbook into a DataFrame.
# NOTE(review): this literal file name is not written anywhere in the
# notebook -- confirm which step produces 'drafts_2014_2022.xlsx'.
all_drafts = pd.read_excel('drafts_2014_2022.xlsx')

In [11]:
# Save the combined table under a new file name, leaving out the index column.
all_drafts.to_excel('all_drafts.xlsx', index=False)

In [10]:
# Trim surrounding whitespace from every URL. The vectorised .str accessor
# leaves missing values (NaN) untouched, whereas calling x.strip() on each
# cell raises AttributeError on the first NaN.
# NOTE(review): this cell sits after the one that writes all_drafts.xlsx, so
# a top-to-bottom run saves the workbook before the URLs are stripped.
all_drafts['url'] = all_drafts['url'].str.strip()