In [2]:
from selenium import webdriver
from selenium.webdriver import Chrome 
from webdriver_manager.chrome import ChromeDriverManager 
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC 
from selenium.webdriver.common.keys import Keys
import time 
from datetime import datetime, timedelta
import os 
import uuid 
import json 


class News_Scraper:

    def __init__(self, url) -> None:
        # url to be parameter entered in __name__ == "__main__"
        self.url = url 
        
        #Creates an empty list for links to each article (to be collected from main news page).
        self.href_list = []

        #Creates a list to store the dictionaries that contain the information about each article.  
        self.article_data = []

        # Using ChromeDriver / Selenium to open webpage 
        self.driver = Chrome(ChromeDriverManager().install())
        self.driver.get(url)

        # Accepts cookies if they are there. If not pass.
        try:
            click_accept_cookies = self.driver.find_element(By.ID, "cookie-accept-link-text")
            click_accept_cookies.click()

            # Once cookies are accepted. Scroll to the bottom of the initial page.
            try:
                scroll_to_bottom = self.driver.find_element(By.TAG_NAME, 'body') 
                scroll_to_bottom.send_keys(Keys.END)
            except:
                print("Didn't scroll to Medical Links")
                pass 

        except:
            print("Couldn't accept cookies, or there were none to accept")
            pass 
        return

    
    def news_button(self):
        """
        Method to go to the 'news' page where the articles are listed
        """
        # Below no longer working (had been working before 14/05/22 - WHY???)
        # try:
        #     time.sleep(2)
        #     click_news_button = self.driver.find_element(By.LINK_TEXT, "News")
        #     click_news_button.click()
        #     time.sleep(2)
        # except:
        #     print("Didn't click 'News' button")
        #     pass   
        
        try:
            # news_link_container = self.driver.find_element(By.XPATH, '//div[@class="col-xs-6 col-sm-3 footer-menu life-sciences-useful-links"]')
            # news_link = self.driver.find_element(By.XPATH, f"//a[contains (@href, '/life-sciences/news')]")
            # news_link.click 
            self.driver.get("https://www.news-medical.net/medical/news")
        except:
            print("Didn't click on news link")
            pass


    def link_list(self):
        """
        Return a list (href_list) storing the link for each article on the page.
        As each article doesn't have a clear unique identifier (i.e. product number). The URL will be the unique identifier 
        When each linked it opened, extract data (e.g. title, author, date, source, etc) & store this in a dictionary 
        """

        container = self.driver.find_element(By.XPATH, '//div[@class="posts publishables-list-wrap first-item-larger"]')        
        self.elements = container.find_elements(By.XPATH, ".//div[@class='col-xs-9']/h3/a")
        for element in self.elements:  
            self.href = element.get_attribute('href') # this is returnning as the same as url for the news page. NOT the individual hrefs for each article as aimed. WHY????
            self.href_list.append(self.href) # This returns a list of all the urls for each article on the news page. 
        
        print(f"self.href is {self.href} & is type {type(self.href)}") 
        print(f"first eleement of self.href_list is {self.href_list[0]} & is type {type(self.href_list)} & is length {len(self.href_list)}")

        self.href_list_shorten = self.href_list[:10]

        for i in range(len(self.href_list_shorten)):
            item = str(self.href_list_shorten[i])

            self.article_dict= {
                "url_link": [],
                "title": [],
                "author": [],
                "date": [],
                "source": [],
                "uuid_ID": []
            }

            if (item != str(self.href_list_shorten[0])) and (item != str(self.href_list_shorten[i-1])):

                self.driver.get(item)
                
                self.url_link = item
                self.article_dict["url_link"].append(self.url_link)

                self.title = self.driver.title
                self.article_dict["title"].append(self.title)
                
                self.author_container = self.driver.find_element(By.CLASS_NAME, "article-meta-contents")
                self.author_link = self.author_container.find_element(By.XPATH, ".//a[@href]")
                self.author = self.author_link.text
                self.article_dict["author"].append(self.author)
                
                self.date = self.driver.find_element(By.CLASS_NAME, "article-meta-date").text
                self.article_dict["date"].append(self.date)

                self.ID_for_each_article()

                self.article_data.append(self.article_dict)
            else:
                pass
        print(f"Article Data is a {type(self.article_data)} of {len(self.article_data)} dictionaries, which is: {self.article_data}")
        print(" ")
        print(f"List 1 is: {self.article_data[0]}")
        print(" ")
        print(f"List 2 is: {self.article_data[1]}")
    

    def create_folder_for_each_article(self):
        """ 
        Create an empty folder for each article in the list.
        Will add json file into each to store the data and a image. 
        Later will change to place all these folders within "raw_data"
        """"
        x = 1 
        for item in self.article_data:
            folder_name = f"article {x}"
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)
            x += 1 

    def ID_for_each_article(self):
        """ 
        Create a list of uuid for each item in a list (i.e. for each url in the link_list, create a associated uuid
        Using version 4 
        """
        # self.article_data["uuid"]
        # self.uuid_list = []
        link_ID = uuid.uuid4()
        self.article_dict["uuid_ID"].append(link_ID)
        # print(f"uuid_list is: {self.uuid_list}")
        # return self.uuid_list

    def image(self):
        """
        Extract an image from each article 
        """
        content_container = self.driver.find_element(By.XPATH, "//div[@class = 'content']")
        image_container = content_container.find_element(By.XPATH, "//figure[@class = 'contentImage']/span/img")
        image = image_container.get_attribute("src")


    def create_raw_data_folder(self):
        """
        Creates a folder called raw data, this will store a folder for each artile with all it's data. 
        Before a fodler is created, it will check if one has already been made. 
        Need to define where this will be stored 
        Uses os method 
        """

        if not os.path.exists('raw_data'):
            os.makedirs('raw_data')
    
        
if __name__ == "__main__":
    news_scraper = News_Scraper('https://www.news-medical.net/')
    # news_scraper.scroll_down()
    news_scraper.news_button()
    news_scraper.create_raw_data_folder()
    news_scraper.link_list()
    news_scraper.create_folder_for_each_article()
    # for i in range(len(news_scraper.article_data)):
    
    



Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/frankiejames/.wdm/drivers/chromedriver/mac64_m1/102.0.5005.61/chromedriver] found in cache
  self.driver = Chrome(ChromeDriverManager().install())


self.href is https://www.news-medical.net/news/20220622/Even-asymptomatic-maternal-COVID-19-can-potentially-cause-pregnancy-complications.aspx & is type <class 'str'>
first eleement of self.href_list is https://www.news-medical.net/news/20220622/Olfactory-sensor-can-authenticate-individuals-by-analyzing-their-breath.aspx & is type <class 'list'> & is length 19
Article Data is a <class 'list'> of 9 dictionaries, which is: [{'url_link': ['https://www.news-medical.net/news/20220622/New-Brand-Identity-from-KA-Imaging-for-Patented-Dual-Energy-Technology.aspx'], 'title': ['New Brand Identity from KA Imaging for Patented Dual Energy Technology'], 'author': ['KA Imaging'], 'date': ['Jun 22 2022'], 'source': [], 'uuid_ID': [UUID('69c006c4-af7b-47f9-8de1-241c6a7245ac')]}, {'url_link': ['https://www.news-medical.net/news/20220622/Dissociation-may-indicate-a-high-risk-of-worse-mental-health-outcomes-after-trauma.aspx'], 'title': ['Dissociation may indicate a high risk of worse mental health outcom