In [None]:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import selenium.common.exceptions
from time import sleep
import csv

class mevzuat:
    """Scrape legislation texts from mevzuat.gov.tr into a one-column CSV file.

    Instantiating the class runs the whole job: it starts a headless Chrome
    driver, opens the search results, switches the listing to
    ``FETCH_COUNT_PER_PAGE`` rows per page, then walks pages
    ``STARTING_PAGE``..``ENDING_PAGE`` (both inclusive).  Every listed
    legislation is opened in a new tab, its text is flattened to one string
    and appended as a row to ``OUTPUT_PATH``.

    The output file is closed and the browser is quit when the run finishes
    or fails (see :meth:`close`).
    """

    ELEMENT_LOCATING_TIMEOUT = 60  # in seconds
    FETCH_COUNT_PER_PAGE = 100
    STARTING_PAGE = 1  # inclusive
    ENDING_PAGE = 20  # inclusive
    SCRAPED_URL = 'https://www.mevzuat.gov.tr/#kurumKurulusVeUniversiteYonetmelikleri'
    OUTPUT_PATH = r'C:\Users\Furkan\Desktop\study\nlp\teknofest\notebooks\mevzuat.csv'

    # Retry policy used by safe_click while something intercepts the click.
    MAX_CLICK_ATTEMPTS = 30
    CLICK_RETRY_DELAY = 2  # in seconds

    # Absolute XPath of the title cell of result row ``{row}`` (1-based).
    _ROW_TEXT_XPATH = '/html/body/div[5]/div/div/div/div/div[5]/div/div/div/div[2]/div/table/tbody/tr[{row}]/td[2]/a/div[1]'
    # Absolute XPath of the 'ARA' (search) button.
    _SEARCH_BUTTON_XPATH = '/html/body/div[5]/div/div/div/div/div[1]/div/div[12]/div/div[2]/div/div/div[2]/form/div[8]/div/button[1]'

    def __init__(self):
        """Run the complete scrape, releasing the file and browser afterwards."""
        # Pre-set so close() is safe even if start-up fails half-way.
        self.driver = None
        self._output_file = None
        try:
            self.init_driver()
            self.new_file()
            self.bring_legislations()
            self.currentPage = 1
            # Skip forward to the first page that should be scraped.
            for _ in range(1, mevzuat.STARTING_PAGE):
                self.goto_next_page()
            while self.currentPage <= mevzuat.ENDING_PAGE:
                for row in range(1, mevzuat.FETCH_COUNT_PER_PAGE + 1):
                    self.select_legislation(row)
                    self.change_frame()
                    self.write_data_to_csv()
                    # Close the detail tab and return to the listing tab.
                    self.driver.close()
                    self.driver.switch_to.window(self.driver.window_handles[0])

                self.goto_next_page()
        finally:
            self.close()

    def close(self):
        """Close the CSV file and quit the browser; safe to call repeatedly."""
        output_file = getattr(self, '_output_file', None)
        if output_file is not None:
            output_file.close()
            self._output_file = None
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def _wait_for(self, locator):
        """Return the element for ``locator`` once it is present in the DOM.

        Raises ``TimeoutException`` after ``ELEMENT_LOCATING_TIMEOUT`` seconds.
        """
        wait = WebDriverWait(self.driver, mevzuat.ELEMENT_LOCATING_TIMEOUT)
        return wait.until(EC.presence_of_element_located(locator))

    def init_driver(self):
        """Start a headless Chrome driver and load ``SCRAPED_URL``."""
        # options to open driver
        options = Options()
        options.add_argument('start-maximized')
        options.add_argument('disable-infobars')
        options.add_argument('--disable-extensions')
        options.add_argument('--headless')

        # create web driver element ('chrome_options=' is deprecated)
        self.driver = webdriver.Chrome(options=options)
        self.URL = mevzuat.SCRAPED_URL
        self.driver.get(self.URL)

    def bring_legislations(self):
        """Click the search button, wait for results and set the page size."""
        # click on 'ARA' button
        self._wait_for((By.XPATH, mevzuat._SEARCH_BUTTON_XPATH)).click()

        self.wait_until_page_fully_loaded()

        self.set_listing()

    def wait_until_page_fully_loaded(self):
        """Block until the first result row is present in the table."""
        self._wait_for((By.XPATH, mevzuat._ROW_TEXT_XPATH.format(row=1)))

    def safe_click(self, element, max_attempts=None):
        """Click ``element``, retrying while another element intercepts it.

        The click is retried every ``CLICK_RETRY_DELAY`` seconds, at most
        ``max_attempts`` times (default ``MAX_CLICK_ATTEMPTS``).  If the
        element has gone stale the call returns silently, because a stale
        reference can never become clickable again.

        Raises:
            ElementClickInterceptedException: the click was still intercepted
                on the last attempt.
        """
        if max_attempts is None:
            max_attempts = mevzuat.MAX_CLICK_ATTEMPTS

        for attempt in range(1, max_attempts + 1):
            try:
                element.click()
                return
            except selenium.common.exceptions.StaleElementReferenceException:
                # The element left the DOM; retrying the same handle is futile.
                return
            except selenium.common.exceptions.ElementClickInterceptedException:
                if attempt == max_attempts:
                    raise
                # Give the loading overlay time to disappear.
                sleep(mevzuat.CLICK_RETRY_DELAY)

    def set_listing(self):
        """Switch the results table to ``FETCH_COUNT_PER_PAGE`` rows per page."""
        listing_select = Select(self._wait_for(
            (By.XPATH, '//*[@id="DataTables_Table_0_length"]/label/select')))
        listing_select.select_by_value(str(mevzuat.FETCH_COUNT_PER_PAGE))

        last_row_xpath = mevzuat._ROW_TEXT_XPATH.format(row=mevzuat.FETCH_COUNT_PER_PAGE)
        try:
            self._wait_for((By.XPATH, last_row_xpath))
        except selenium.common.exceptions.TimeoutException:
            # Best effort: the last row never showed up within the timeout;
            # carry on with whatever rows are listed.
            pass

    def goto_next_page(self):
        """Advance the listing by one page and increment ``currentPage``.

        On ``ENDING_PAGE`` nothing is clicked; only the counter is advanced so
        the scraping loop terminates.
        """
        if self.currentPage == mevzuat.ENDING_PAGE:
            self.currentPage += 1
            return

        first_row_locator = (By.XPATH, mevzuat._ROW_TEXT_XPATH.format(row=1))
        # Remember the first row's content to detect when the table changes.
        first_text = self._wait_for(first_row_locator).get_attribute("innerHTML")

        next_page_button = self._wait_for(
            (By.CSS_SELECTOR, '#DataTables_Table_0_paginate > ul > li.paginate_button.page-item.active + li > a'))
        self.safe_click(next_page_button)

        # Wait until the first row no longer shows the previous page's text.
        WebDriverWait(self.driver, mevzuat.ELEMENT_LOCATING_TIMEOUT).until_not(
            EC.text_to_be_present_in_element_attribute(first_row_locator, "innerHTML", first_text))

        self.wait_until_page_fully_loaded()

        self.currentPage += 1

    def select_legislation(self, i):
        """Open the legislation in result row ``i`` (1-based) in a new tab
        and switch the driver to that tab."""
        content_xpath = f'//*[@id="DataTables_Table_0"]/tbody/tr[{i}]/td[2]/a'
        link = self._wait_for((By.XPATH, content_xpath))
        href = link.get_attribute('href')
        self.driver.execute_script("window.open(arguments[0]);", href)
        self.driver.switch_to.window(self.driver.window_handles[1])

    def change_frame(self):
        """Switch the driver into the first iframe of the current tab."""
        iframe = self._wait_for((By.TAG_NAME, 'iframe'))
        self.driver.switch_to.frame(iframe)

    def new_file(self):
        """Create (truncate) ``OUTPUT_PATH`` and write the CSV header row."""
        # newline='' is required by the csv module; without it every row is
        # followed by a blank line on Windows.
        self._output_file = open(mevzuat.OUTPUT_PATH, 'w', encoding='utf-8', newline='')

        header = ["data_text"]
        self.writer = csv.writer(self._output_file)
        self.writer.writerow(header)

    @staticmethod
    def _build_text(segments):
        """Join raw text segments into a single line of sentences.

        Newlines inside a segment become spaces; empty and digit-only
        segments are dropped; a '.' is appended to segments that do not end
        with one; every kept segment is followed by a single space.
        """
        sentences = []
        for raw in segments:
            segment = raw.replace('\n', ' ').strip()
            if not segment or segment.isdigit():
                continue
            if not segment.endswith('.'):
                segment += '.'
            sentences.append(segment + ' ')
        return ''.join(sentences)

    def write_data_to_csv(self):
        """Write the text of the currently focused document as one CSV row."""
        # find_elements_by_xpath is deprecated; use the By-based locator API.
        elements = self.driver.find_elements(By.XPATH, '/html/body/div/child::*')
        self.writer.writerow([self._build_text(element.text for element in elements)])
            
# Instantiating the class runs the whole scrape: all work happens in __init__.
mevzuat()

  self.driver = webdriver.Chrome(chrome_options=options)
  textSegmentElements = self.driver.find_elements_by_xpath('/html/body/div/child::*')
