<a href="https://colab.research.google.com/github/BgTrProject/kivy/blob/main/eksi_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip3 install beautifulsoup4
# !pip3 install selenium
# !pip3 install pymongo==3.11.2

In [None]:
import time
import json
from collections import defaultdict

from selenium import webdriver
from bs4 import BeautifulSoup
from pymongo import MongoClient

In [None]:
import os
import environ

# Load environment variables from a dotenv file. The file path is itself taken
# from the ENV_PATH variable, falling back to '.env' when ENV_PATH is not set.
# NOTE(review): `environ` is presumably the django-environ package — confirm.
env = environ.Env()
env.read_env(env_file=env.str('ENV_PATH', default='.env'))

In [None]:
# MongoDB credentials come from the process environment; each value is None
# when the corresponding variable is not set.
mongo_cli_username = os.getenv('MONGO_CLI_USERNAME')
mongo_cli_password = os.getenv('MONGO_CLI_PASSWORD')

In [None]:
# Site the browser opens first; keyword searches are submitted from this page.
url = 'https://eksisozluk.com/'

# Text file holding the keywords to search for, one per line.
input_location = "data/input/keywords.txt"

# Destination of the JSON dump of the scraped entries.
output_location = "data/output/eksi.json"

In [None]:
# Percent-escape the credentials before embedding them in the connection URI:
# reserved characters such as '@', ':', '/' or '%' in a username or password
# would otherwise corrupt the URI and make it unparseable or point elsewhere.
from urllib.parse import quote_plus

mongo_uri = (
    "mongodb+srv://{}:{}@cluster0.plop5.mongodb.net/myFirstDatabase"
    "?retryWrites=true&w=majority"
).format(
    # str() keeps the previous formatting of unset (None) credentials.
    quote_plus(str(mongo_cli_username)),
    quote_plus(str(mongo_cli_password)),
)
client = MongoClient(mongo_uri)

# All collections used by this notebook live in the 'healdash' database.
db = client['healdash']

In [None]:
# Keywords list: one keyword per line of the input file.
# The file is read as UTF-8 explicitly because the keywords contain Turkish
# characters and the platform default encoding is not UTF-8 everywhere.
# Surrounding whitespace (including a '\r' left by Windows line endings) is
# stripped, and blank lines are skipped so no empty search is submitted.
with open(input_location, encoding='utf-8') as my_file:
    keywords = [line.strip() for line in my_file if line.strip()]

In [None]:
# Display the loaded keyword list as this notebook cell's output.
keywords

['hazımsızlık', 'gaz']

In [None]:
# class structure
class Eksi:
    """Scrape entries for a list of keywords from a site driven through Chrome.

    The browser is controlled with Selenium; each loaded page is parsed with
    BeautifulSoup. Scraped entries are accumulated per keyword in
    ``self.keyword_dict`` and, for every keyword that has results, upserted
    into the ``eksi_entries`` collection of the module-level ``db``.

    Attributes:
        url: Start page opened when the object is created.
        driver: The Selenium Chrome WebDriver instance.
        keyword_dict: ``defaultdict(list)`` mapping keyword -> list of
            ``{"date", "author", "entry"}`` dicts.
        max_pages: Page count of the most recently parsed page (1 when the
            page has no pager).
        keyword_scape_time: Rough scraping-time estimate in seconds
            (``max_pages * 0.5``). The attribute name (sic) is kept unchanged
            for compatibility.
        page_source: ``BeautifulSoup`` tree of the most recently parsed page,
            or ``None`` before the first call to ``compile_page_source``.
    """

    def __init__(self, url: str) -> None:
        """Start a headless Chrome session and open ``url``."""
        self.url = url

        # Defined up front so the other methods can be called in any order.
        self.keyword_dict = defaultdict(list)
        self.max_pages = 1
        self.keyword_scape_time = self.max_pages * 0.5
        self.page_source = None

        # init the browser
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')

        # The options must be handed to the driver, otherwise they are ignored.
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(self.url)  # go to the url
        except Exception:
            # Do not leave an orphaned browser process behind.
            driver.quit()
            raise
        self.driver = driver

    def close(self) -> None:
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def search_keyword(self, keyword: str) -> None:
        """Type ``keyword`` into the search box and submit the search form."""
        # The generic find_element("id", ...) call is used because the
        # find_element_by_id helper no longer exists in current Selenium 4.
        search_input = self.driver.find_element("id", "search-textbox")
        search_input.clear()
        search_input.send_keys(keyword)
        search_input.submit()
        time.sleep(0.5)  # small delay before getting the page source

    def compile_page_source(self) -> "Eksi":
        """Parse the current browser page and refresh the paging attributes.

        Sets ``page_source``, ``max_pages`` and ``keyword_scape_time``.

        Returns:
            ``self``, so that calls can be chained.
        """
        # An explicit parser keeps parsing independent of which optional
        # parser libraries happen to be installed.
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        try:
            self.max_pages = int(soup.find('div', {"class": "pager"})['data-pagecount'])
        except (TypeError, KeyError, ValueError):
            # No pager element, no page-count attribute, or a non-numeric
            # value: treat the result as a single page.
            self.max_pages = 1
        self.keyword_scape_time = self.max_pages * 0.5
        self.page_source = soup
        return self

    def next_page(self, page_number: int) -> None:
        """Load page ``page_number`` of the current topic.

        Any existing query string of the current URL is dropped and replaced
        with ``?p=<page_number>``.
        """
        base_url = self.driver.current_url.partition("?")[0]
        self.driver.get("{}?p={}".format(base_url, page_number))

    def clean_entry(self, entry: str) -> str:
        """Return ``entry`` with newlines removed and outer whitespace stripped."""
        # The former .replace("\'", "'") step was dropped: both literals are
        # the same one-character string, so it never changed anything.
        return entry.replace("\n", "").strip()

    def keyword_exists(self) -> bool:
        """Return True when the parsed page contains at least one entry author."""
        return bool(self.page_source.find_all('a', {"class": "entry-author"}))

    def scrape_data(self, keyword: str) -> None:
        """Append every entry of the parsed page to ``keyword_dict[keyword]``."""
        all_entries = self.page_source.find_all('div', {"class": "content"})  # get all entries
        all_dates = self.page_source.find_all('a', {"class": "entry-date"})  # get all dates
        all_authors = self.page_source.find_all('a', {"class": "entry-author"})  # get all authors

        # NOTE(review): the three lists are paired by position, which assumes
        # each entry has exactly one content, date and author element.
        for entry, date, author in zip(all_entries, all_dates, all_authors):
            self.keyword_dict[keyword].append({
                "date": date.text,
                "author": author.text,
                "entry": self.clean_entry(entry.text)
            })

    def scrape_all_pages(self, keyword_list: list) -> None:
        """Scrape all result pages of every keyword and store them in MongoDB.

        ``keyword_dict`` is reset first. Keywords without results are reported
        and skipped; nothing is written to the database for them.
        """
        # reset keywords dict
        self.keyword_dict = defaultdict(list)

        for keyword in keyword_list:
            self.search_keyword(keyword)
            self.compile_page_source()  # compile for the first time

            # keyword_exists must be *called*; the bound method object itself
            # is always truthy.
            if not self.keyword_exists():
                print("No results for {}".format(keyword))
                continue

            # print scraping time for the keyword
            print("{} - scraping time: {} seconds".format(keyword, self.keyword_scape_time))

            for page_number in range(1, self.max_pages + 1):
                self.next_page(page_number)
                self.compile_page_source().scrape_data(keyword)

            # add data to mongodb
            db.eksi_entries.update_many(
                {"keyword": keyword},
                {"$set": {"objects": self.keyword_dict[keyword]}},
                upsert=True,
            )

    def get_json_output(self, output_location: str) -> None:
        """Write ``keyword_dict`` to ``output_location`` as a UTF-8 JSON object.

        The data is serialised exactly once, so the file contains a JSON
        object (keyword -> list of entries) rather than a JSON string that
        itself wraps JSON text.
        """
        # errors='ignore' drops characters that cannot be encoded as UTF-8
        # (e.g. lone surrogates) instead of failing the whole dump.
        with open(output_location, 'w', encoding='utf-8', errors='ignore') as f:
            json.dump(self.keyword_dict, f, ensure_ascii=False)

In [None]:
# Initialize the scraper: constructing Eksi starts Chrome and opens `url`.
eksi = Eksi(url)

In [None]:
# Scrape every keyword; results are collected in eksi.keyword_dict and
# upserted into the eksi_entries MongoDB collection.
eksi.scrape_all_pages(keywords)

hazımsızlık - scraping time: 2.5 seconds
gaz - scraping time: 2.5 seconds


In [None]:
# Write the scraped entries to the JSON file at output_location.
eksi.get_json_output(output_location)

In [None]:
# example
# eksi.keyword_dict