In [3]:
import itertools
import math
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import random
import re
import time
import warnings
from functools import partial

import pandas as pd
import pydub
import requests
import speech_recognition as sr
from bs4 import BeautifulSoup
from p_tqdm import p_map
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire import webdriver as wire_webdriver
from tqdm import tqdm, tqdm_notebook
from twocaptcha import TwoCaptcha

from src.parsing_fun import ProxyRotator, proxy, random_sleep, captcha_bypass, \
    write_audio, audio_to_text, viewbull_field_container, process_page

warnings.filterwarnings("ignore")

## Parsing

For this pet project, the site farpost.ru was chosen: it hosts ads for apartments for sale in the Primorsky and Khabarovsk regions. The first stage is to collect a dataset of ads from the site. As an example, the largest cities were chosen: Amursk, Vladivostok, Nakhodka, Ussuriysk, Khabarovsk and Komsomolsk-na-Amure.
            
1) Parsing consists of two stages: the first collects the list of links to the ads, and the second parses the ads themselves.
2) During parsing, bypasses for the Google captcha (reCAPTCHA) and for the text captcha are implemented.
3) Parsing was performed using paid proxies (proxy6.net).

The output is a dataset for further processing during exploratory data analysis (EDA).

In [7]:
# Name of the audio file used for captcha recognition.
# NOTE(review): no visible cell reads this name; presumably the audio helpers
# imported from src.parsing_fun rely on the same file name - confirm.
name_audio_file = "audio_file.mp3"

In [4]:
# Proxies for parsing.
# Credentials must never be stored in the notebook, so the proxy URLs are read
# from the PROXY_URLS environment variable: a comma-separated list such as
#   socks5://user:password@host1:8000,socks5://user:password@host2:8000
proxy_urls = [
    proxy_url.strip()
    for proxy_url in os.environ.get("PROXY_URLS", "").split(",")
    if proxy_url.strip()
]

if not proxy_urls:
    # Fail early with an actionable message instead of rotating over nothing
    raise RuntimeError(
        "No proxies configured: set the PROXY_URLS environment variable to a "
        "comma-separated list of proxy URLs before running this cell."
    )

# Every proxy serves both plain HTTP and HTTPS traffic
proxies = [{"http": proxy_url, "https": proxy_url} for proxy_url in proxy_urls]

proxy_rotator = ProxyRotator(proxies)

In [None]:
# First stage of parsing: collect the links to the ads.
# For every city / apartment type combination the listing is opened, the
# number of result pages is derived from the ad counter, and the ad links of
# every page are appended to `results`.

# Number of ads assumed per listing page (used to derive the page count)
LINKS_PER_PAGE = 50
# Ad count used when the counter cannot be read from the page
DEFAULT_LINK_COUNT = 200

# Browser options
chrome_options = Options()

# chrome_options.add_argument("--headless")
# Uncomment the line above to run in the background without a GUI

# Hide automation
chrome_options.add_argument("--disable-blink-features=AutomationControlled")

# Do not load images
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)

# Specify the path to ChromeDriver if it is not in the PATH
service = Service(executable_path="")

# Collected ad links
results = []
# Not used by this cell; kept so that any code relying on the name still runs
href_lists = []

cities_lst = [
    "amursk",
    "vladivostok",
    "nakhodka",
    "ussuriisk",
    "khabarovsk",
    "komsomolsk-na-amure",
]
apartments = ["share", "room", "gostinka",
              "studio", "1", "2", "3", "4", "5", "6"]

# Create an instance of the Chrome browser
driver = webdriver.Chrome(service=service, options=chrome_options)

try:
    for city in cities_lst:
        for apartment in apartments:
            # Load the first listing page to find out how many ads there are
            url = f"https://www.farpost.ru/{city}/realty/sell_flats/?flatType%5B%5D={apartment}"
            driver.get(url)
            # Random delay first, then read the DOM, so that anything rendered
            # during the pause is part of the parsed source
            random_sleep(2, 5)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Find the counter element by id and read its data-count value
            element = soup.find("span", id="itemsCount_placeholder")
            try:
                number_links = (
                    int(element["data-count"])
                    if element is not None
                    else DEFAULT_LINK_COUNT
                )
            except (KeyError, TypeError, ValueError):
                # Counter attribute missing or not a number
                number_links = DEFAULT_LINK_COUNT

            # Count the number of pages
            number_pages = math.ceil(number_links / LINKS_PER_PAGE)

            # Walk through the pages and collect the links
            for page in tqdm(range(1, number_pages + 1)):
                url = f"https://www.farpost.ru/{city}/realty/sell_flats/?flatType%5B%5D={apartment}&page={page}"
                driver.get(url)
                # Random delay after page load
                random_sleep(1, 3)

                # Scroll to the bottom of the page
                driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)

                # Parse the page as it looks after scrolling
                soup = BeautifulSoup(driver.page_source, "html.parser")

                # Get all <a> tags carrying the ad-link CSS classes
                bulletin_links = soup.find_all(
                    "a", class_="bulletinLink bull-item__self-link auto-shy"
                )

                # Store the href of every link found
                results.extend(link.get("href") for link in bulletin_links)
finally:
    # Close the browser even when a request fails half-way through
    driver.quit()

In [None]:
# Network failures interrupted the link collection several times, so the links
# ended up in five separate files; this cell writes the fifth one.
# Wrap the collected links in a dataframe ...
df_results = pd.DataFrame(data=results)

# ... and save it to a file
df_results.to_csv(path_or_buf="results5.csv")

In [None]:
# Load the five partial link files and stack them into a single dataframe
link_files = (
    "results1.csv",
    "results2.csv",
    "results3.csv",
    "results4.csv",
    "results5.csv",
)
links_1, links_2, links_3, links_4, links_5 = map(pd.read_csv, link_files)

link_frames = [links_1, links_2, links_3, links_4, links_5]
data_df_combined = pd.concat(link_frames, ignore_index=True)

In [None]:
# Delete duplicate links.
# Compare on the link column ("0") only: the frame may still carry the
# per-file index column written by to_csv, and comparing whole rows would keep
# the same link whenever it was saved at different positions.
data_df_combined.drop_duplicates(subset="0", inplace=True, ignore_index=True)

In [None]:
# Delete the index column that read_csv picked up from the saved files.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
data_df_combined.drop(columns="Unnamed: 0", inplace=True, errors="ignore")

In [None]:
# Remove the rows where the link was parsed incorrectly
# (the href is a JavaScript placeholder instead of an ad address)
is_placeholder_link = data_df_combined["0"] == "javascript:void(0)"
rows_to_remove = data_df_combined.index[is_placeholder_link]
data_df_combined.drop(rows_to_remove, inplace=True)

In [19]:
# Persist the cleaned list of links, keeping the dataframe index as a column
data_df_combined.to_csv(path_or_buf="links.csv", index=True)

In [20]:
# Second stage of parsing: the ads themselves are parsed using the collected
# links. This cell only prepares the browser configuration.
chrome_options = Options()

# "--headless" can be added to the tuple below to run without a GUI
for chrome_flag in (
    "--disable-blink-features=AutomationControlled",  # hide automation
    "--disable-javascript",  # disable JavaScript loading
):
    chrome_options.add_argument(chrome_flag)

# Specify the path to ChromeDriver if it is not in the PATH
service = Service(executable_path="")

In [None]:
# Second stage of parsing: open every collected link and parse the ad.
# When a captcha page is served instead of the ad, the captcha is dealt with
# first and the ad is requested again before parsing.

# Position of the first link to process; a non-zero value resumes an
# interrupted run. Set it to 0 to parse all links from the beginning.
START_INDEX = 17640

number_links = len(data_df_combined)

driver = webdriver.Chrome(service=service, options=chrome_options)

try:
    for i in tqdm(range(START_INDEX, number_links)):
        # The link is stored in the first column of the dataframe
        url = str(data_df_combined.iloc[i, 0])

        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Captcha markers: the "Вы не робот?" heading and, for the simple
        # (text) captcha, the image that has to be recognised
        captcha_element = soup.find("h2", string="Вы не робот?")
        img_tag = soup.find("img", {"alt": "Изображение для проверки"})

        if captcha_element is not None:
            if img_tag is not None:
                # Condition 1: simple captcha.
                # Close the current driver and switch to another proxy.
                driver.quit()
                driver = proxy()
            else:
                # Condition 2: Google captcha
                print(f"Need google captcha processing, ad №{i}")
                # Call the function to process the Google captcha
                driver = captcha_bypass(driver)

            # Reload the page now that the captcha has been dealt with
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, "html.parser")

        # Parse the ad page
        process_page(soup, i, data_df_combined)
finally:
    # Close the browser even if parsing stops with an error
    driver.quit()

In [120]:
# Save the resulting dataset to a file.
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs("data", exist_ok=True)
data_df_combined.to_csv("data/data.csv", index=True)