# Data identification and acquisition

## Useful utilities
Since problems will happen, we need to be able to handle them in a nice and clean way.

### Delete duplicates
This script deletes all the duplicate lines from `gigs.txt` and can be easily modified for `categories.txt`.
Notice that, unlike post-processing, running this script beforehand **can reduce the time required to scrape** the data, since duplicate links are not crawled twice.

In [None]:
# Read every line without its terminator, so that a final line lacking a
# trailing newline still compares equal to its duplicates.
with open('gigs.txt', 'r') as f:
    lines = [line.rstrip('\n') for line in f]

# A set drops the duplicates; sorting keeps the rewritten file reproducible.
lines_set = sorted(set(lines))

# Re-add the terminator to every entry so that two links can never end up
# glued together on one line after the sort.
with open('gigs.txt', 'w') as f:
    f.writelines(line + '\n' for line in lines_set)

print('Deleted ' + str(len(lines) - len(lines_set)) + ' duplicate lines.')

### Reversing the order of the lines
If two users want to scrape the same data, they can do it in parallel by having one of them process the lines of `gigs.txt` and `categories.txt` in reverse order.
This can easily be done by appending `[::-1]` to the end of every list that is being iterated over in the "Crawling the data" section.

## Importing necessary libraries

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from bs4 import BeautifulSoup
from pprint import pprint
import csv
import threading
import os
import sys
import traceback

## Initializations

### Initialize stealthy Chrome driver

In [None]:
# Chrome options: start maximized, drop the "enable-automation" switch and
# disable the automation extension.
options = Options()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

# NOTE: machine-specific path; adjust it to where chromedriver lives locally.
chrome_driver_path = r"C:\Users\Deftera\Downloads\chromedriver_win32\chromedriver.exe"

# The chromedriver arguments belong to the Service object: webdriver.Chrome
# ignores a separate `service_args` keyword when a Service is supplied, and
# newer Selenium releases no longer accept that keyword at all.
s = Service(chrome_driver_path, service_args=["--verbose"])

driver = webdriver.Chrome(service=s, options=options)

# Apply the selenium_stealth patches to the freshly created driver.
stealth(
    driver,
    languages=["en-US", "en"],
    vendor="Google Inc.",
    platform="Win32",
    webgl_vendor="Intel Inc.",
    renderer="Intel Iris OpenGL Engine",
    fix_hairline=True,
)

### Initialize the timeout_callback function
Since the data is scraped with selenium_stealth, which is quite unstable, the driver can get stuck at any point in time. To avoid this, we use a `timeout_callback` function that is called when the driver gets stuck. It uses `taskkill` to terminate the Chrome processes and then exits the current process. (**Note:** This function is necessary to continue the scraping process even if the driver gets stuck; a `.bat` file is responsible for restarting the process.)

In [None]:
def timeout_callback(category_url):
    """
    Kills every Chrome process and terminates the current Python process.
    Used both as the watchdog callback of a threading.Timer and as a final
    clean-up call at the end of the script.
    :param category_url: the url that was being crawled when the callback fired
    (currently unused; accepted so that threading.Timer can pass it in)
    :return: never returns; the process is terminated
    """
    os.system("taskkill /F /IM chrome.exe /T")

    if threading.current_thread() is threading.main_thread():
        # In the main thread SystemExit ends the interpreter normally.
        sys.exit()

    # In a Timer thread sys.exit() would only end that thread and leave the
    # stuck main thread alive, so the whole process is terminated explicitly.
    # os._exit skips the normal clean-up, hence the manual flush.
    sys.stdout.flush()
    sys.stderr.flush()
    os._exit(1)

To make sure the program is running correctly, extract the jupyter notebook as a `.py` file, and run it with the following batch script.

`gigs.txt` is a file containing the links of all the gigs to be scraped. The links are separated by a newline character.
A similar file layout and batch script should be used for the category links as well.

In [None]:
@echo off

rem Restart loop for the exported scraper script: it re-runs the Python
rem script forever and drops the first line of the links file whenever a
rem run ends without having removed that line itself.
set "gigs_file=gigs.txt"
set "python_script=get_gigs.py"
set "timeout_seconds=5"

rem NOTE(review): the loop has no exit condition; confirm how it is meant to
rem stop once the links file is empty (presumably it is interrupted manually).
:loop
rem Remember the first line of the links file before the run.
set /p first_line=<"%gigs_file%"
rem Wait timeout_seconds seconds before (re)starting the script.
timeout /t %timeout_seconds% /nobreak >nul
python "%python_script%"

rem If the first line is unchanged after the run, the script did not finish
rem that link (presumably it got stuck or crashed on it), so skip the link.
set /p new_first_line=<"%gigs_file%"
if "%new_first_line%"=="%first_line%" (
    call :removeDuplicateLine
)

goto loop

rem Despite its name, this routine removes the FIRST line of the links file:
rem the file is renamed, every line after the first one is appended to a
rem fresh file under the original name, and the renamed copy is deleted.
:removeDuplicateLine
ren "%gigs_file%" "gigs_temp.txt"
for /f "skip=1 delims=" %%a in (gigs_temp.txt) do echo %%a>>"%gigs_file%"
del "gigs_temp.txt"
exit /b

### Initialize the crawling functions

In [None]:
def crawl_category(category_url, gigs_limit=15, timeout_seconds=18):
    """
    Crawls a given category url and returns a list of the URLs of the gigs on the page
    :param category_url: the url of the category to crawl
    :param gigs_limit: the maximum number of gigs to return
    :param timeout_seconds: how long the page may take before timeout_callback is fired
    :return: a list of unique gigs URLs, in page order, with length of up to gigs_limit
    (an empty list if no gig links were found on the page)
    """

    # Set a timer to kill the process if it takes too long
    timer = threading.Timer(timeout_seconds, timeout_callback, [category_url])
    timer.start()
    try:
        print("trying to reach " + category_url)
        driver.get(category_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        gigs = soup.select("div.gig-card-layout > div.gig-wrapper > div.basic-gig-card > a")
        # Anchors without an href are skipped instead of producing ".../None" links
        gigs_links = [gig.get("href") for gig in gigs]
        full_gigs_links = [f"https://www.fiverr.com{link}" for link in gigs_links if link]
    except Exception:
        # Kill the browser and the process right away instead of leaving the
        # timer armed and waiting for it to fire
        timeout_callback(category_url)
        sys.exit()  # safety net in case the callback ever returns
    finally:
        # The timer must never outlive this call, whatever the outcome
        timer.cancel()

    # dict.fromkeys removes duplicates while keeping the page order, so the
    # gigs_limit slice below always picks the same (first) gigs
    unique_gigs_links = list(dict.fromkeys(full_gigs_links))
    return unique_gigs_links[:gigs_limit]

In [None]:
def crawl_gig(gig_url, timeout_seconds=18):
    """
    Crawls a given gig url and returns a dictionary of the gig's data
    :param gig_url: the url of the gig to crawl
    :param timeout_seconds: how long the page may take before timeout_callback is fired
    :return: a dictionary of the gig's data
    (title, rating score, orders in queue, rating counts, seller level, category, delivery times, prices,
    revisions, tags, language, country, member since and features),
    or None if the gig's package details could not be extracted
    """

    # Set a timer to kill the process if it takes too long
    timer = threading.Timer(timeout_seconds, timeout_callback, [gig_url])
    timer.start()
    try:
        print("trying to reach " + gig_url)
        driver.get(gig_url)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # load the gig's overview
        gig_overview = soup.select_one(
            "div.main-content > div#__ZONE__main > div.gig_page_perseus > div.gig-page-wrapper > div.gig-page > div.main > div.gig-overview"
        )
        gig_title = gig_overview.select_one("h1").text
        seller_overview = gig_overview.select_one("div.seller-overview")

        try:
            rating_score = seller_overview.select_one("b.rating-score").text
        except AttributeError:
            # if no rating score was found, set it to 0
            rating_score = 0

        try:
            seller_level = seller_overview.select_one(
                "div.Waqjn3u > div.user-profile-image"
            ).text
        except AttributeError:
            # Sometimes the seller level is in another place, so if the first selector didn't work, try another one
            try:
                seller_level = soup.select_one(
                    "span.level > div.jEW3B9z > span.HsyURQF"
                ).text
            except AttributeError:
                # Neither location holds a level: record 0 instead of letting
                # the error abort the whole process
                seller_level = 0

        try:
            rating_counts = (
                seller_overview.select_one("span.ratings-count")
                .text.replace("(", "")
                .replace(")", "")
            )
        except AttributeError:
            # if no rating counts were found, set it to 0
            # also, it means that the seller is new, so set the seller level to 0
            rating_counts = 0
            seller_level = 0

        try:
            orders_in_queue = seller_overview.select_one("div.sfNimsX").text.split(" ")[0]
        except AttributeError:
            # if no orders in queue were found, set it to 0
            orders_in_queue = 0

        try:
            # Get the gig's package details (the first cell of each row is skipped)
            packages_table = soup.select_one("div.gig-page-packages-table")

            delivery_times = []
            for cell in packages_table.select_one("tr.delivery-time").select("td")[1:]:
                # Prefer the first class-less span; fall back to the whole cell text
                first_span = cell.select_one("span:not([class])")
                delivery_times.append(first_span.text if first_span is not None else cell.text)

            features = [
                cell.text for cell in packages_table.select("tr.description > td")[1:]
            ]

            prices = [
                price.text
                for price in packages_table.select_one("tr.select-package").select(
                    "td > div.price-wrapper > p"
                )
            ]

            try:
                revision_cells = packages_table.select("tr:not([class])")[-1].select("td")[1:]
                revisions = [cell.text for cell in revision_cells]
            except (AttributeError, IndexError):
                revisions = []

            # The tag text is cut out of the rendered anchor markup
            tags = soup.select("div.gig-tags-container > ul > li > a")
            extracted_tags = [str(tag).split(">")[1][:-4] for tag in tags]

            # First <strong> is the country, second one the "member since" date
            seller_stats = soup.select(
                "div.profile-card > div.seller-card > div.stats-desc > ul.user-stats > li > strong"
            )
            member_since = str(seller_stats[1]).split(">")[1][:-8]
            country = str(seller_stats[0]).split(">")[1][:-8]
            language = soup.select_one("span.HsyURQF > strong").text

            category = soup.select("span.category-breadcrumbs")[-1].select_one("a").text
        except Exception:
            # sometimes gig's package details are not available, so return None
            # because their data won't help us in our analysis
            return None
    except Exception:
        # if an exception is raised, kill the browser and the process right
        # away instead of leaving the timer armed and waiting for it to fire
        timeout_callback(gig_url)
        sys.exit()  # safety net in case the callback ever returns
    finally:
        # The timer must never outlive this call, whatever the outcome
        timer.cancel()

    return {
        "gig_title": gig_title,
        "rating_score": rating_score,
        "orders_in_queue": orders_in_queue,
        "rating_counts": rating_counts,
        "seller_level": seller_level,
        "category": category,
        "delivery_times": delivery_times,
        "prices": prices,
        "revisions": revisions,
        "tags": extracted_tags,
        "language": language,
        "country": country,
        "member_since": member_since,
        "features": features,
    }


## Crawling the data

### Crawling the category links
Because we want to take data from all the categories, we first need to get the links of all the categories. This is done by the following code, which crawls the category links and stores them in a file called `categories.txt`. If the file already exists, the links are loaded from it instead of being crawled again.

In [None]:
# Load the category links left over from a previous run; the categories page
# is crawled only when categories.txt does not exist yet. Any other error
# (e.g. an unreadable file) is raised instead of silently overwriting the file.
try:
    with open('categories.txt', 'r') as f:
        full_categories_links = f.read().splitlines()
except FileNotFoundError:
    CATEGORIES_URL = "https://www.fiverr.com/categories"
    driver.get(CATEGORIES_URL)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    categories = soup.select("section.mp-categories-columns.cf > ul > li > a")
    categories_links = [category.get("href") for category in categories]
    # Anchors without an href are skipped instead of producing ".../None" links
    full_categories_links = [
        f"https://www.fiverr.com{link}" for link in categories_links if link
    ]

    with open('categories.txt', 'w') as f:
        f.writelines(f"{link}\n" for link in full_categories_links)

# Load the gig links collected so far; start from scratch if there are none.
try:
    with open('gigs.txt', 'r') as f:
        gigs_links = f.read().splitlines()
except FileNotFoundError:
    gigs_links = []

### Crawling the gig links from the category links
Notice that the links are constantly being appended to the `gigs.txt` file. This is because the driver can get stuck at any point of time, and we don't want to lose the links that have already been crawled. So, with the help of the `.bat` file, we can restart the process from the last link that was crawled.

In [None]:
# Visit every pending category, persist the gig links found on it, and then
# strike the category off categories.txt so that a restart resumes after it.
for category_url in full_categories_links:  # to work from the other end, iterate full_categories_links[::-1]
    already_listed = category_url in gigs_links
    if not already_listed:
        pprint("created timer")
        gigs = crawl_category(category_url)
        gigs_links.extend(gigs)

        # Append right away: nothing is lost if the process dies later on
        with open('gigs.txt', 'a') as gigs_file:
            gigs_file.writelines("%s\n" % gig for gig in gigs)

        # Rewrite categories.txt without the category that was just crawled
        with open('categories.txt', 'r') as categories_file:
            remaining = [
                entry
                for entry in categories_file.readlines()
                if entry.strip("\n") != category_url
            ]
        with open('categories.txt', 'w') as categories_file:
            categories_file.writelines(remaining)

### Crawling the gig data from the gig links
Notice that the same script is being used to crawl the gig data as well. This is because the driver can get stuck at any point of time, and we don't want to lose the data that has already been crawled. So, with the help of the `.bat` file, we can restart the process from the last link that was crawled.

In [None]:
# Column order of gigs_data.csv; also the keys read from crawl_gig's result.
GIGS_DATA_COLUMNS = [
    "gig_title",
    "rating_score",
    "orders_in_queue",
    "rating_counts",
    "seller_level",
    "category",
    "delivery_times",
    "prices",
    "revisions",
    "tags",
    "language",
    "country",
    "member_since",
    "features",
]


def _remove_line_from_file(path, line_to_remove):
    """Rewrite the file at *path* without any line equal to *line_to_remove*."""
    with open(path, 'r') as f:
        kept_lines = [line for line in f if line.strip("\n") != line_to_remove]
    with open(path, 'w') as f:
        f.writelines(kept_lines)


# Create the CSV with its header row on the first run only; later runs append.
# The header uses the same utf-8 encoding as the data rows written below.
if not os.path.exists('gigs_data.csv'):
    with open('gigs_data.csv', 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(GIGS_DATA_COLUMNS)

for gig_link in gigs_links:
    gig_data = crawl_gig(gig_link)
    if gig_data is not None:
        # since we are dealing with hebrew, we need to encode the data to utf-8
        with open('gigs_data.csv', 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([gig_data[column] for column in GIGS_DATA_COLUMNS])

    # Whether or not data was extracted, the link is done: remove it from
    # gigs.txt so that a restarted process continues with the next one.
    _remove_line_from_file('gigs.txt', gig_link)

driver.quit()
# This function can make sure no process is left running in the background
timeout_callback(1)