# Import Dependencies

In [None]:
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc
from fake_useragent import UserAgent
import time
import random

# Anti-Blocking Techniques

### 1. Random time delay

* Problem: Most web scraping bots aim to fetch data as quickly as possible. However, this can easily expose you as a scraping bot, because no real human can surf the web that fast. Websites can easily track your access speed, and once the system finds that you are going through pages too quickly, it will suspect that you are not a human and block you by default.

* Solution: We can set random time intervals between requests.

In [None]:
def random_time_delay(min_seconds: float = 60, max_seconds: float = 120) -> float:
    """Sleep for a random duration to mimic human pacing between requests.

    Args:
        min_seconds: Lower bound of the delay, in seconds. Must be >= 0.
        max_seconds: Upper bound of the delay, in seconds. Must be >= min_seconds.

    Returns:
        The number of seconds actually slept, so callers can log it.

    Raises:
        ValueError: If the bounds are negative or inverted.
    """
    if min_seconds < 0 or max_seconds < min_seconds:
        raise ValueError(
            f"invalid delay range: min_seconds={min_seconds}, max_seconds={max_seconds}"
        )
    # A uniformly random pause avoids the fixed request rhythm that gives bots away.
    delay = random.uniform(min_seconds, max_seconds)
    time.sleep(delay)
    return delay

In [None]:
# Demo: load the same page ten times, pausing a random interval after each request.
chrome_options = uc.ChromeOptions()
for switch in (
    "--window-size=1920,1080",
    "--disable-extensions",
    "--disable-notifications",
    "--mute-audio",
    "--start-maximized",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--ignore-certificate-errors",
):
    chrome_options.add_argument(switch)
# Optional: run without a visible browser window (lighter on resources).
# chrome_options.add_argument("--headless")

driver = uc.Chrome(executable_path=ChromeDriverManager().install(),
                   options=chrome_options)

try:
    for _ in range(10):
        driver.get('https://www.google.co.in/')
        random_time_delay()
finally:
    # Always shut the browser down, even if a page load or the delay raises.
    driver.quit()

### 2. Clear cookies
* Problem: A cookie is like a small document containing helpful information about you and your preferences. For instance, suppose you are a native English speaker. You open a website and change the preferred language to English. A cookie helps the website remember that your preferred language is English, so every time you open the website it automatically switches to English. If you scrape a website constantly with the same cookie, your activity is easy to detect as that of a scraping bot.
* Solution: Clear cookies from time to time.

In [None]:
# Demo: load the same page ten times, clearing all cookies after each request.
chrome_options = uc.ChromeOptions()
for switch in (
    "--window-size=1920,1080",
    "--disable-extensions",
    "--disable-notifications",
    "--mute-audio",
    "--start-maximized",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--ignore-certificate-errors",
):
    chrome_options.add_argument(switch)
# Optional: run without a visible browser window (lighter on resources).
# chrome_options.add_argument("--headless")

driver = uc.Chrome(executable_path=ChromeDriverManager().install(),
                   options=chrome_options)

try:
    for _ in range(10):
        driver.get('https://www.google.co.in/')
        # Drop the cookies set by this visit so the next request starts clean.
        driver.delete_all_cookies()
finally:
    # Always shut the browser down, even if a page load raises.
    driver.quit()

### 3. Proxy server

* Problem: When a site detects there are a number of requests coming from a single IP address, the IP address can be easily blocked.
* Solution: To avoid sending all of your requests through the same IP address, you can use proxy servers.

In [None]:
# Demo: route the browser's traffic through a proxy server.
chrome_options = uc.ChromeOptions()
for switch in (
    "--window-size=1920,1080",
    "--disable-extensions",
    "--disable-notifications",
    "--mute-audio",
    "--start-maximized",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--ignore-certificate-errors",
):
    chrome_options.add_argument(switch)

# Placeholder: replace with the address of a real proxy before running.
# NOTE(review): Chrome's --proxy-server switch appears to accept only
# [scheme://]host[:port], with no field for credentials -- confirm the
# format your proxy provider expects.
proxy_ip_port = "IP:Port:Username:Password"

# Choose exactly one way of configuring the proxy.
# If both flags are set, method 1 takes precedence.
method1 = False
method2 = True

if method1:
    # Method 1: describe the proxy through Selenium's Proxy object and pass
    # it to the driver as desired capabilities.
    proxy = Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = proxy_ip_port
    proxy.ssl_proxy = proxy_ip_port

    capabilities = webdriver.DesiredCapabilities.CHROME
    proxy.add_to_capabilities(capabilities)

    driver = uc.Chrome(executable_path=ChromeDriverManager().install(),
                       options=chrome_options,
                       desired_capabilities=capabilities)
elif method2:
    # Method 2: hand the proxy to Chrome directly as a command-line switch.
    chrome_options.add_argument('--proxy-server=%s' % proxy_ip_port)

    driver = uc.Chrome(executable_path=ChromeDriverManager().install(),
                       options=chrome_options)
else:
    # Without a driver the rest of the cell cannot run; fail with a clear message.
    raise ValueError("Set method1 or method2 to True to choose a proxy configuration.")

try:
    driver.get('https://www.google.co.in/')
finally:
    # Always shut the browser down, even if the page load raises.
    driver.quit()

### 4. User agent

* Problem: Headers are small pieces of information that go with every HTTP request that hits the servers. One of those pieces of information precisely describes the client making the request: the infamous "User-Agent" header. When software sends a request, it often identifies itself, its application type, operating system, software vendor, or software version by submitting a characteristic identification string. This string is referred to as a “user agent string”. You can think of it as an ID card containing some basic information. All browsers, as well as some popular crawlers and bots, such as ‘google bot’, have a unique ‘user agent string’ that they identify themselves with. A lot of companies set up their servers in a way that allows them to identify the browser a client is using. If we do not send requests with a legitimate user agent, our requests may get blocked.
* Solution: Fortunately, all browsers’ user agent strings are available publicly on the internet. Thus, we can easily pretend to be a browser. For instance, suppose we want to make a GET request to a website, pretending to be a client using Chrome. First, we need to find the User-Agent string of Chrome. A quick Google search should yield us a string like: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36".

In [None]:
# Demo: present a real Chrome user-agent string to the website.
ua = UserAgent()
chrome_options = uc.ChromeOptions()
for switch in (
    "--window-size=1920,1080",
    "--disable-extensions",
    "--disable-notifications",
    "--mute-audio",
    "--start-maximized",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--ignore-certificate-errors",
    # Use the "--" prefix like every other switch so the value is parsed as a
    # Chrome switch rather than as a positional argument.
    f"--user-agent={ua['google chrome']}",
):
    chrome_options.add_argument(switch)

driver = uc.Chrome(executable_path=ChromeDriverManager().install(),
                   options=chrome_options)

try:
    driver.get('https://www.google.co.in/')
finally:
    # Always shut the browser down, even if the page load raises.
    driver.quit()

# References

* List of Chrome Option switches: https://peter.sh/experiments/chromium-command-line-switches/
* No sandbox chrome option switch: https://www.google.com/googlebooks/chrome/med_26.html