# Import Dependencies

In [None]:
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc
from fake_useragent import UserAgent
import time
import random

# Anti-Blocking Techniques

### 1. Random time delay

* **Problem:** Most web scraping bots aim to fetch data as quickly as possible. However, this can easily expose you as a scraping bot, because no real human can surf the web that fast. Websites can easily track your access speed, and once the system finds that you are going through pages too quickly, it will suspect that you are not human and block you by default.

* **Solution:** We can set random time intervals between requests.

In [None]:
def random_time_delay(min_seconds=60, max_seconds=120):
    """Pause execution for a random duration between two bounds.

    Called with no arguments it behaves as before: a pause drawn uniformly
    from 60 to 120 seconds.

    Args:
        min_seconds: Lower bound of the pause, in seconds. Defaults to 60.
        max_seconds: Upper bound of the pause, in seconds. Defaults to 120.

    Raises:
        ValueError: If ``min_seconds`` is negative or exceeds ``max_seconds``.
    """
    # Validate up front so a bad range fails immediately instead of after
    # (or in the middle of) a long sleep.
    if min_seconds < 0 or max_seconds < min_seconds:
        raise ValueError(
            f"expected 0 <= min_seconds <= max_seconds, "
            f"got min_seconds={min_seconds}, max_seconds={max_seconds}"
        )
    # A uniformly random pause avoids a fixed, machine-like request rhythm.
    time.sleep(random.uniform(min_seconds, max_seconds))

In [None]:
# Demo: load the same page ten times, waiting a random interval between requests.
chrome_options = uc.ChromeOptions()
for switch in (
    "--window-size=1920,1080",
    "--disable-extensions",
    "--disable-notifications",
    "--mute-audio",
    "--start-maximized",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--ignore-certificate-errors",
    # "--headless",  # makes code more lightweight, less resource intensive. doesn't show browser
):
    chrome_options.add_argument(switch)

driver = uc.Chrome(executable_path=ChromeDriverManager().install(),
                    options=chrome_options)

try:
    for _ in range(10):
        driver.get('https://www.google.co.in/')
        random_time_delay()
finally:
    # Close the browser even if a page load or the delay raises, so the
    # Chrome session is not left running.
    driver.quit()

### 2. Clear cookies
* **Problem:** A cookie is like a small document containing helpful information about you and your preferences. For instance, suppose you are a native English speaker. You open a website and change the preferred language to English. A cookie helps the website remember that your preferred language is English, so every time you open the website, it automatically switches to English. If you scrape a website constantly with the same cookie, it is easy to be detected as scraping bot activity.
* **Solution:** Clear cookies from time to time.

In [None]:
# Demo: load the same page ten times, clearing all cookies after each visit.
chrome_options = uc.ChromeOptions()
for switch in (
    "--window-size=1920,1080",
    "--disable-extensions",
    "--disable-notifications",
    "--mute-audio",
    "--start-maximized",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--ignore-certificate-errors",
    # "--headless",  # makes code more lightweight, less resource intensive. doesn't show browser
):
    chrome_options.add_argument(switch)

driver = uc.Chrome(executable_path=ChromeDriverManager().install(),
                    options=chrome_options)

try:
    for _ in range(10):
        driver.get('https://www.google.co.in/')
        # Drop the cookies set by this visit so the next request starts clean.
        driver.delete_all_cookies()
finally:
    # Close the browser even if a page load raises, so the Chrome session
    # is not left running.
    driver.quit()

### 3. Proxy server

* **Problem:** When a site detects there are a number of requests coming from a single IP address, the IP address can be easily blocked.
* **Solution:** To avoid sending all of your requests through the same IP address, you can use proxy servers.

**What are proxy servers?**

Here's a breakdown of the interaction between the user, the proxy server, and the website:

1. The user sends an HTTP request to the proxy server.
2. The proxy server forwards the request to the website.
3. The website responds with an HTTP response.
4. The proxy server receives the response and forwards it back to the user.

In this way, the proxy server acts as an intermediary between the user and the website, handling requests and responses on behalf of the user.

**Benefits of using proxy servers:**
* **Privacy and Security:** Proxy servers can help to protect user privacy and security by hiding their IP address and encrypting their internet traffic. This can prevent unauthorized access to user data and help to keep personal information secure.

* **Content Filtering:** Proxy servers can be used to filter content and block access to websites or resources that are deemed inappropriate or dangerous. This is particularly useful in corporate or educational environments where access to certain types of content may be restricted.

* **Network Performance:** Proxy servers can help to improve network performance by caching frequently accessed content and reducing bandwidth usage. This can speed up web browsing and reduce data costs for users.

**Authentication to access proxy server**
- When you connect to a proxy server, the server may require you to provide a username and password to verify your identity and ensure that you are authorized to use the proxy. This is especially important for proxies that are used in corporate or organizational environments, where access to the internet may be restricted or monitored for security reasons.

- Without the correct username and password, you may not be able to connect to the proxy server or access the internet through the proxy. In some cases, the proxy server may also limit the features or capabilities that are available to users who are not authenticated, or may impose restrictions on the websites or applications that can be accessed through the proxy.

- Therefore, if your proxy server requires authentication, you will need to provide the correct username and password in order to use the proxy and access the internet.

In [None]:
PROXY_HOST = 'your_proxy_host'
PROXY_PORT = 'your_proxy_port'
USERNAME = 'your_username' # if authentication is needed
PASSWORD = 'your_password' # if authentication is needed


def _build_chrome_options():
    """Return a fresh ChromeOptions carrying the switches common to both methods.

    Each driver gets its own options object: a reused object would still
    carry the previous method's --proxy-server switch, and
    undetected_chromedriver rejects an options object that was already used
    to start a browser.
    """
    options = uc.ChromeOptions()
    for switch in (
        "--window-size=1920,1080",
        "--disable-extensions",
        "--disable-notifications",
        "--mute-audio",
        "--start-maximized",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--ignore-certificate-errors",
    ):
        options.add_argument(switch)
    return options


######### Method 1 #############
# Pass the proxy address straight to Chrome through the --proxy-server switch.

chrome_options = _build_chrome_options()

if USERNAME and PASSWORD: # if authentication is needed
    # NOTE(review): Chrome may not honor credentials embedded in
    # --proxy-server; verify against your proxy, and fall back to another
    # authentication mechanism (e.g. IP allow-listing) if it is rejected.
    chrome_options.add_argument('--proxy-server=http://{}:{}@{}:{}'.format(USERNAME, PASSWORD, PROXY_HOST, PROXY_PORT))
else:
    chrome_options.add_argument('--proxy-server=http://{}:{}'.format(PROXY_HOST, PROXY_PORT))

driver = uc.Chrome(executable_path=ChromeDriverManager().install(),
                    options=chrome_options)

try:
    driver.get('https://www.google.co.in/')
finally:
    # Close the browser even if the page load fails (e.g. proxy unreachable).
    driver.quit()

######### Method 2 #############
# Note that this method is a bit more verbose compared to the previous method,
# but it provides more flexibility and control over the proxy settings.

proxy = Proxy({
    'proxyType': ProxyType.MANUAL,
    'httpProxy': '{}:{}'.format(PROXY_HOST, PROXY_PORT),
    'ftpProxy': '{}:{}'.format(PROXY_HOST, PROXY_PORT),
    'sslProxy': '{}:{}'.format(PROXY_HOST, PROXY_PORT),
    'noProxy': '' # set this to avoid any urls from being proxied
})

if USERNAME and PASSWORD: # if authentication is needed
    # NOTE(review): nothing below reads these two attributes, so the
    # credentials are not forwarded to Chrome by this cell — confirm how
    # your proxy expects to be authenticated.
    proxy.proxy_username = USERNAME
    proxy.proxy_password = PASSWORD

# Start from clean options so Method 1's --proxy-server switch is not carried over.
chrome_options = _build_chrome_options()
# The configured "host:port" is exposed as `http_proxy`; the Proxy object has
# no `proxy` attribute, so reading `proxy.proxy` raised AttributeError.
chrome_options.add_argument('--proxy-server={}'.format(proxy.http_proxy))

driver = uc.Chrome(executable_path=ChromeDriverManager().install(),
                    options=chrome_options)

try:
    driver.get('https://www.google.co.in/')
finally:
    # Close the browser even if the page load fails (e.g. proxy unreachable).
    driver.quit()

### 4. User agent

* **Problem:** Headers are small pieces of information that accompany every HTTP request sent to a server. One of them, the infamous "User-Agent" header, describes the client making the request. When software sends a request, it often identifies itself (its application type, operating system, software vendor, or software version) by submitting a characteristic identification string. This string is referred to as a "user agent string". You can think of it as an ID card containing some basic information. All browsers, as well as some popular crawlers and bots such as Googlebot, have a unique user agent string that they identify themselves with. Many companies configure their servers so that they can identify the browser a client is using. If we do not send requests with a legitimate user agent, our requests may get blocked.
* **Solution:** Fortunately, all browsers’ user agent strings are available publicly on the internet. Thus, we can easily pretend to be a browser. For instance, suppose we want to make a GET request to a website, pretending to be a client using Chrome. First, we need to find the User-Agent string of Chrome. A quick Google search should yield us a string like: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36".

In [None]:
# Demo: start Chrome with a Chrome user-agent string supplied by fake_useragent.
ua = UserAgent()
chrome_options = uc.ChromeOptions()
for switch in (
    "--window-size=1920,1080",
    "--disable-extensions",
    "--disable-notifications",
    "--mute-audio",
    "--start-maximized",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--ignore-certificate-errors",
):
    chrome_options.add_argument(switch)
# The switch needs the same leading `--` as every option above; without it
# the argument is a bare token on Chrome's command line, not a switch, so the
# user agent would not be overridden.
chrome_options.add_argument(f"--user-agent={ua['google chrome']}")

driver = uc.Chrome(executable_path=ChromeDriverManager().install(),
                    options=chrome_options)

try:
    driver.get('https://www.google.co.in/')
finally:
    # Close the browser even if the page load raises, so the Chrome session
    # is not left running.
    driver.quit()

# References

* List of Chrome Option switches: https://peter.sh/experiments/chromium-command-line-switches/
* No sandbox chrome option switch: https://www.google.com/googlebooks/chrome/med_26.html