# Chrome Mixed Content Errors Fetch

### Configuration
This first cell is the only cell you should need to make changes to.
Be sure to read all commented notes in this first cell.

In [30]:
## Batch size. As a fail-safe, the script works through the URLs this many rows at a time and
## writes each batch's results to the output file before starting the next batch.
## You can make this slightly higher for CSVs over 10k URLs, but this script is untested above that many URLs.
rows_per_run = 10

## Path to the source CSV of URLs to check. The URLs must be in a column named 'url'.
## !IMPORTANT! As part of the fail-safe described above, the script overwrites this source file
## after every batch, removing rows from the top of it as it goes, so be sure to have a backup
## of this file before you run this script on it.
url_source = '/Users/you/Documents/url_list.csv'

## Path where the results CSV is written. It is created with the columns url, severe_count and
## warning_count; the main loop also records a 'loaded' flag for every URL it attempts.
url_output = '/Users/you/Documents/my_urls_mixed_content_errors.csv'

## Local path of your Chromedriver. If you need to install it: https://chromedriver.chromium.org/downloads
## On Mac, the Chromedriver path may not have a file extension. On Windows it will likely have a .exe file extension.
chrome_path = '/Users/path/chromedriver'

In [31]:
## Imports the necessary libraries
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time
from requests import get

In [32]:
## Starts the results file at the url_output path designated above, containing only its header row.
## No changes needed.
output_columns = ['url', 'severe_count', 'warning_count']
df_output = pd.DataFrame(columns=output_columns)
df_output.to_csv(url_output, index=False)

### Program
The program will loop through each URL, extracting Mixed Content errors from the Chrome Console Log (webdriver.Chrome.get_log).<br>
It renders in full Chrome, including JavaScript, at a rate of about 1-5 seconds per URL depending on host server speed and your internet connection.<br>
It has not been tested on a list larger than 10k URLs.
##### No changes are needed below.

In [33]:
## Requests every level of browser console message and builds the Chrome options object.
## No further changes are required

# Ask the driver to keep all browser log entries so they can be read back with get_log('browser').
d = DesiredCapabilities.CHROME
d.update({'loggingPrefs': {'browser': 'ALL'}})

# NOTE(review): 'w3c' is switched off here, presumably so the 'loggingPrefs' capability above
# is honoured by the installed Chromedriver -- confirm against your Selenium/Chromedriver versions.
opt = webdriver.ChromeOptions()
opt.add_experimental_option('w3c', False)

In [34]:
## Main loop: works through the source CSV in batches of rows_per_run URLs.
## For each URL it opens a fresh Chrome session, loads the page, reads the browser console log
## and counts the "Mixed Content" entries by level. After each batch the results are added to
## the output CSV and the processed rows are removed from the source CSV, so an interrupted
## run can be resumed by re-running this cell.

# A batch size below 1 would never shrink the source file, so the loop would never end.
if rows_per_run < 1:
    raise ValueError("rows_per_run must be at least 1")

# Logging capabilities and Chrome options are the same for every URL, so build them once.
d = DesiredCapabilities.CHROME
d['loggingPrefs'] = { 'browser':'ALL' }
opt = webdriver.ChromeOptions()
opt.add_experimental_option('w3c', False)

df_source = pd.read_csv(url_source)

while len(df_source) > 0:
    new_rows = df_source.iloc[:rows_per_run]
    print(str(len(new_rows)) + ' rows to process...')
    url_list = new_rows['url'].tolist()

    # One dict per URL; turned into a DataFrame once per batch instead of growing a frame row by row.
    batch_records = []

    for url in url_list:
        # Created outside the try block on purpose: if Chrome itself cannot start (e.g. a wrong
        # chrome_path) the run should stop, not mark every URL as failed.
        driver = webdriver.Chrome(chrome_path, options=opt, desired_capabilities=d)

        try:
            driver.get(url)
            console = driver.get_log('browser')

            severe_count = 0
            warning_count = 0

            for log in console:
                if "Mixed Content" not in log['message']:
                    continue
                if "SEVERE" in log['level']:
                    severe_count += 1
                if "WARNING" in log['level']:
                    warning_count += 1

            batch_records.append({'severe_count': severe_count, 'warning_count': warning_count,
                                  'loaded': True, 'url': url})

        ## A failsafe to prevent URLs that won't load from blocking script from continuing.
        ## Exception (not a bare except) so Ctrl+C / kernel interrupt still stops the run.
        except Exception:
            batch_records.append({'severe_count': '', 'warning_count': '',
                                  'loaded': False, 'url': url})
            print("Skipping 1 URL that failed to render.")

        finally:
            # Quit browser each time, whether or not the page loaded, to avoid zombies
            driver.quit()

    console_output_df = pd.DataFrame(
        batch_records, columns=['severe_count', 'warning_count', 'loaded', 'url'])

    # Read the output CSV, add the new rows, then write the output back again
    df_output = pd.read_csv(url_output)
    df_output = pd.concat([df_output, console_output_df], ignore_index=True, sort=False)
    df_output.to_csv(url_output, index=False)

    # Write the source list back without the rows just processed. The slice starts at
    # rows_per_run (not rows_per_run + 1) so that no unprocessed URL is dropped.
    updated_df = df_source.iloc[rows_per_run:]
    updated_df.to_csv(url_source, index=False)
    df_source = pd.read_csv(url_source)

print("Finished!")

10 rows to process...
Skipping 1 URL that failed to render.
Finished!
