In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd
from datetime import datetime, timedelta
import time
import re
import pickle

In [11]:
excel_file = 'Urls.xlsx'
df = pd.read_excel(excel_file)
expected_headers = ['Main_Url', 'Magzine', 'Support_Url']
data = df[expected_headers]
urls_data = list(zip(data['Main_Url'], data['Support_Url']))

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=chrome_options)

driver.get("https://www.icloud.com")  

with open("icloud_cookies.pkl", "rb") as f:
    cookies = pickle.load(f)

for cookie in cookies:
    if isinstance(cookie.get("expiry"), float):
        cookie["expiry"] = int(cookie["expiry"])
    try:
        driver.add_cookie(cookie)
    except Exception as e:
        print(f"Could not add cookie {cookie.get('name', 'unknown')}: {e}")


driver.get("https://www.icloud.com/#newspublisher/")
time.sleep(5)

results = []
tab_count = 0  # Track the number of processed URLs

def refresh_page():
    print("Refreshing page to clear memory...")
    driver.execute_script("window.location.reload();") 
    time.sleep(5)

def process_url(main_url, support_url, url_index, total_urls):
    global tab_count

    try:
        print(f"Processing URL {url_index}/{total_urls}")

        # Open new tab every 100 URLs
        if url_index % 100 == 0:
            print("Refreshing page...")
            refresh_page()

        driver.get(support_url)
        time.sleep(10)

        wait = WebDriverWait(driver, 20)

        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        if iframes:
            driver.switch_to.frame(iframes[0])

        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "table-body")))
        table_rows = driver.find_elements(By.CSS_SELECTOR, ".table-body .table-row-wrapper")
        print(f"Found {len(table_rows)} rows in the table for URL")

        if not table_rows:
            print(f"No rows found for URL: {support_url}. Taking screenshot for debugging...")
            driver.save_screenshot(f"screenshot_no_rows_{time.time()}.png")
            return

        for index, row in enumerate(table_rows):
            try:
                print(f"Processing Row {index + 1}")
                time.sleep(2)

                try:
                    cell_metadata = row.find_element(By.CLASS_NAME, "cell-metadata").text
                except Exception as e:
                    print(f"Error extracting 'cell-metadata' for Row {index + 1}: {e}")
                    cell_metadata = None

                publish_date = None
                if cell_metadata:
                    match = re.search(r'Published:\s*(\w+\s\d{1,2},\s\d{4})', cell_metadata)
                    if match:
                        publish_date_str = match.group(1)
                        try:
                            publish_date = datetime.strptime(publish_date_str, "%B %d, %Y").strftime("%Y-%m-%d")
                        except ValueError as ve:
                            print(f"Error formatting publish date: {ve}")
                            publish_date = None

                analytics_button = WebDriverWait(row, 10).until(EC.element_to_be_clickable((
                    By.CLASS_NAME, "np-icon-btn.icon-link-btn.cw-button"
                )))
                driver.execute_script("arguments[0].scrollIntoView(true);", analytics_button)
                analytics_button.click()

                try:
                    cw_text_fields = wait.until(EC.presence_of_all_elements_located((
                        By.CLASS_NAME, "cw-text-field"
                    )))
                    text_values = [field.get_attribute("value") for field in cw_text_fields]
                    second_component = text_values[1] if len(text_values) > 1 else None
                    second_component_key = second_component.split('/')[-1] if second_component else None
                    print(f"Second Component Key: {second_component_key}")

                    results.append({
                        "Main URL": main_url,
                        "Support URL": support_url,
                        "Row Index": index + 1,
                        "Cell Metadata": cell_metadata,
                        "Second Component": second_component,
                        "Second Component Key": second_component_key,
                        "Publish Date": publish_date
                    })

                except Exception as e:
                    print(f"Error retrieving texts for Row {index + 1} in URL {support_url}: {e}")

                close_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((
                    By.CLASS_NAME, "np-clean-btn.right.cw-button.primary-button"
                )))
                driver.execute_script("arguments[0].scrollIntoView(true);", close_button)
                close_button.click()

            except Exception as e:
                print(f"Error processing Row {index + 1} in URL {support_url}")

    except Exception as e:
        print(f"Error while handling URL: {support_url}\n{e}")

total_urls = len(urls_data)
for url_index, (main_url, support_url) in enumerate(urls_data, start=1): 
    process_url(main_url, support_url, url_index, total_urls)

Processing URL 1/29
Found 2 rows in the table for URL
Processing Row 1
Second Component Key: AL-U1gHqoTNGVPghCIO5F2Q
Processing Row 2
Second Component Key: AOq1OaFvoRmS6Ve1_pmoSTQ
Processing URL 2/29
Found 1 rows in the table for URL
Processing Row 1
Second Component Key: AMSvit2rgSF2b4YTEt4WKAg
Processing URL 3/29
Found 1 rows in the table for URL
Processing Row 1
Second Component Key: AICUoV3tUTMCWY5amw5lwdw
Processing URL 4/29
Found 1 rows in the table for URL
Processing Row 1
Second Component Key: AJea_IZjORHSSUH8P6PvrXw
Processing URL 5/29
Found 1 rows in the table for URL
Processing Row 1
Second Component Key: AuKQA93KnSdepwl4HseiYLw
Processing URL 6/29
Found 1 rows in the table for URL
Processing Row 1
Second Component Key: AQpfsKuD-SGuGu5-qtt15jQ
Processing URL 7/29
Found 1 rows in the table for URL
Processing Row 1
Second Component Key: AhcfJdh8ySBC3gODi_ssMvg
Processing URL 8/29
Found 1 rows in the table for URL
Processing Row 1
Second Component Key: A5trjYJ7ORi2hRqUV4U-oyg
P

In [12]:
if results:
    output_df = pd.DataFrame(results)
    #output_df['Publish Date'] = pd.to_datetime(output_df['Publish Date'])
    #output_df = output_df[output_df['Publish Date'] >= '2024-10-01']
    #output_df['Publish Date'] = output_df['Publish Date'].dt.date
    output_df.to_excel("Paywall_0506_49_4.xlsx", index=False)
    print("Process completed. Results saved to 'Extracted_Results.xlsx'.")
else:
    print("No data extracted. Please check the logs for issues.")

Process completed. Results saved to 'Extracted_Results.xlsx'.


In [13]:
# Set up Chrome WebDriver
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=chrome_options)

driver.get("https://www.icloud.com")  

with open("icloud_cookies.pkl", "rb") as f:
    cookies = pickle.load(f)

for cookie in cookies:
    if isinstance(cookie.get("expiry"), float):
        cookie["expiry"] = int(cookie["expiry"])
    try:
        driver.add_cookie(cookie)
    except Exception as e:
        print(f"Could not add cookie {cookie.get('name', 'unknown')}: {e}")


driver.get("https://www.icloud.com/#newspublisher/")
time.sleep(5)

# Load input file
output_file = "Paywall_0506_49.xlsx"
df = pd.read_excel(output_file)

# Define the different analytics types
analytics_types = ["uniqueExposures", "totalViews", "affiliateCommerce", "newsPlusEngagement"]

# Add new columns for extracted values
for analytics_type in analytics_types:
    df[analytics_type] = None

def get_reach_value(driver):
    """Extract reach value after ensuring iframe is loaded."""
    try:
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        if iframes:
            driver.switch_to.frame(iframes[0])  

        reach_element = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "text-transform-none"))
        )
        return reach_element.text.strip()
    except Exception:
        return None  
    finally:
        driver.switch_to.default_content()  

# Iterate through rows
for index, row in df.iterrows():
    try:
        main_url = row["Main URL"]
        support_url = row["Support URL"]
        key = row["Second Component Key"]
        publish_date = row["Publish Date"]

        if not main_url or not support_url or not key or not publish_date:
            print(f"Skipping row {index + 1} due to missing data.")
            continue

        base_url = support_url.split("articles/")[0]
        publish_datetime = datetime.strptime(publish_date, "%Y-%m-%d")
        current_date = (publish_datetime + timedelta(days=7)).strftime("%Y-%m-%d")

        for analytics_type in analytics_types:
            new_link = f"{base_url}analytics/{key}/{analytics_type}&d={publish_date}%2C{current_date}&tz=-08%3A00"
            print(f"Generated Link for Row {index + 1} ({analytics_type})")

            retry_count = 0
            max_retries = 6
            reach_value = None
            last_value = None  # Store last retrieved value

            while retry_count < max_retries:
                driver.get(new_link)
                time.sleep(6)  

                reach_value = get_reach_value(driver)

                # Check if reach_value is valid and different from last_value
                if reach_value and reach_value not in ["0", "-"] and reach_value != last_value:
                    print(f"Extracted Reach Value ({analytics_type}): {reach_value}")
                    df.at[index, analytics_type] = reach_value  # Store in corresponding column
                    break  # Success, exit loop
                else:
                    print(f"Retrying Row {index + 1} ({analytics_type}): No valid or new reach value found ({reach_value}).")
                    last_value = reach_value  # Store for next comparison
                    retry_count += 1
                    time.sleep(3)  # Wait before retrying

            if not reach_value or reach_value in ["0", "-"]:
                print(f"Error processing Row {index + 1} ({analytics_type}): Reach value did not update.")

    except Exception as e:
        print(f"Error processing Row {index + 1}: {e}")


Generated Link for Row 1 (uniqueExposures)
Retrying Row 1 (uniqueExposures): No valid or new reach value found ().
Extracted Reach Value (uniqueExposures): 124,444
Generated Link for Row 1 (totalViews)
Extracted Reach Value (totalViews): 5,930
Generated Link for Row 1 (affiliateCommerce)
Extracted Reach Value (affiliateCommerce): 1,615
Generated Link for Row 1 (newsPlusEngagement)
Extracted Reach Value (newsPlusEngagement): 1,412
Generated Link for Row 2 (uniqueExposures)
Retrying Row 2 (uniqueExposures): No valid or new reach value found (-).
Extracted Reach Value (uniqueExposures): 218,319
Generated Link for Row 2 (totalViews)
Extracted Reach Value (totalViews): 11,199
Generated Link for Row 2 (affiliateCommerce)
Extracted Reach Value (affiliateCommerce): 2,430
Generated Link for Row 2 (newsPlusEngagement)
Extracted Reach Value (newsPlusEngagement): 5,427
Generated Link for Row 3 (uniqueExposures)
Extracted Reach Value (uniqueExposures): 217,462
Generated Link for Row 3 (totalViews)


In [15]:
# Save updated DataFrame
df.to_excel("Paywall_0506_1_results.xlsx", index=False)
print("Extraction complete! Results saved to PPL_results.xlsx")

Extraction complete! Results saved to PPL_results.xlsx
