In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re

def extract_all_text_as_array(url, timeout=10):
    """Download a page and return its text fragments as a list.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        A list of the strings yielded by ``soup.stripped_strings`` for the
        parsed page, or ``None`` when the request fails (any
        ``requests.exceptions.RequestException``, including the HTTP error
        raised by ``raise_for_status``). The failure is printed, not raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Requests decodes ``response.text`` using the charset from the
        # Content-Type header and falls back to ISO-8859-1 for text/*
        # responses that declare none, which garbles UTF-8 pages. When the
        # header has no charset, use the encoding detected from the body.
        content_type = response.headers.get('Content-Type', '')
        if 'charset' not in content_type.lower():
            response.encoding = response.apparent_encoding

        soup = BeautifulSoup(response.text, 'html.parser')
        return list(soup.stripped_strings)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve page. Error: {e}")
        return None

def check_dark_pattern(url, dark_pattern_strings):
    """Check whether a page contains any of the given dark-pattern strings.

    Matching is a case-insensitive substring test of each pattern against
    each text fragment returned by ``extract_all_text_as_array``.

    Args:
        url: Page to fetch and scan.
        dark_pattern_strings: Iterable of pattern strings to look for.

    Returns:
        A ``(found, text)`` tuple:
        * ``(True, <all fragments joined by spaces>)`` when a pattern matches;
        * ``(False, <first ten fragments joined by spaces>)`` when none match;
        * ``(False, '')`` when the page could not be fetched or had no text.
    """
    extracted_text = extract_all_text_as_array(url)
    if not extracted_text:
        return False, ''

    # Lower-case each fragment once instead of once per (pattern, fragment)
    # pair; the comparison result is unchanged.
    lowered_fragments = [text.lower() for text in extracted_text]

    for pattern in dark_pattern_strings:
        needle = pattern.lower()
        if any(needle in fragment for fragment in lowered_fragments):
            return True, ' '.join(extracted_text)

    return False, ' '.join(extracted_text[:10])

def main():
    """Label every link in ``ecommerce_links.csv`` and save the results.

    Steps:
      1. Read the non-empty ``Pattern String`` values from
         ``dark-patterns.csv`` into a set.
      2. For every row of ``ecommerce_links.csv`` with a non-empty ``Link``,
         call ``check_dark_pattern`` and print the outcome.
      3. Write ``Link``, ``Deceptive?`` ('1' or '0') and ``Pattern String``
         to ``ecommerce_links_updated.csv``.

    NOTE(review): the output is written to the current directory, while the
    next notebook cell reads ``./sample_data/ecommerce_links_updated.csv``.
    Confirm the file is moved there, or align the paths.
    """
    # newline='' lets the csv module handle line endings itself, so newlines
    # embedded in quoted fields are read back unchanged.
    with open('dark-patterns.csv', 'r', newline='') as dark_pattern_file:
        dark_pattern_strings = {
            row['Pattern String']
            for row in csv.DictReader(dark_pattern_file)
            if row['Pattern String']  # skip empty rows
        }

    fieldnames = ['Link', 'Deceptive?', 'Pattern String']
    ecommerce_links = []
    with open('ecommerce_links.csv', 'r', newline='') as ecommerce_links_file:
        for row in csv.DictReader(ecommerce_links_file):
            link = row['Link']
            if not link:  # skip empty rows
                continue

            has_dark_pattern, pattern_string = check_dark_pattern(link, dark_pattern_strings)

            # Build a fresh dict holding only the output columns. Reusing the
            # input row would make DictWriter raise ValueError if the input
            # CSV has any other column, after all pages were already fetched.
            result = {
                'Link': link,
                'Deceptive?': '1' if has_dark_pattern else '0',
                'Pattern String': pattern_string,
            }

            # Print the results
            print(f"Link: {result['Link']}, Deceptive?: {result['Deceptive?']}, Pattern String: {result['Pattern String']}")

            ecommerce_links.append(result)

    # Write the labelled results to a new CSV file
    with open('ecommerce_links_updated.csv', 'w', newline='') as updated_file:
        writer = csv.DictWriter(updated_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(ecommerce_links)

# Run the pipeline only when executed as a script or notebook cell, not on import.
if __name__ == "__main__":
    main()


Link: http://amazon.in/, Deceptive?: 1, Pattern String: Online Shopping site in India: Shop Online for Mobiles, Books, Watches, Shoes and More - Amazon.in Skip to main content .in Delivering to Mumbai 400001 Update location All Select the department you want to search in All Categories Alexa Skills Amazon Devices Amazon Fashion Amazon Fresh Amazon Pharmacy Appliances Apps & Games Audible Audiobooks Baby Beauty Books Car & Motorbike Clothing & Accessories Collectibles Computers & Accessories Electronics Furniture Garden & Outdoors Gift Cards Grocery & Gourmet Foods Health & Personal Care Home & Kitchen Industrial & Scientific Jewellery Kindle Store Luggage & Bags Luxury Beauty Movies & TV Shows Music Musical Instruments Office Products Pet Supplies Prime Video Shoes & Handbags Software Sports, Fitness & Outdoors Subscribe & Save Tools & Home Improvement Toys & Games Under ₹500 Video Games Watches Search Amazon.in EN Hello, sign in Account & Lists Returns & Orders Cart All Fresh Amazon m

In [3]:
# prompt: drop all the rows of the ecommerce_links_updated.csv where column "deceptive?"is 1

import csv

# Copy ecommerce_links_updated.csv to a new file, keeping only the rows whose
# "Deceptive?" column is not "1".
#
# NOTE(review): the input is read from ./sample_data/, but main() above writes
# ecommerce_links_updated.csv to the current directory. Confirm the file is
# moved there before this cell runs, or align the paths.
#
# newline='' on the input lets the csv module handle line endings itself, so
# newlines embedded in quoted fields are read back unchanged.
with open('./sample_data/ecommerce_links_updated.csv', 'r', newline='') as input_file:
    csv_reader = csv.DictReader(input_file)

    # Reuse the input header so the output has the same columns.
    with open('ecommerce_links_updated_without_deceptive.csv', 'w', newline='') as output_file:
        csv_writer = csv.DictWriter(output_file, fieldnames=csv_reader.fieldnames)
        csv_writer.writeheader()

        for row in csv_reader:
            # Skip rows flagged as deceptive
            if row['Deceptive?'] == '1':
                continue
            csv_writer.writerow(row)


In [6]:
# prompt: drop all the rows of the ecommerce_links_updated.csv where column  pattern string is empty

import csv

# Copy ecommerce_links_updated_without_deceptive.csv to a new file, keeping
# only the rows whose "Pattern String" column is non-empty.
#
# NOTE(review): the input is read from ./sample_data/, but the cell that
# produces ecommerce_links_updated_without_deceptive.csv writes it to the
# current directory. Confirm the file is moved there, or align the paths.
#
# newline='' on the input lets the csv module handle line endings itself, so
# newlines embedded in quoted fields are read back unchanged.
with open('./sample_data/ecommerce_links_updated_without_deceptive.csv', 'r', newline='') as input_file:
    csv_reader = csv.DictReader(input_file)

    # Reuse the input header so the output has the same columns.
    with open('ecommerce_links_updated_without_empty_pattern_string.csv', 'w', newline='') as output_file:
        csv_writer = csv.DictWriter(output_file, fieldnames=csv_reader.fieldnames)
        csv_writer.writeheader()

        for row in csv_reader:
            # Skip rows with no pattern text
            if not row['Pattern String']:
                continue
            csv_writer.writerow(row)


In [19]:
import csv

# Build one labelled dataset: the first MAX_DARK_PATTERN_ROWS rows of the
# dark-patterns file (all marked deceptive), followed by every row of the
# filtered e-commerce file as-is.
#
# NOTE(review): ecommerce_links_updated_without_empty_pattern_string.csv is
# read from ./sample_data/, but the cell that produces it writes to the
# current directory. Confirm the file is moved there, or align the paths.
MAX_DARK_PATTERN_ROWS = 100

# newline='' lets the csv module handle line endings itself, so newlines
# embedded in quoted fields are read back unchanged.
with open('./sample_data/dark-patterns.csv', 'r', newline='') as dark_pattern_file, \
        open('./sample_data/ecommerce_links_updated_without_empty_pattern_string.csv', 'r', newline='') as ecommerce_file:
    dark_pattern_reader = csv.DictReader(dark_pattern_file)
    ecommerce_reader = csv.DictReader(ecommerce_file)

    combined_rows = []

    # Take the leading dark-pattern rows and map them onto the output columns.
    for i, dark_pattern_row in enumerate(dark_pattern_reader):
        if i >= MAX_DARK_PATTERN_ROWS:
            break
        combined_rows.append({
            'Link': dark_pattern_row['Website Page'],
            'Pattern String': dark_pattern_row['Pattern String'],
            'Deceptive?': '1',  # Mark as deceptive
        })

    # E-commerce rows are appended unchanged; DictWriter below raises
    # ValueError if they carry columns outside ``fieldnames``.
    combined_rows.extend(ecommerce_reader)

# NOTE(review): 'combined_datsa.csv' looks like a typo for 'combined_data.csv';
# the name is kept because other code may read this exact file.
with open('combined_datsa.csv', 'w', newline='') as combined_file:
    fieldnames = ['Link', 'Pattern String', 'Deceptive?']
    combined_writer = csv.DictWriter(combined_file, fieldnames=fieldnames)
    combined_writer.writeheader()
    combined_writer.writerows(combined_rows)
