In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the labelled dark-pattern dataset.
# Assumes 'dark-patterns.csv' is in the notebook's working directory.
# Rows missing any of 'Pattern String', 'Pattern Type' or 'Pattern Category'
# are dropped. The index is then rebuilt as 0..n-1 so that index labels and
# row positions agree; without this, mixing label-based and positional
# lookups on the frame selects the wrong rows once any row has been dropped.
# Written as one chain so re-running the cell always starts from the file.
df = (
    pd.read_csv('dark-patterns.csv')
    .dropna(subset=['Pattern String', 'Pattern Type', 'Pattern Category'])
    .reset_index(drop=True)
)

In [None]:
# Pull out the text feature column and the two label columns.
# Only the pattern type (y) is used as the training target below;
# y_category holds the category labels and is not used for training.
X, y, y_category = (
    df[column] for column in ('Pattern String', 'Pattern Type', 'Pattern Category')
)

# Learn the TF-IDF vocabulary from the pattern strings and vectorise them.
vectorizer = TfidfVectorizer(lowercase=True)
X_tfidf = vectorizer.fit_transform(X)

# Fit a Multinomial Naive Bayes classifier on the TF-IDF features
# (fit returns the estimator itself, so it can be chained).
model = MultinomialNB().fit(X_tfidf, y)

In [None]:
def get_html_content_with_retry(url, max_retries=1, timeout=10):
    """Download the body of ``url`` as text, making up to ``max_retries`` attempts.

    Args:
        url: Address of the page to fetch.
        max_retries: Total number of attempts. With a value of 0 or less no
            request is made and None is returned.
        timeout: Seconds passed to ``requests`` as the request timeout, so an
            unresponsive server fails the attempt instead of blocking forever.

    Returns:
        The response text of the first successful attempt, or None if every
        attempt raised a ``requests`` exception (including HTTP error statuses
        and timeouts). Each failure is printed.
    """
    # Browser-style User-Agent header sent with every attempt.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for attempt in range(max_retries):
        try:
            with requests.Session() as session:
                response = session.get(url, headers=headers, timeout=timeout)
                # Turn 4xx/5xx responses into exceptions so they count as failed attempts.
                response.raise_for_status()
                return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching HTML ({attempt + 1}/{max_retries}): {e}")
    return None

In [None]:
def extract_text_from_html(html):
    """Return the text content of an HTML document as a single string.

    The markup is parsed with BeautifulSoup's 'html.parser'. ``<script>`` and
    ``<style>`` elements are removed first, because their contents are code
    rather than page text and would otherwise be included by ``get_text``.
    Text nodes are stripped and joined with single spaces so that words from
    adjacent elements are not fused together.

    Args:
        html: HTML markup to parse.

    Returns:
        The extracted text.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()
    return soup.get_text(separator=' ', strip=True)

In [None]:
# Interactive demo: classify the text of user-supplied pages with the trained
# model until the user types 'exit'.
while True:
    # Strip surrounding whitespace so a padded 'exit' or URL is still recognised.
    url = input("Enter the URL of an e-commerce site (or 'exit' to stop): ").strip()

    if url.lower() == 'exit':
        break

    # A falsy result means the fetch failed (the helper prints the error) or
    # the page body was empty; either way there is nothing to classify.
    html_content = get_html_content_with_retry(url)
    if not html_content:
        continue

    # Extract the page text and vectorise it with the fitted TF-IDF vocabulary.
    text_from_html = extract_text_from_html(html_content)
    text_tfidf = vectorizer.transform([text_from_html])

    # Predict the pattern type for the whole page.
    predicted_type = model.predict(text_tfidf)[0]

    # Look up the category with a boolean mask instead of feeding index labels
    # to .iloc: after dropna the labels no longer match row positions, so the
    # positional lookup could return the wrong category or raise IndexError.
    matching_categories = df.loc[df['Pattern Type'] == predicted_type, 'Pattern Category']
    if matching_categories.empty:
        print("Dark pattern not found in the HTML content\n")
    else:
        predicted_category = matching_categories.iloc[0]
        print(f"Predicted Pattern Type: {predicted_type}, Predicted Pattern Category: {predicted_category}\n")


Predicted Pattern Type: Countdown Timer, Predicted Pattern Category: Urgency

Predicted Pattern Type: Activity Notification, Predicted Pattern Category: Social Proof

Predicted Pattern Type: Activity Notification, Predicted Pattern Category: Social Proof

