In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from joblib import dump
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

selected_classification = "label"  # Assuming "label" is your column with 0 or 1

# Load your CSV file with the correct encoding
df = pd.read_csv('cookies.csv', encoding='Windows-1252')

# Print the column names to identify the correct one
print(df.columns)

# Assuming "Pattern String" is your column with the text from the cookies
df = df[pd.notnull(df["pattern string"])]
col = ["pattern string", selected_classification]
df = df[col]

# Data preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]
    return ' '.join(words)

df['pattern string'] = df['pattern string'].apply(preprocess_text)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['pattern string'], df[selected_classification], train_size=.8, random_state=42
)

# Vectorize and transform the text data
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Build and train the model
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = clf.predict(count_vect.transform(X_test))

# Evaluate the model
accuracy = metrics.accuracy_score(y_pred, y_test)
print(f'Accuracy: {accuracy:.2f}')

# Save the model and vectorizer
dump(clf, 'dark_pattern_classifier.joblib')
dump(count_vect, 'dark_pattern_vectorizer.joblib')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Index(['pattern string', 'label', 'Pattern type', 'url'], dtype='object')
Accuracy: 0.67


['dark_pattern_vectorizer.joblib']

In [8]:
from joblib import load

# Load the trained model and vectorizer
loaded_clf = load('dark_pattern_classifier.joblib')
loaded_vectorizer = load('dark_pattern_vectorizer.joblib')

pattern_string ="This Site uses cookies and similar technologies, including third-party cookies, to function properly, perform statistical analysis, offer you a better experience and send our online advertising messages in line with your preferences. Consult the Cookie Policy  to find out more, to know which cookies are used and how to disable them and/or to withhold your consent."

# Preprocess the test pattern string
test_pattern = preprocess_text(pattern_string)

# Vectorize and transform the test data
test_pattern_counts = loaded_vectorizer.transform([test_pattern])
test_pattern_tfidf = tfidf_transformer.transform(test_pattern_counts)

# Make a prediction
prediction = loaded_clf.predict(test_pattern_tfidf)

# Print the result
print(f'Predicted Label: {prediction[0]}')


Predicted Label: 0.0


In [15]:
import requests
from bs4 import BeautifulSoup
from joblib import load

# Load the trained model and vectorizer
loaded_clf = load('dark_pattern_classifier.joblib')
loaded_vectorizer = load('dark_pattern_vectorizer.joblib')

# Preprocess function for cookie information
def preprocess_cookies(cookies):
    # Combine cookie names and values into a single string
    cookie_text = ' '.join([f'{cookie.name}={cookie.value}' for cookie in cookies])
    # Apply the same preprocessing function used for training
    preprocessed_cookie_text = preprocess_text(cookie_text)
    return preprocessed_cookie_text

def extract_cookies_from_website(url):
    # Make a GET request to the URL
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'Error: Unable to fetch website content. {e}')
        return None

    # Parse HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract and preprocess the cookies
    cookies = response.cookies
    preprocessed_cookies = preprocess_cookies(cookies)

    # Vectorize and transform the cookie data
    cookie_counts = loaded_vectorizer.transform([preprocessed_cookies])
    cookie_tfidf = tfidf_transformer.transform(cookie_counts)

    # Make a prediction
    prediction = loaded_clf.predict(cookie_tfidf)

    # Check if the predicted label is 1 and inject JavaScript into the HTML
    if prediction[0] == 1:
        script = """
        <script>
            alert("Dark pattern detected on this website!");
        </script>
        """
        soup.head.append(BeautifulSoup(script, 'html.parser'))

    # Return the modified HTML content
    return str(soup)

# Example usage:
url = "https://www.oneplus.in/"
modified_html = extract_cookies_from_website(url)


