In [31]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from joblib import dump, load

In [32]:
# Read the dataset CSV from the sibling model_training folder
# (the path is relative to the current working directory).
dataset_path = "../model_training/dataset.csv"
df = pd.read_csv(dataset_path)

# Show the first rows as this cell's output
df.head()

Unnamed: 0,Link,Type,Text
0,https://www.expedia.com/,Not Dark Pattern,"By clicking “Accept All Cookies”, you agree to..."
1,https://www.expedia.com/,Fake Urgency,"Save 10% or more on over 100,000 hotels with M..."
2,https://www.expedia.com/,Fake Urgency,You have good taste! Book now before someone e...
3,https://www.expedia.com/,Fake Scarcity,We have 5 left at 15% off at
4,https://www.expedia.com/,Fake Scarcity,We have 5 left at


In [33]:
# Labels to keep; rows with any other 'Type' value are filtered out
selected_types = [
    "Not Dark Pattern",
    "Fake Scarcity",
    "Fake Social Proof",
    "Fake Urgency",
    "Misdirection",
]

# Boolean Series: True where the row's 'Type' is one of the kept labels
mask = df['Type'].isin(selected_types)

# Keep only the rows flagged by the mask
selected_df = df.loc[mask]

In [34]:
# Hold out 20% of the filtered rows for testing; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(
    selected_df,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
for split_label, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_label} DataFrame count: {len(split_frame)}")

Train DataFrame count: 222
Test DataFrame count: 56


In [35]:
# Create pipelines for each algorithm.
# Every pipeline vectorizes the raw text with TF-IDF and passes the features to
# one classifier, so a fitted pipeline can be called directly on strings.
algorithms = {
    "Multinomial Naive Bayes": make_pipeline(TfidfVectorizer(), MultinomialNB()),
    "Support Vector Machines": make_pipeline(TfidfVectorizer(), SVC(kernel='linear')),
    # A random forest is a randomized estimator: without a fixed random_state its
    # accuracy (and the saved model) differs on every run. Use the same seed as
    # the train/test split so the whole notebook is reproducible.
    "Random Forest": make_pipeline(TfidfVectorizer(), RandomForestClassifier(random_state=42))
}

In [36]:
# Directory (relative to the working directory) that receives the fitted models
models_folder = "trained_models"

# Make sure the directory is there; an already existing one is left untouched
os.makedirs(name=models_folder, exist_ok=True)

In [37]:
# Train and evaluate each model

def _clean_text_frame(frame):
    """Return a cleaned copy of ``frame`` ready for the text pipelines.

    Rows whose 'Text' is missing are dropped, 'Text' is cast to ``str`` and the
    index is reset. The input frame is not modified: ``assign`` builds a new
    frame, which also avoids assigning into a slice of another DataFrame.
    """
    return (
        frame
        .dropna(subset=['Text'])
        .assign(Text=lambda kept: kept['Text'].astype(str))
        .reset_index(drop=True)
    )


# Clean both splits once, before the loop: the cleaning does not depend on the
# algorithm, and the test split needs the same treatment as the training split
# (a missing 'Text' would otherwise reach the vectorizer during predict()).
# Re-running this cell is safe because the cleaning is idempotent.
train_df = _clean_text_frame(train_df)
test_df = _clean_text_frame(test_df)

for algo_name, model in algorithms.items():
    # Fit the whole pipeline (vectorizer + classifier) on the training text
    model.fit(train_df['Text'], train_df['Type'])

    # Evaluate the model on the held-out test set
    predictions = model.predict(test_df['Text'])
    accuracy = accuracy_score(test_df['Type'], predictions)

    # Save the trained model to a file in the trained_models folder;
    # the file name is the algorithm name lower-cased with underscores.
    model_path = os.path.join(models_folder, f'{algo_name.lower().replace(" ", "_")}_model.joblib')
    dump(model, model_path)

    print(f"{algo_name} Accuracy: {accuracy:.2f}")

Multinomial Naive Bayes Accuracy: 0.50
Support Vector Machines Accuracy: 0.70
Random Forest Accuracy: 0.59


In [38]:
# Sample sentence to classify with every saved model
user_input_text = "Hurry! Limited stock available."
print(f"Text to predict: {user_input_text}")

# Only the algorithm names are needed here: each model is read back from disk
for algo_name in algorithms:
    # Rebuild the file name used when the model was saved
    file_stem = algo_name.lower().replace(" ", "_")
    model_path = os.path.join(models_folder, f'{file_stem}_model.joblib')

    # Restore the fitted pipeline and classify the sample sentence
    loaded_model = load(model_path)
    predicted_type = loaded_model.predict([user_input_text])

    print(f"{algo_name}: {predicted_type}")

Text to predict: Hurry! Limited stock available.
Multinomial Naive Bayes: ['Not Dark Pattern']
Support Vector Machines: ['Fake Urgency']
Random Forest: ['Not Dark Pattern']
