In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from joblib import dump





In [7]:
selected_classification = "Pattern Category"

df = pd.read_csv('dark_patterns.csv')



In [11]:
df = df[pd.notnull(df["Pattern String"])]
col = ["Pattern String", selected_classification]
df = df[col]

df["category_id"] = df[selected_classification].factorize()[0]

print(df[selected_classification].value_counts())

Scarcity         678
Social Proof     314
Misdirection     237
Urgency          237
Obstruction       30
Sneaking          12
Forced Action      4
Name: Pattern Category, dtype: int64


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    df['Pattern String'], df[selected_classification], train_size=.8)

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())
])


In [14]:

# You can perform grid search for hyperparameter tuning
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [None, 10, 20, 30],
}

grid_search = GridSearchCV(text_clf, parameters, cv=2)
grid_search.fit(X_train, y_train)



In [18]:
# Print the best parameters found by grid search
print("Best parameters found: ", grid_search.best_params_)



Best parameters found:  {'clf__max_depth': 30, 'clf__n_estimators': 200, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}


In [19]:
# Fit the model with the best parameters
best_clf = grid_search.best_estimator_
best_clf.fit(X_train, y_train)



In [20]:
# Make predictions on the test set
y_pred = best_clf.predict(X_test)

# Evaluate the accuracy and other metrics
print("Accuracy:", metrics.accuracy_score(y_pred, y_test))
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.933993399339934
Classification Report:
               precision    recall  f1-score   support

Forced Action       0.00      0.00      0.00         2
 Misdirection       0.91      0.86      0.89        50
  Obstruction       0.67      1.00      0.80         4
     Scarcity       0.94      0.99      0.96       136
     Sneaking       0.00      0.00      0.00         2
 Social Proof       0.98      0.98      0.98        65
      Urgency       0.90      0.84      0.87        44

     accuracy                           0.93       303
    macro avg       0.63      0.67      0.64       303
 weighted avg       0.92      0.93      0.93       303



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
