In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from joblib import dump

selected_classification = "Pattern Category"

df = pd.read_csv('dark_patterns.csv')

df = df[pd.notnull(df["Pattern String"])]
col = ["Pattern String", selected_classification]
df = df[col]

df["category_id"] = df[selected_classification].factorize()[0]

X_train, X_test, y_train, y_test = train_test_split(
    df['Pattern String'], df[selected_classification], train_size=.7, random_state=42)

# Create a list of classifiers
classifiers = [
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machine', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier())
]

# Create an array to store accuracies
accuracies = []

for name, clf in classifiers:
    # Create a pipeline with vectorizer and classifier
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf)
    ])

    # Define the parameter grid for grid search
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
    }

    # Adjust parameters for Logistic Regression
    if name == 'Logistic Regression':
        parameters['clf__C'] = [0.001, 0.01, 0.1, 1, 10, 100]

    grid_search = GridSearchCV(text_clf, parameters, cv=StratifiedKFold(n_splits=5))
    grid_search.fit(X_train, y_train)

    # Print the best parameters found by grid search
    print(f"Best parameters for {name}: ", grid_search.best_params_)

    # Fit the model with the best parameters
    best_clf = grid_search.best_estimator_
    best_clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = best_clf.predict(X_test)

    # Calculate accuracy
    acc = metrics.accuracy_score(y_pred, y_test)
    
    # Save the result
    accuracies.append((name, acc))

# Print the results
for name, acc in accuracies:
    print(f'{name} Accuracy: {acc:.4f}')




Best parameters for Multinomial Naive Bayes:  {'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters for Logistic Regression:  {'clf__C': 10, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}




Best parameters for Support Vector Machine:  {'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}




Best parameters for Random Forest:  {'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}




Best parameters for Gradient Boosting:  {'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Multinomial Naive Bayes Accuracy: 0.9515
Logistic Regression Accuracy: 0.9692
Support Vector Machine Accuracy: 0.9626
Random Forest Accuracy: 0.9626
Gradient Boosting Accuracy: 0.9273


In [6]:
# Evaluate the best performing classifier on the test set
best_classifier_name, best_classifier_acc = max(accuracies, key=lambda x: x[1])
print(f"\nBest Performing Classifier: {best_classifier_name}, Accuracy: {best_classifier_acc:.4f}")



Best Performing Classifier: Logistic Regression, Accuracy: 0.9692


In [7]:

# Save the best performing model
dump(best_clf, 'best_text_classifier.joblib')

['best_text_classifier.joblib']