In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
data = pd.read_csv("/content/sample_data/dataset1.csv")


In [13]:
# Text preprocessing: Tokenization, stemming, and lemmatization
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [14]:
def preprocess_text(text):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]  # Remove non-alphabetic characters
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [stemmer.stem(word) for word in words]  # Stemming
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    return ' '.join(words)

In [19]:
data['processed_text'] = data['text'].apply(preprocess_text)

# Separate features (text) and label
X = data['processed_text']
y = data['label']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [22]:
classifier = MultinomialNB()
classifier.fit(X_train_bow, y_train)


In [23]:
y_pred = classifier.predict(X_test_bow)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.90


In [24]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
                   precision    recall  f1-score   support

    Misdirection       0.85      0.76      0.81        38
Not Dark Pattern       0.93      0.91      0.92       244
     Obstruction       1.00      0.83      0.91         6
        Scarcity       0.89      0.99      0.94        81
        Sneaking       0.00      0.00      0.00         2
    Social Proof       0.83      0.93      0.88        54
         Urgency       0.84      0.79      0.81        47

        accuracy                           0.90       472
       macro avg       0.76      0.74      0.75       472
    weighted avg       0.90      0.90      0.90       472



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
