In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources used by the preprocessing step: tokenizer models,
# the English stopword list and WordNet (for lemmatization).
# 'punkt_tab' is the tokenizer data that recent NLTK releases load inside
# word_tokenize; 'punkt' is kept so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from google.colab import drive

# Attach Google Drive to the runtime so files under /content/drive can be opened.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import chardet

# Guess the CSV's encoding from a sample of its leading bytes (file opened in binary mode).
# NOTE(review): this path is not the one passed to pd.read_csv in the next cell —
# confirm both refer to the same data.
SAMPLE_BYTES = 10000
with open("/content/drive/MyDrive/bigdata/darkpattern_dataset_0601.csv", 'rb') as raw_file:
    head_bytes = raw_file.read(SAMPLE_BYTES)

result = chardet.detect(head_bytes)
print(result['encoding'])

utf-8


In [None]:
# Read the dataset CSV into a DataFrame.
# NOTE(review): the text preview printed further down shows garbled characters in
# the first row; confirm the file's encoding and pass encoding=... if needed.
data = pd.read_csv("/content/dark_patterns_test.csv")

In [None]:
# Count whitespace-separated words per row (values are stringified first, so
# non-string entries do not raise).
data['word_count'] = data['text'].apply(lambda x: len(str(x).split()))

# Keep only rows with more than one word. An explicit copy is taken because
# later cells assign new columns to `data`; without it those assignments would
# target a slice of the unfiltered frame and can raise SettingWithCopyWarning.
data = data[data['word_count'] > 1].copy()

In [None]:
# prompt: use the target column of dark_patterns_test_utf8.csv as the label and assign it to y

y = data["target"]


In [None]:
# Show how many rows fall into each target class.
print(data['target'].value_counts())

target
non-darkpattern    2171
Scarcity            344
Social Proof        312
Urgency             196
Misdirection        180
Obstruction          27
Sneaking             12
Forced Action         4
Name: count, dtype: int64


In [None]:
# Pull the text, label and URL columns out as separate Series.
texts, dark_patterns, urls = data['text'], data['target'], data['url']

In [None]:
import numpy as np

# Replace missing text entries with empty strings.
texts = texts.fillna('')

In [None]:
#data['text'] = data['text'].astype(str)

In [None]:
# Preview the first few text entries.
print(texts.head())

0    I??�e read and accept the terms & conditions *...
1    I accept the Terms & Conditions of the FREE Br...
2    I would like to join Backstage Pass & agree to...
3    I agree to receive marketing emails from Natur...
4                        No thanks! I don't like deals
Name: text, dtype: object


In [None]:
# Lazily-built NLTK resources shared by every preprocess_text call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip ASCII punctuation, tokenize with
    ``word_tokenize``, drop English stopwords, lemmatize with WordNet.

    Parameters
    ----------
    text : str
        Raw document. ``None`` and float NaN are treated as an empty
        document; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' for missing input).
    """
    # Missing values (None / NaN) would otherwise fail on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase
    text = str(text).lower()

    # Remove punctuation. This runs before tokenization, so contractions lose
    # their apostrophe (e.g. "don't" becomes "dont").
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # The stopword set and lemmatizer are built once and reused across calls
    # instead of being reloaded for every document.
    stop_words, lemmatizer = _preprocess_resources()

    # Drop stopwords, lemmatize what is left, and join back into one string.
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )


In [None]:
# Inspect the dtype of every column.
data.dtypes

text           object
target         object
Unnamed: 2    float64
url            object
word_count      int64
dtype: object

In [None]:
# Force the text column to str so preprocess_text can call string methods on every row.
data['text'] = data['text'].astype(str)

In [None]:
# Clean every document with preprocess_text and store the result in a new column.
data['preprocessed_text'] = data['text'].apply(preprocess_text)

In [None]:
# Preview the cleaned text.
print(data['preprocessed_text'].head())

0    i�e read accept term condition also authorize ...
1    accept term condition free brother care progra...
2    would like join backstage pas agree term condi...
3    agree receive marketing email natural life agr...
4                                thanks dont like deal
Name: preprocessed_text, dtype: object


In [None]:
from textblob import TextBlob
def extract_text_features(preprocessed_text):
    """Build a feature table from preprocessed documents.

    Three feature groups are combined, one row per document:

    * raw term counts from ``CountVectorizer`` (columns prefixed ``count_``),
    * TF-IDF weights from ``TfidfVectorizer`` (columns prefixed ``tfidf_``),
    * the TextBlob polarity score (column ``sentiment``).

    Parameters
    ----------
    preprocessed_text : pandas.Series of str
        Cleaned documents. Its index is reused for the returned frame.

    Returns
    -------
    pandas.DataFrame
        Dense feature matrix indexed like ``preprocessed_text``.

    Notes
    -----
    Both vectorizers are fit on whatever is passed in. Calling this on the
    full dataset before a train/test split lets vocabulary and IDF statistics
    from the test rows influence the training features.
    """
    # Every sub-frame shares the input's index so the column-wise concat lines
    # rows up correctly even when the index has gaps (e.g. after filtering).
    row_index = preprocessed_text.index

    # Term-frequency features
    count_vectorizer = CountVectorizer()
    word_freq_features = count_vectorizer.fit_transform(preprocessed_text)

    # TF-IDF features (fit_transform already fits; no separate fit call needed)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_features = tfidf_vectorizer.fit_transform(preprocessed_text)

    # Sentiment polarity per document
    sentiment_scores = preprocessed_text.apply(lambda x: TextBlob(x).sentiment.polarity)

    # Both vectorizers yield the same vocabulary, so prefix the column names to
    # keep them distinct in the combined frame.
    count_df = pd.DataFrame(
        word_freq_features.toarray(),
        index=row_index,
        columns=['count_' + term for term in count_vectorizer.get_feature_names_out()],
    )
    tfidf_df = pd.DataFrame(
        tfidf_features.toarray(),
        index=row_index,
        columns=['tfidf_' + term for term in tfidf_vectorizer.get_feature_names_out()],
    )
    sentiment_df = sentiment_scores.rename('sentiment').to_frame()

    # Return all three groups together rather than the sentiment column alone.
    features = pd.concat([count_df, tfidf_df, sentiment_df], axis=1)

    return features

# # 특성 추출을 위한 함수 정의
# def extract_features(df):
#     # N-gram 벡터화
#     tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.95)
#     tfidf_features = tfidf_vectorizer.fit_transform(df['text'])

#     # 감성 점수 계산
#     sentiment_scores = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

#     # 텍스트 길이
#     text_length = df['text'].apply(len)

#     # DataFrame으로 특성 합치기
#     features = pd.DataFrame(
#         data={
#             'sentiment': sentiment_scores,
#             'text_length': text_length
#         }
#     )
#     features = features.join(pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out()))

#     return features

In [None]:
# List the columns currently present in the frame.
print(data.columns)

Index(['text', 'target', 'Unnamed: 2', 'url', 'word_count',
       'preprocessed_text'],
      dtype='object')


In [None]:
# Build the model's feature table from the cleaned text.
features = extract_text_features(data['preprocessed_text'])

In [None]:
X = features
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Class distribution of the training labels.
print(y_train.value_counts())

target
non-darkpattern    1723
Scarcity            277
Social Proof        266
Urgency             164
Misdirection        134
Obstruction          19
Sneaking             10
Forced Action         3
Name: count, dtype: int64


In [None]:
# Fit a logistic-regression classifier with default settings on the training split.
# NOTE(review): if a ConvergenceWarning appears, pass a larger max_iter.
model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6892307692307692


In [None]:
print("Classification Report:")
# zero_division=0 reports 0.0 for classes that receive no predictions instead of
# emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))

Classification Report:
                 precision    recall  f1-score   support

  Forced Action       0.00      0.00      0.00         1
   Misdirection       0.00      0.00      0.00        46
    Obstruction       0.00      0.00      0.00         8
       Scarcity       0.00      0.00      0.00        67
       Sneaking       0.00      0.00      0.00         2
   Social Proof       0.00      0.00      0.00        46
        Urgency       0.00      0.00      0.00        32
non-darkpattern       0.69      1.00      0.82       448

       accuracy                           0.69       650
      macro avg       0.09      0.12      0.10       650
   weighted avg       0.48      0.69      0.56       650



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
