# **Import**


In [None]:
import pandas as pd
# model
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC, OneClassSVM
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# **DATA import**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data = pd.read_csv("/content/drive/MyDrive/bigdata_dataset/dark_patterns_test.csv")

# **function**

In [None]:
def tfidf(text_data):
  # TF-IDF 벡터라이저 초기화
  vectorizer = TfidfVectorizer()

  # 벡터라이저를 사용하여 텍스트 데이터를 수치 벡터로 변환
  tfidf_matrix = vectorizer.fit_transform(text_data)

  # TF-IDF 벡터를 DataFrame으로 변환
  tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

  return tfidf_df

## **scaling**

In [None]:
# data['word_count'] = data['Pattern String'].apply(lambda x: len(str(x).split()))

# # 한 단어만 존재하는 행 제거
# data = data[data['word_count'] > 1]

In [None]:
# prompt: dark_patterns_test_utf8.csv의 target열을 lable로 하고 y로감

y = data["Pattern Category"]


In [None]:
print(data['Pattern String'].value_counts())

Pattern String
Only 1 left                                                                        26
No, I don't feel lucky                                                             23
An item you ordered is in high demand. No worries, we have reserved your order.    21
Only 1 left in stock                                                               20
Only 1 left!                                                                       16
                                                                                   ..
NO, THANKS. I DON'T LIKE DISCOUNTS                                                  1
No thanks, I don't like discounts                                                   1
No thanks, I'd rather pay full price.                                               1
No thanks, I don't want the free gift.                                              1
Save $148.98AUD ??49% Off                                                           1
Name: count, Length: 1177, dtype: int64

In [None]:
texts = data['Pattern String']
dark_patterns = data['Pattern Category']

In [None]:
import numpy as np

texts = texts.replace(np.nan, '', regex=True)

In [None]:
#data['text'] = data['text'].astype(str)

In [None]:
print(texts.head())

0    Collin P. from Grandview Missouri just bought ...
1    Faith in Glendale, United States purchased a C...
2    Sharmeen Atif From Karachi just bought Stylish...
3                           9 people are viewing this.
4             5338 people viewed this in the last hour
Name: Pattern String, dtype: object


In [None]:
def preprocess_text(text):

    # 소문자 변환
    text = text.lower()

    # 특수 문자 제거
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

    # 토큰화
    tokens = word_tokenize(text)

    # 불용어 제거
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 어간 추출 또는 표제어 추출
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # 공백 제거 및 다시 문장으로 결합
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text


In [None]:
data.dtypes

Pattern String       object
Comment              object
Pattern Category     object
Pattern Type         object
Where in website?    object
Deceptive?           object
Website Page         object
dtype: object

In [None]:
data['Pattern String'] = data['Pattern String'].astype(str)

In [None]:
data['Pattern String'] = data['Pattern String'].apply(preprocess_text)

In [None]:
print(data['Pattern String'].head())

0    collin p grandview missouri bought burgundy tw...
1    faith glendale united state purchased choose f...
2    sharmeen atif karachi bought stylish metal gla...
3                                     9 people viewing
4                         5338 people viewed last hour
Name: Pattern String, dtype: object


In [None]:
from textblob import TextBlob
# def extract_text_features(preprocessed_text):
#     # 단어 빈도 피처 추출
#     count_vectorizer = CountVectorizer()
#     word_freq_features = count_vectorizer.fit_transform(preprocessed_text)

#     # TF-IDF 피처 추출
#     tfidf_vectorizer = TfidfVectorizer()
#     tfidf_vectorizer.fit(preprocessed_text)
#     tfidf_features = tfidf_vectorizer.fit_transform(preprocessed_text)

#     # 감성 점수 계산
#     sentiment_scores = preprocessed_text.apply(lambda x: TextBlob(x).sentiment.polarity)

#     # 단어 빈도 피처를 DataFrame으로 변환
#     features = pd.DataFrame(word_freq_features.toarray(), columns=count_vectorizer.get_feature_names_out())

#     # tfidf_features를 DataFrame으로 추가
#     tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
#     features = pd.concat([features, tfidf_df], axis=1)
#     features = pd.DataFrame(
#          data={
#              'sentiment': sentiment_scores,
#              'tf-idf': tfidf_features,
#          }
#      )

#     return features

# 특성 추출을 위한 함수 정의
def extract_features(text):
    # N-gram 벡터화
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.95)
    tfidf_features = tfidf_vectorizer.fit_transform(data[text])

    # 감성 점수 계산
    sentiment_scores = data[text].apply(lambda x: TextBlob(x).sentiment.polarity)

    # DataFrame으로 특성 합치기
    features = pd.DataFrame(
        data={
            'sentiment': sentiment_scores,
        }
    )
    features = features.join(pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out()))

    return features

In [None]:
print(data.columns)

Index(['Pattern String', 'Comment', 'Pattern Category', 'Pattern Type',
       'Where in website?', 'Deceptive?', 'Website Page'],
      dtype='object')


In [None]:
features = extract_features('Pattern String')

수동 추출 특성 끼워넣기

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(text):
  # TF-IDF 벡터라이저 초기화
  vectorizer = TfidfVectorizer()

  # 텍스트 데이터를 TF-IDF 벡터로 변환
  tfidf_matrix = vectorizer.fit_transform(data[text])

  # # 벡터화된 데이터 확인
  # print(tfidf_matrix.shape)

  tfidf_matrix = vectorizer.fit_transform(data[text])

  # 희소 행렬을 Dense 형태로 변환하고 데이터프레임으로 만들기
  tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

  return tfidf_df


In [None]:
# NaN 값을 빈 문자열로 대체
data['Comment'] = data['Comment'].fillna('')


In [None]:
comment = tfidf('Comment')
pattern_typr = tfidf('Pattern Type')
location = tfidf('Where in website?')
deceptive = tfidf('Deceptive?')

# 기존 데이터프레임과 병합
data = pd.concat([data, comment], axis=1)
print(data)

                                         Pattern String  \
0     collin p grandview missouri bought burgundy tw...   
1     faith glendale united state purchased choose f...   
2     sharmeen atif karachi bought stylish metal gla...   
3                                      9 people viewing   
4                          5338 people viewed last hour   
...                                                 ...   
1813                                         13290 9900   
1814                               offer valid add cart   
1815                                                nan   
1816                                                nan   
1817                                   save 14898aud 49   

                                                Comment Pattern Category  \
0                                        Periodic popup     Social Proof   
1                                        Periodic popup     Social Proof   
2                                        Periodic popup     Soc

In [None]:
# features['Comment'] = data['Comment']
# features['Pattern Type'] = data['Pattern Type']
# features['location'] = data['Where in website?']
# features['Deceptive'] = data ['Deceptive?']

In [None]:
X = features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(y_train.value_counts())

Pattern Category
Scarcity         546
Urgency          393
Social Proof     266
Misdirection     198
Obstruction       24
Sneaking          21
Forced Action      6
Name: count, dtype: int64


# **Model**

## **로지스틱 회귀**

In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
y_pred = model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9010989010989011


In [None]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

Misdirection       0.93      0.75      0.83        72
 Obstruction       1.00      0.71      0.83         7
    Scarcity       0.97      0.98      0.98       133
    Sneaking       0.00      0.00      0.00         5
Social Proof       0.90      0.97      0.93        59
     Urgency       0.79      0.92      0.85        88

    accuracy                           0.90       364
   macro avg       0.77      0.72      0.74       364
weighted avg       0.89      0.90      0.89       364



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## **Random Forest**


In [None]:
RFC = RandomForestClassifier(random_state=42)
RFC.fit(X_train, y_train)

In [None]:
y_pred = RFC.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9038461538461539


In [None]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

Misdirection       0.93      0.78      0.85        72
 Obstruction       1.00      0.86      0.92         7
    Scarcity       0.97      0.97      0.97       133
    Sneaking       0.00      0.00      0.00         5
Social Proof       0.97      0.97      0.97        59
     Urgency       0.76      0.92      0.84        88

    accuracy                           0.90       364
   macro avg       0.77      0.75      0.76       364
weighted avg       0.90      0.90      0.90       364



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## **SVM**

In [None]:
import time

start = time.time()

svm = OneVsRestClassifier(SVC(kernel="rbf", gamma=1, C=10))
svm.fit(X_train, y_train)

print("time :", time.time() - start)

time : 4.3225884437561035


In [None]:
y_pred = svm.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9065934065934066


In [None]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

Misdirection       0.95      0.75      0.84        72
 Obstruction       1.00      0.86      0.92         7
    Scarcity       0.96      0.98      0.97       133
    Sneaking       0.00      0.00      0.00         5
Social Proof       0.93      0.97      0.95        59
     Urgency       0.80      0.93      0.86        88

    accuracy                           0.91       364
   macro avg       0.77      0.75      0.76       364
weighted avg       0.90      0.91      0.90       364



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
