<a href="https://colab.research.google.com/github/waaterr1208/bigdata_security/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [61]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [114]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [141]:
!ls darkpatterntest_v1.csv

darkpatterntest_v1.csv


In [142]:
!file -i darkpatterntest_v1.csv

darkpatterntest_v1.csv: text/csv; charset=utf-8


In [149]:
!iconv -f latin1 -t utf8 darkpatterntest_v1.csv >darkpatterntest_v1_utf8.csv

In [150]:
data = pd.read_csv("darkpatterntest_v1_utf8.csv")

In [151]:
data['word_count'] = data['text'].apply(lambda x: len(str(x).split()))

# 한 단어만 존재하는 행 제거
data = data[data['word_count'] > 1]

In [152]:
# prompt: dark_patterns_test_utf8.csv의 target열을 lable로 하고 y로감

y = data["target"]


In [153]:
print(data['target'].value_counts())

target
0    3968
1    1500
Name: count, dtype: int64


In [157]:
texts = data['text']
dark_patterns = data['target']
urls = data['url']

In [158]:
import numpy as np

texts = texts.replace(np.nan, '', regex=True)

In [None]:
#data['text'] = data['text'].astype(str)

In [159]:
print(texts.head())

0                          Status:Only 3 left in stock
1                                          Side Tables
2    Sprint customers can call T-Mobile Care or vis...
3                             Microsoft Power Platform
4    Â¤While supplies last, order and purchase a Ga...
Name: text, dtype: object


In [160]:
def preprocess_text(text):

    # 소문자 변환
    text = text.lower()

    # 특수 문자 제거
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

    # 토큰화
    tokens = word_tokenize(text)

    # 불용어 제거
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 어간 추출 또는 표제어 추출
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # 공백 제거 및 다시 문장으로 결합
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text


In [161]:
data.dtypes

text          object
target        object
url           object
word_count     int64
dtype: object

In [162]:
data['text'] = data['text'].astype(str)

In [163]:
data['preprocessed_text'] = data['text'].apply(preprocess_text)

In [164]:
print(data['preprocessed_text'].head())

0                              statusonly 3 left stock
1                                           side table
2    sprint customer call tmobile care visit select...
3                             microsoft power platform
4    â¤while supply last order purchase galaxy watc...
Name: preprocessed_text, dtype: object


In [63]:
#url_presence = [1 if urlparse(url).netloc else 0 for url in urls]

In [165]:
def extract_text_features(preprocessed_text):
    # 단어 빈도 피처 추출
    count_vectorizer = CountVectorizer()
    word_freq_features = count_vectorizer.fit_transform(texts)

    # TF-IDF 피처 추출
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(texts)
    tfidf_features = tfidf_vectorizer.fit_transform(texts)

    # 문장 길이 피처 추출
    text_lengths = [len(text) for text in texts]

    # URL 포함 여부 피처 추가
    #features = pd.DataFrame(word_freq_features.toarray(), columns=count_vectorizer.get_feature_names_out())

    # 단어 빈도 피처를 DataFrame으로 변환
    features = pd.DataFrame(word_freq_features.toarray(), columns=count_vectorizer.get_feature_names_out())

    # tfidf_features를 DataFrame으로 추가
    tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
    features = pd.concat([features, tfidf_df], axis=1)

    # text_length를 새 열로 추가
    features['text_length'] = text_lengths

    # url_presence를 새 열로 추가
    #features['url_presence'] = url_presence

    return features

In [166]:
features = extract_text_features(texts)

In [167]:
print(features)

      00  000  0000  001  00am  00days00hours47minutes36seconds  00footnote  \
0      0    0     0    0     0                                0           0   
1      0    0     0    0     0                                0           0   
2      0    0     0    0     0                                0           0   
3      0    0     0    0     0                                0           0   
4      0    0     0    0     0                                0           0   
...   ..  ...   ...  ...   ...                              ...         ...   
5463   0    0     0    0     0                                0           0   
5464   0    0     0    0     0                                0           0   
5465   0    0     0    0     0                                0           0   
5466   0    0     0    0     0                                0           0   
5467   0    0     0    0     0                                0           0   

      00hours  00msrp  00or  ...   â¹   ê¾   ëª   ë

In [175]:
X = features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [176]:
print(y_train.value_counts())

target
0    3185
1    1189
Name: count, dtype: int64


In [177]:
model = LogisticRegression()
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [178]:
y_pred = model.predict(X_test)

In [179]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9661791590493601


In [181]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       783
           1       0.96      0.92      0.94       311

    accuracy                           0.97      1094
   macro avg       0.96      0.95      0.96      1094
weighted avg       0.97      0.97      0.97      1094

