In [None]:
import os
import random
import time
import datetime
import torch
import argparse

import pandas as pd
import numpy as np

!pip install scikit-learn
!pip install transformers==4.36.2
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, BertConfig, AutoModelForSequenceClassification, DistilBertTokenizer
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
!pip install keras
!pip install tensorflow
from keras.preprocessing.sequence import pad_sequences
import nltk
from sklearn.utils.class_weight import compute_class_weight
nltk.download('punkt_tab')
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Mount Google Drive at /content/drive (Colab only); the training cell writes its
# logs, checkpoints and plots below this mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Open Colab's upload dialog and keep its result in `uploaded`.
# NOTE(review): the recorded output shows this upload was saved as
# "final_dataset (1).csv", whereas Args.raw_data reads "final_dataset.csv" —
# confirm the file that is actually loaded is the intended copy.
from google.colab import files
uploaded = files.upload()


Saving final_dataset.csv to final_dataset (1).csv


In [None]:
def load_data(args):
    """Read the raw CSV and return the rows belonging to the requested topics.

    Parameters
    ----------
    args : object
        Must expose ``raw_data`` (path of the CSV file) and ``topic``
        (collection of integer topic ids to keep).

    Returns
    -------
    tuple
        ``(documents, labels, user_id)`` — three parallel lists taken from the
        ``original_text``, ``label`` and ``user_name`` columns of the kept rows.
    """
    frame = pd.read_csv(args.raw_data)
    # Topic ids are compared as integers, so normalise the column first.
    frame['topic'] = frame['topic'].astype(int)
    print(args.topic)
    selected = frame.loc[frame['topic'].isin(args.topic)]
    wanted_columns = ('original_text', 'label', 'user_name')
    return tuple(selected[column].tolist() for column in wanted_columns)

In [None]:
def tokenization(args, document):
    """Convert every sentence of ``document`` into a list of vocabulary ids.

    Parameters
    ----------
    args : object
        Must expose ``model`` ('Bert' or 'DistilBert') and ``use_special_token``.
    document : iterable of str
        Sentences to tokenize.

    Returns
    -------
    list of list of int
        One id list per sentence, neither padded nor truncated.

    Raises
    ------
    ValueError
        If ``args.model`` is not one of the supported names. Previously this
        surfaced later as an UnboundLocalError on ``tokenizer``.
    """
    if args.model == 'Bert':
        # NOTE(review): the linguistic marker tokens are only registered in the
        # DistilBert branch; with 'Bert' they are never added to the
        # vocabulary — confirm this is intended.
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased',
            do_lower_case=False,
        )
    elif args.model == 'DistilBert':
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased',
            do_lower_case=False,
        )
        if args.use_special_token:
            # Register the markers emitted by deal_with_special_token so that
            # each one is kept as a single token.
            special_tokens_dict = {"additional_special_tokens": ["[single_first_person]", "[plural_first_person]","[third_person]", "[conjunctions]", "[auxiliary_verbs]", "[prepositions]"]}
            tokenizer.add_special_tokens(special_tokens_dict)
    else:
        raise ValueError(
            f"Unsupported model {args.model!r}; expected 'Bert' or 'DistilBert'"
        )

    return [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        for sentence in document
    ]

In [None]:
def padding(ids, args):
    """Bring every id sequence to exactly ``args.max_len`` entries.

    Both truncation and padding are requested at the end of the sequence
    ('post'); the result is whatever ``pad_sequences`` returns for dtype "long".
    """
    pad_options = dict(
        maxlen=args.max_len,
        dtype="long",
        truncating='post',
        padding='post',
    )
    return pad_sequences(ids, **pad_options)

In [None]:
import re
!pip install emoji
import emoji
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def clean_texts(texts, remove_stopwords):
    """Normalise raw texts and drop tokens that occur only once in the corpus.

    Per text: remove the retweet marker, lower-case, strip emoji, links and
    punctuation, tokenize with NLTK, and optionally drop English stop words.
    Afterwards every token whose corpus-wide count is 1 (a hapax) is removed.

    Parameters
    ----------
    texts : iterable
        Raw texts. ``None`` / NaN entries are treated as empty strings so the
        output stays aligned with the input.
    remove_stopwords : bool
        Whether NLTK's English stop words are filtered out.

    Returns
    -------
    list of str
        One space-joined token string per input text, in input order.
    """
    stop_words = set(stopwords.words('english')) if remove_stopwords else frozenset()

    tokenized_texts = []
    token_counts = Counter()
    for text in texts:
        # Missing values (None or float NaN) become empty documents.
        if text is None or (isinstance(text, float) and text != text):
            text = ""
        text = str(text)

        # Remove the retweet marker BEFORE lower-casing: the pattern is
        # upper-case and could never match once the text was lowered. The word
        # boundary keeps words that merely end in "RT" intact.
        text = re.sub(r"\bRT\s+", "", text)
        text = text.lower()
        text = emoji.replace_emoji(text, replace='')  # drop emoji
        text = re.sub(r"http\S+", "", text)  # drop links
        text = re.sub(r"[^\w\s]", "", text)  # drop punctuation

        tokens = nltk.word_tokenize(text)
        if remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]
        token_counts.update(tokens)
        tokenized_texts.append(tokens)

    # Hapax removal: tokens seen exactly once across all texts.
    hapaxes = {word for word, count in token_counts.items() if count == 1}

    return [
        " ".join(word for word in tokens if word not in hapaxes)
        for tokens in tokenized_texts
    ]


In [None]:
# spaCy English pipeline, loaded once here and used by deal_with_special_token.
nlp = spacy.load("en_core_web_sm")
# Lower-cased pronoun forms grouped by category; deal_with_special_token looks
# tokens up in these tuples to choose the replacement marker.
# NOTE(review): preprocess runs clean_texts first, which strips every character
# matching [^\w\s] (including "@"), so the "@user" entry presumably never
# matches — confirm.
FEATURES = {
    'single_first_person': ("i", "me", "my", "mine", "myself"),
    'plural_first_person': ("we", "us", "our", "ours", "ourselves"),
    'third_person': ("@user", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves")
}

def _marker_for(token):
    """Return the marker that replaces a spaCy ``token``, or None to keep it."""
    lowered = token.text.lower()

    if token.pos_ == "PRON":
        if lowered in FEATURES['single_first_person']:
            return "[single_first_person]"
        if lowered in FEATURES['plural_first_person']:
            return "[plural_first_person]"
        if lowered in FEATURES['third_person']:
            return "[third_person]"
        # Pronouns outside the three lists are kept as they are.
        return None
    if token.pos_ == "CCONJ":
        return "[conjunctions]"
    if token.tag_ == "MD" or token.dep_ == "aux":
        return "[auxiliary_verbs]"
    if token.pos_ == "ADP":
        return "[prepositions]"
    return None


def deal_with_special_token(args, document):
    """Wrap each sentence in [CLS]/[SEP] and optionally insert linguistic markers.

    Parameters
    ----------
    args : object
        Must expose ``use_special_token``. When true, pronouns listed in
        FEATURES, coordinating conjunctions, modal/auxiliary verbs and
        adpositions are replaced by bracketed marker tokens.
    document : iterable
        Sentences; each one is converted with ``str()`` before processing.

    Returns
    -------
    list of str
        One string per sentence, laid out as "[CLS] <text> [SEP]". Both
        branches now use the same space-separated layout (the plain branch used
        to glue the markers directly onto the text).
    """
    if not args.use_special_token:
        return ["[CLS] " + str(sentence) + " [SEP]" for sentence in document]

    processed_sentences = []
    # nlp.pipe streams the texts through spaCy in batches and yields the docs
    # in input order, instead of one pipeline call per sentence.
    for doc in nlp.pipe(str(sentence) for sentence in document):
        new_tokens = [_marker_for(token) or token.text for token in doc]
        processed_sentences.append("[CLS] " + " ".join(new_tokens) + " [SEP]")
    return processed_sentences


In [None]:
def preprocess(args):
    """Run the full text pipeline and return model-ready arrays.

    Steps: load the rows for ``args.topic``, clean the texts, add the
    [CLS]/[SEP] wrapping (and optional markers), tokenize, pad, and build the
    attention masks.

    Returns
    -------
    tuple
        ``(ids, masks, labels, user_ids)`` where ``user_ids`` is a numpy array
        of the ``user_name`` values aligned with the other three.
    """
    # 1. Load the data (user names included).
    raw_documents, labels, user_ids = load_data(args)

    # 2. Text clean-up followed by marker insertion.
    cleaned_documents = clean_texts(raw_documents, args.remove_stopwords)
    marked_documents = deal_with_special_token(args, cleaned_documents)

    # 3. Tokenize, pad and derive the attention masks.
    padded_ids = padding(tokenization(args, marked_documents), args)
    masks = attention_mask(padded_ids)

    # 4. user_ids is handed back as a numpy array.
    return padded_ids, masks, labels, np.array(user_ids)


In [None]:
def attention_mask(ids):
    """Build one mask per id sequence: 1.0 where the id is > 0, else 0.0.

    Returns a list of lists of floats with the same layout as ``ids``.
    """
    return [[float(token_id > 0) for token_id in sequence] for sequence in ids]

In [None]:
def train_test_data_split(ids, masks, labels, user_ids):
    """Split the samples 80/20 by user so no user appears on both sides.

    The unique values of ``user_ids`` are split with a fixed random state (42);
    every sample then follows its user into the train or test partition.

    Returns
    -------
    tuple
        ``(train_ids, train_masks, train_labels, test_ids, test_masks,
        test_labels)`` as numpy arrays.
    """
    # Lists are converted to numpy arrays so boolean indexing works.
    ids, masks, labels, user_ids = (
        np.array(values) for values in (ids, masks, labels, user_ids)
    )

    # Split on the user level.
    train_users, test_users = train_test_split(
        np.unique(user_ids), test_size=0.2, random_state=42
    )

    partitions = []
    for users in (train_users, test_users):
        selector = np.isin(user_ids, users)
        partitions.extend((ids[selector], masks[selector], labels[selector]))
    return tuple(partitions)

In [None]:
def build_dataloader(ids, masks, label, args):
    """Wrap ids, masks and labels in a randomly sampled DataLoader.

    The three inputs are converted to tensors and batched with
    ``args.batch_size``; batches are drawn through a RandomSampler.
    """
    tensors = [torch.tensor(values) for values in (ids, masks, label)]
    dataset = TensorDataset(*tensors)
    return DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=RandomSampler(dataset),
    )

In [None]:
def build_model(args):
    """Create the sequence classifier selected by ``args.model`` and place it on a device.

    Parameters
    ----------
    args : object
        Must expose ``model`` ('Bert' or 'DistilBert'). The DistilBert branch
        also reads ``p`` (dropout probability) and ``use_special_token``.
        ``num_labels`` is used when present and defaults to 2.

    Returns
    -------
    tuple
        ``(model, device)``; the model has already been moved to ``device``.

    Raises
    ------
    ValueError
        If ``args.model`` is not supported. Previously this ended in an
        UnboundLocalError at the return statement.
    """
    if args.model not in ('Bert', 'DistilBert'):
        raise ValueError(
            f"Unsupported model {args.model!r}; expected 'Bert' or 'DistilBert'"
        )

    num_labels = getattr(args, 'num_labels', 2)
    # Same fallback rule as run(), so tensors created there share this device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if args.model == 'Bert':
        model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=num_labels,
        )

        # Freeze the BERT encoder: no gradients are computed for it, so only
        # the classification head is trained.
        for param in model.bert.parameters():
            param.requires_grad = False

        # (Optional) show the classifier structure.
        print(model.classifier)
    else:
        # Dropout must be supplied at load time: from_pretrained forwards
        # keyword arguments that match config attributes into the config
        # before the layers are built. Assigning model.config.dropout
        # afterwards does not change the dropout modules already created.
        model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased",
            num_labels=num_labels,
            dropout=args.p,
            attention_dropout=args.p,
        )
        if args.use_special_token:
            # Presumably the base vocabulary plus the 6 marker tokens that
            # tokenization registers — TODO confirm against len(tokenizer).
            model.resize_token_embeddings(30528)

    if device.type == "cuda":
        print(f"{torch.cuda.get_device_name(0)} available")
    else:
        print("CUDA not available; using CPU")
    model = model.to(device)

    return model, device

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter

def test(test_dataloader, model, device):
    """Evaluate ``model`` on ``test_dataloader`` and report accuracy and macro F1.

    Parameters
    ----------
    test_dataloader : iterable
        Yields ``(ids, masks, labels)`` tensor batches.
    model : torch.nn.Module
        Classifier whose output exposes ``.logits``.
    device : torch.device
        Device the batches are moved to before the forward pass.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1)`` computed over all samples. Accuracy is
        sample-level; the earlier mean of per-batch accuracies over-weighted a
        smaller final batch.

    Raises
    ------
    ValueError
        If the dataloader yields no samples.
    """
    # Switch to evaluation mode.
    model.eval()

    all_preds = []
    all_trues = []

    for batch in test_dataloader:
        # Move the batch to the target device.
        ids, masks, labels = (tensor.to(device) for tensor in batch)

        # No gradients are needed for evaluation. token_type_ids is not passed;
        # the previous BERT-specific branch only supplied None for it.
        with torch.no_grad():
            outputs = model(input_ids=ids, attention_mask=masks)

        preds = torch.argmax(outputs.logits, dim=1)

        # Accumulate plain Python ints so the Counter output below is readable.
        all_preds.extend(preds.cpu().tolist())
        all_trues.extend(labels.cpu().tolist())

    if not all_trues:
        raise ValueError("test_dataloader yielded no samples")

    # Sample-level accuracy and macro F1 over the whole evaluation set.
    avg_accuracy = accuracy_score(all_trues, all_preds)
    f1 = f1_score(all_trues, all_preds, average='macro')

    print(f"Test AVG Accuracy : {avg_accuracy:.4f}")
    print(f"Test Macro F1 Score : {f1:.4f}")
    print("예측 분포:", Counter(all_preds))
    print("정답 분포:", Counter(all_trues))

    return avg_accuracy, f1

In [None]:
import matplotlib.pyplot as plt
def train(train_dataloader, test_dataloader, args, class_weights):
    """Fine-tune the model chosen by ``args`` and evaluate it after every epoch.

    After each epoch the function appends one row to ``training_log.csv``,
    saves a checkpoint of the model's state dict, and at the end stores a
    train/test accuracy plot. All files go to the results directory on the
    mounted drive.

    Parameters
    ----------
    train_dataloader, test_dataloader : DataLoader
        Yield ``(ids, masks, labels)`` tensor batches.
    args : object
        Must expose ``lr``, ``p``, ``batch_size``, ``epochs``, ``seed_val``,
        ``remove_stopwords`` and ``use_special_token`` (plus whatever
        ``build_model`` reads).
    class_weights : torch.Tensor
        Per-class weights for the cross-entropy loss; must live on the same
        device as the model.

    Raises
    ------
    ValueError
        If ``train_dataloader`` has no batches.
    """
    if len(train_dataloader) == 0:
        raise ValueError("train_dataloader is empty; nothing to train on")

    # Fix the seeds BEFORE the model is built so that any weights build_model
    # initialises randomly are reproducible too (they used to be drawn before
    # seeding).
    random.seed(args.seed_val)
    np.random.seed(args.seed_val)
    torch.manual_seed(args.seed_val)
    torch.cuda.manual_seed_all(args.seed_val)

    model, device = build_model(args)

    # Optimizer.
    optimizer = AdamW(model.parameters(), lr=args.lr, eps=1e-8)

    # Linear learning-rate decay without warm-up over all optimisation steps.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_dataloader) * args.epochs,
    )

    save_dir = '/content/drive/MyDrive/results_selected_250613'
    os.makedirs(save_dir, exist_ok=True)
    csv_path = os.path.join(save_dir, "training_log.csv")
    if os.path.exists(csv_path):
        df_log = pd.read_csv(csv_path)
    else:
        # Columns match the keys of the per-epoch record written below (the
        # previous header listed 'train_f1', which was never recorded, and
        # omitted 'train_loss').
        df_log = pd.DataFrame(columns=[
            'lr', 'p', 'b', 'remove_stopwords', 'use_special_token',
            'epoch', 'train_acc', 'train_loss', 'test_acc', 'test_f1'
        ])

    # Shared name stem for the checkpoints and the plot of this configuration.
    setting_str = f"lr_{args.lr}_p_{args.p}_b_{args.batch_size}_remove_stop_{args.remove_stopwords}_use_special_{args.use_special_token}"

    # The loss module does not change between steps, so build it once.
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)

    # Clear gradients before the first step.
    model.zero_grad()
    train_accuracies = []
    test_accuracies = []
    for epoch in range(args.epochs):
        # Training mode.
        model.train()

        # Running totals for this epoch.
        total_loss, correct, seen = 0.0, 0, 0
        print("-"*30)
        for step, batch in enumerate(train_dataloader):
            if step % 500 == 0:
                print(f"Epoch : {epoch+1} in {args.epochs} / Step : {step}")

            # Move the batch to the model's device.
            ids, masks, labels = (tensor.to(device) for tensor in batch)

            # Forward pass. token_type_ids is not passed; the previous
            # BERT-specific branch only supplied None for it.
            outputs = model(input_ids=ids, attention_mask=masks)
            logits = outputs.logits
            loss = loss_fct(logits, labels)
            total_loss += loss.item()

            # Count correct predictions so the epoch accuracy is sample-level
            # and not skewed by a smaller final batch.
            correct += (torch.argmax(logits, dim=1) == labels).sum().item()
            seen += labels.size(0)

            # Backward pass with gradient clipping.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Parameter update followed by the learning-rate schedule.
            optimizer.step()
            scheduler.step()

            # Clear gradients for the next step.
            model.zero_grad()

        # Per-epoch loss and accuracy.
        avg_loss = total_loss / len(train_dataloader)
        avg_accuracy = correct / seen
        print(f" {epoch+1} Epoch Average train loss :  {avg_loss}")
        print(f" {epoch+1} Epoch Average train accuracy :  {avg_accuracy}")
        train_accuracies.append(avg_accuracy)

        # Evaluate on the test split.
        acc, f1 = test(test_dataloader, model, device)
        test_accuracies.append(acc)

        record = {
            'lr': args.lr,
            'p': args.p,
            'b': args.batch_size,
            'remove_stopwords': args.remove_stopwords,
            'use_special_token': args.use_special_token,
            'epoch': epoch + 1,
            'train_acc': avg_accuracy,
            'train_loss': avg_loss,
            'test_acc': acc,
            'test_f1': f1
        }

        # Append the record and persist after every epoch so an interrupted
        # run keeps its history.
        df_log = pd.concat([df_log, pd.DataFrame([record])], ignore_index=True)
        df_log.to_csv(csv_path, index=False)
        print(f"Saved training log to {csv_path}")

        checkpoint_path = os.path.join(save_dir, f'{setting_str}_epoch_{epoch+1}_evalAcc_{acc*100:.0f}_f1_{f1*100:.0f}.pth')
        torch.save(model.state_dict(), checkpoint_path)
        print('Saved checkpoint:', checkpoint_path)

    epochs = list(range(1, args.epochs + 1))
    fig, ax = plt.subplots()
    ax.plot(epochs, train_accuracies, label='Train Accuracy')
    ax.plot(epochs, test_accuracies, label='Test Accuracy')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.set_title('Train vs Test Accuracy')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()

    # Save inside save_dir. The previous string concatenation had no path
    # separator, so the image was written next to the directory instead.
    fig.savefig(os.path.join(save_dir, f"{setting_str}.png"))
    plt.show()
    # Release the figure so repeated runs in one session do not accumulate.
    plt.close(fig)

In [None]:
def run(args):
    """Execute one experiment: preprocess, split by user, weight classes, train.

    Class weights are computed with the 'balanced' scheme on the training
    labels and moved to CUDA when available, otherwise to the CPU.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ids, masks, labels, user_ids = preprocess(args)
    split = train_test_data_split(ids, masks, labels, user_ids)
    train_ids, train_masks, train_labels = split[:3]
    test_ids, test_masks, test_labels = split[3:]

    balanced = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels,
    )
    class_weights = torch.tensor(balanced, dtype=torch.float).to(device)

    print("# of trainset", len(train_ids), "# of testset", len(test_ids))

    train_dataloader = build_dataloader(train_ids, train_masks, train_labels, args)
    test_dataloader = build_dataloader(test_ids, test_masks, test_labels, args)
    train(train_dataloader, test_dataloader, args, class_weights)

In [None]:
class Args:
    """Plain container for the settings of one experiment.

    The constructor arguments are the swept values; everything else gets a
    fixed default that callers may overwrite on the instance afterwards.
    Note that ``special_token`` is stored as ``use_special_token``.
    """

    def __init__(self, batch_size, remove_stopwords, special_token, lr, p):
        settings = {
            'raw_data': "final_dataset.csv",
            'max_len': 128,
            'batch_size': batch_size,
            'num_labels': 2,
            'epochs': 3,
            'seed_val': 42,
            'remove_stopwords': remove_stopwords,
            'model': 'DistilBert',
            'topic': [0, 11, 7, 10, 27, 13, 5, 22, 12, 1, 9],
            'use_special_token': special_token,
            'lr': lr,
            'p': p,
        }
        for attribute, value in settings.items():
            setattr(self, attribute, value)

In [None]:
import os

# Make CUDA kernel launches synchronous for the runs below.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# DistilBert runs on the default topic list: every (dropout, learning rate,
# batch size) combination is paired with each preprocessing variant
# (remove_stopwords, use_special_token).
distilbert_grid = [
    (p, lr, batch_size, remove, special)
    for p in [0.1]
    for lr in [3e-5]
    for batch_size in [32]
    for remove, special in [(False, False), (True, False), (False, True)]
]

for p, lr, batch_size, remove, special in distilbert_grid:
    args = Args(batch_size, remove, special, lr, p)
    run(args)


In [None]:
import os

# Make CUDA kernel launches synchronous for the runs below.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Same DistilBert sweep on a second topic selection, trained for 2 epochs.
alt_topic_grid = [
    (p, lr, batch_size, remove, special)
    for p in [0.1]
    for lr in [3e-5]
    for batch_size in [32]
    for remove, special in [(False, False), (True, False), (False, True)]
]

for p, lr, batch_size, remove, special in alt_topic_grid:
    args = Args(batch_size, remove, special, lr, p)
    args.topic = [72, 39, 92, 118, 73, 104, 101, 119, 15, 25, 48, 18, 68, 24, 41, 93, 97, 66, 96, 116, 29, 20, 113, 42, 46, 33, 84, 103, 102, 56, 105, 79, 89, 110]
    args.epochs = 2
    run(args)


In [None]:
# Training BERT: default topics, 5 epochs, without the special-token variant.
import os

# Make CUDA kernel launches synchronous for the runs below.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

bert_grid = [
    (p, lr, batch_size, remove, special)
    for p in [0.1]
    for lr in [1e-5]
    for batch_size in [32]
    for remove, special in [(False, False), (True, False)]
]

for p, lr, batch_size, remove, special in bert_grid:
    args = Args(batch_size, remove, special, lr, p)
    args.model = 'Bert'
    args.epochs = 5
    run(args)
