In [None]:
!nvidia-smi

Sun Jun 23 10:16:44 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
try:
    import transformers, emoji, soynlp, pytorch_lightning
except:
    !pip install -U -q transformers emoji soynlp pytorch-lightning

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m416.8/416.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.2/812.2 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import glob
import os
import re
from multiprocessing import Pool, cpu_count
from pprint import pprint

import emoji
# NOTE(review): this binds the top-level `matplotlib` package to the name `plt`;
# `matplotlib.pyplot` was probably intended - confirm before relying on `plt`.
import matplotlib as plt
import pandas as pd
import seaborn as sns
import torch
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from soynlp.normalizer import repeat_normalize
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW


### 쿠팡

In [None]:
# Concatenate every per-product Coupang review CSV found in the working directory.
# glob returns matches in arbitrary order, so sort them to keep the row order of the
# combined file (and everything sampled from it later) reproducible between runs.
csv_files = sorted(glob.glob(os.path.join(os.getcwd(), "product_coupang_review*.csv")))
if not csv_files:
    # Without this guard pd.concat([]) fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        f"No 'product_coupang_review*.csv' files found in {os.getcwd()}"
    )

dataframes = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.to_csv('combined_coupang_review_2024.csv', index=False)

In [None]:
# Colab T4 GPU environment
# Load the Coupang reviews. The concat step writes 'combined_coupang_review_2024.csv',
# so read that same file back (the previous name did not match any file written here).
coupang_df = pd.read_csv('combined_coupang_review_2024.csv')

# Inspect the label distribution before balancing
print(coupang_df['rate'].value_counts())


def _upsample_to(class_df, n_samples, random_state=42):
    """Draw ``n_samples`` rows from ``class_df`` with replacement, using a fixed seed."""
    return resample(class_df,
                    replace=True,            # duplicate rows to oversample
                    n_samples=n_samples,     # target size shared by every class
                    random_state=random_state)  # fixed seed for reproducibility


# Rating 5 is treated as the majority class; ratings 1-4 are the minority classes
df_majority = coupang_df[coupang_df['rate'] == 5]
df_minority_4 = coupang_df[coupang_df['rate'] == 4]
df_minority_3 = coupang_df[coupang_df['rate'] == 3]
df_minority_2 = coupang_df[coupang_df['rate'] == 2]
df_minority_1 = coupang_df[coupang_df['rate'] == 1]

# Resample every minority class to the size of the majority class
target_size = len(df_majority)
df_minority_4_upsampled = _upsample_to(df_minority_4, target_size)
df_minority_3_upsampled = _upsample_to(df_minority_3, target_size)
df_minority_2_upsampled = _upsample_to(df_minority_2, target_size)
df_minority_1_upsampled = _upsample_to(df_minority_1, target_size)

# Combine the majority class with the oversampled minority classes
df_upsampled = pd.concat([df_majority, df_minority_4_upsampled, df_minority_3_upsampled, df_minority_2_upsampled, df_minority_1_upsampled])

# Check the balanced distribution and persist it under the file name that the
# later data-loading step reads ('coupang_review_2024_oversampled.csv').
print(df_upsampled['rate'].value_counts())
df_upsampled.to_csv('coupang_review_2024_oversampled.csv', index=False)

# Load the morphological analyzer.
# `Okt` was used here without being imported anywhere in the notebook, which raises
# NameError on a fresh kernel. It is imported next to its only use so that a missing
# konlpy installation does not break the shared import cell.
# NOTE(review): konlpy is not in the pip install list of the setup cell - it has to
# be available in the environment for this import to succeed.
from konlpy.tag import Okt

okt = Okt()

# Stopword list
stopwords = ["그", "이", "저", "것", "들", "의", "에", "를", "가", "은", "는", "이다", "하다"]
# stopwords = [the 675 stopwords from ranks.nl - omitted here because the list is too long]

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")

def preprocess_dataframe(df, max_length=200):
    """Clean, morph-tokenize and encode the review text column of ``df``.

    The text column ('review', or 'review_text' when 'review' is absent) is
    overwritten in place with the output of ``tokenizer.encode`` for each row,
    padded/truncated to ``max_length``. Relies on the module-level ``okt``,
    ``stopwords`` and ``tokenizer`` objects.

    Args:
        df: DataFrame containing a 'review' or 'review_text' column.
        max_length: Length passed to ``tokenizer.encode`` for padding and
            truncation. Defaults to 200, the value that was previously hard-coded.

    Returns:
        The same DataFrame object, with the text column replaced by encodings.

    Raises:
        ValueError: If neither 'review' nor 'review_text' is a column of ``df``.
    """
    # Resolve the column first so an unsupported frame fails fast, before any
    # pattern building or tokenization work is done.
    if 'review' in df.columns:
        text_column = 'review'
    elif 'review_text' in df.columns:
        text_column = 'review_text'
    else:
        raise ValueError("DataFrame does not contain 'review' or 'review_text' column.")

    # Built once per call: these used to be rebuilt inside clean() for every row,
    # which re-joined the whole emoji table each time.
    emojis = ''.join(emoji.EMOJI_DATA.keys())
    pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    stopword_set = set(stopwords)

    def clean(x):
        """Blank out characters outside the allowed set, drop URLs, trim, normalise repeats."""
        x = pattern.sub(' ', x)
        x = url_pattern.sub('', x)
        x = x.strip()
        x = repeat_normalize(x, num_repeats=2)
        return x

    def tokenize_and_remove_stopwords(text):
        """Split ``text`` into stemmed morphemes with Okt and drop the stopwords."""
        tokens = okt.morphs(text, stem=True)
        return ' '.join(word for word in tokens if word not in stopword_set)

    def encode(x):
        """Run one raw cell value through cleaning, morph-tokenization and encoding."""
        # str() keeps non-string cells (e.g. missing values) from breaking the regexes.
        clean_text = clean(str(x))
        tokenized_text = tokenize_and_remove_stopwords(clean_text)
        return tokenizer.encode(
            tokenized_text,
            padding='max_length',
            max_length=max_length,
            truncation=True
        )

    df[text_column] = df[text_column].map(encode)
    return df

# Load data
coupang_df = pd.read_csv('coupang_review_2024_oversampled.csv')
kurly_df = pd.read_csv('combined_kurly_review_2024.csv')

# Coupang: keep the text and the rating, and turn the rating into the class label.
# The classifier is built with num_labels=5, so valid class ids are 0-4; the star
# ratings are 1-5 and must be shifted down by one, otherwise rating 5 is an
# out-of-range target for the loss.
coupang_df = (
    coupang_df[['review', 'rate']]
    .rename(columns={'rate': 'label'})
    .assign(label=lambda frame: frame['label'] - 1)
)
assert coupang_df['label'].between(0, 4).all(), "labels must be class ids in 0..4"

# Apply the preprocessing
coupang_df = preprocess_dataframe(coupang_df)
kurly_df = preprocess_dataframe(kurly_df)

# Train/validation split
# NOTE(review): the input file is presumably the oversampled one (rows duplicated
# with replacement), so identical reviews can land in both splits and inflate the
# validation metrics. Consider splitting first and oversampling only the training part.
train_df, val_df = train_test_split(coupang_df, test_size=0.2, random_state=42)

class Model(LightningModule):
    """Five-class sequence classifier wrapped as a LightningModule.

    Hyperparameters arrive as keyword arguments and are stored through
    ``save_hyperparameters``; the ones read by this class are
    ``pretrained_model``, ``pretrained_tokenizer``, ``lr``, ``batch_size`` and
    ``cpu_workers``. Training and validation data come from the module-level
    ``train_df`` and ``val_df`` frames.

    Epoch-level metrics are computed from step outputs buffered on the module
    and flushed in ``on_train_epoch_end`` / ``on_validation_epoch_end``. The
    older ``training_epoch_end`` / ``validation_epoch_end`` hooks were removed
    in Lightning 2.x, which refuses to fit a module that still defines them.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        self.clsfier = AutoModelForSequenceClassification.from_pretrained(self.hparams.pretrained_model, num_labels=5)
        # Falls back to the model's own tokenizer when no separate one is configured.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.hparams.pretrained_tokenizer if self.hparams.pretrained_tokenizer else self.hparams.pretrained_model
        )

        # Per-epoch buffers of step outputs, emptied at the end of every epoch.
        self._train_outputs = []
        self._val_outputs = []

    def forward(self, **kwargs):
        """Forward all keyword arguments to the wrapped classifier."""
        return self.clsfier(**kwargs)

    def step(self, batch, batch_idx):
        """Run one batch and return its loss plus true/predicted labels as lists."""
        data, labels = batch
        output = self(input_ids=data, labels=labels)

        loss = output.loss
        logits = output.logits
        preds = logits.argmax(dim=-1)

        y_true = list(labels.cpu().numpy())
        y_pred = list(preds.cpu().numpy())

        return {'loss': loss, 'y_true': y_true, 'y_pred': y_pred}

    @staticmethod
    def _detached(output):
        """Copy a step output with the loss detached, so buffering it does not keep the graph alive."""
        return {
            'loss': output['loss'].detach().cpu(),
            'y_true': output['y_true'],
            'y_pred': output['y_pred'],
        }

    def training_step(self, batch, batch_idx):
        output = self.step(batch, batch_idx)
        self._train_outputs.append(self._detached(output))
        # The un-detached loss is returned so Lightning can backpropagate through it.
        return output

    def validation_step(self, batch, batch_idx):
        output = self.step(batch, batch_idx)
        self._val_outputs.append(self._detached(output))
        return output

    def epoch_end(self, outputs, state='train'):
        """Aggregate step outputs into epoch metrics, log them and print a summary.

        Args:
            outputs: List of dicts with 'loss', 'y_true' and 'y_pred' as produced by ``step``.
            state: Prefix for the logged metric names ('train' or 'val').

        Returns:
            Dict with the mean loss under 'loss', or None when ``outputs`` is empty.
        """
        if not outputs:
            # Nothing was collected (e.g. zero batches); avoid dividing by zero below.
            return None

        loss = torch.stack([out['loss'].detach().cpu().float() for out in outputs]).mean()

        y_true = []
        y_pred = []
        for out in outputs:
            y_true += out['y_true']
            y_pred += out['y_pred']

        acc = accuracy_score(y_true, y_pred)
        # zero_division=0 yields the same 0.0 score for classes that were never
        # predicted, without emitting a warning for each of them.
        prec = precision_score(y_true, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_true, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

        self.log(state+'_loss', float(loss), on_epoch=True, prog_bar=True)
        self.log(state+'_acc', acc, on_epoch=True, prog_bar=True)
        self.log(state+'_precision', prec, on_epoch=True, prog_bar=True)
        self.log(state+'_recall', rec, on_epoch=True, prog_bar=True)
        self.log(state+'_f1', f1, on_epoch=True, prog_bar=True)
        print(f'[Epoch {self.trainer.current_epoch} {state.upper()}] Loss: {loss}, Acc: {acc}, Prec: {prec}, Rec: {rec}, F1: {f1}')
        return {'loss': loss}

    def on_train_epoch_end(self):
        self.epoch_end(self._train_outputs, state='train')
        self._train_outputs.clear()

    def on_validation_epoch_end(self):
        self.epoch_end(self._val_outputs, state='val')
        self._val_outputs.clear()

    def configure_optimizers(self):
        """Return AdamW with the configured learning rate and an exponential LR decay (gamma=0.5)."""
        # NOTE(review): `AdamW` is the class imported from `transformers`, which is
        # deprecated upstream in favour of `torch.optim.AdamW`. The two have different
        # defaults (e.g. weight_decay), so switch deliberately rather than by renaming.
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        scheduler = ExponentialLR(optimizer, gamma=0.5)
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    def dataloader(self, df, shuffle=False):
        """Build a DataLoader over the 'review' (encoded ids) and 'label' columns of ``df``."""
        dataset = TensorDataset(
            torch.tensor(df['review'].to_list(), dtype=torch.long),
            torch.tensor(df['label'].to_list(), dtype=torch.long),
        )
        return DataLoader(
            dataset,
            batch_size=self.hparams.batch_size,
            shuffle=shuffle,
            num_workers=self.hparams.cpu_workers,
        )

    def train_dataloader(self):
        return self.dataloader(train_df, shuffle=True)

    def val_dataloader(self):
        return self.dataloader(val_df, shuffle=False)

# Training setup
# Model reads pretrained_model, pretrained_tokenizer, batch_size, lr and cpu_workers;
# this cell reads random_seed, epochs, test_mode and fp16. The remaining keys are only
# stored as hyperparameters and are not read by any code in this notebook section.
args = {
    'random_seed': 42,
    'pretrained_model': 'beomi/kcbert-base',
    'pretrained_tokenizer': '',
    'batch_size': 32,
    'lr': 5e-6,
    'epochs': 1,
    'max_length': 150,
    'train_data_path': '',
    'val_data_path': '',
    'test_mode': False,
    'optimizer': 'AdamW',
    'lr_scheduler': 'exp',
    'fp16': True,
    'tpu_cores': 0,
    'cpu_workers': os.cpu_count(),
}

seed_everything(args['random_seed'])
model = Model(**args)

# Keep the three checkpoints with the highest validation accuracy.
# ModelCheckpoint comes from pytorch_lightning.callbacks (imported in the import cell).
checkpoint_callback = ModelCheckpoint(
    filename='epoch{epoch}-val_acc{val_acc:.4f}',
    monitor='val_acc',
    save_top_k=3,
    mode='max',
    auto_insert_metric_name=False,
)

trainer = Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=args['epochs'],
    fast_dev_run=args['test_mode'],
    num_sanity_val_steps=None if args['test_mode'] else 0,
    deterministic=torch.cuda.is_available(),
    # Let Lightning pick the accelerator and always use a single device. The previous
    # `devices=None` on machines without CUDA is not accepted by newer Lightning releases.
    accelerator='auto',
    devices=1,
    # Mixed precision only when a CUDA device is actually present.
    precision=16 if args['fp16'] and torch.cuda.is_available() else 32,
)

trainer.fit(model)