<a href="https://colab.research.google.com/github/Barleysack/DaconRepo/blob/main/BasePipeLine_issac.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive

# Mount at an absolute path so the mount point always matches the absolute
# `root` data directory ('/content/drive/MyDrive/Data/') used by later cells,
# independent of the notebook's current working directory.
drive.mount('/content/drive', force_remount=True)

Mounted at ./drive


In [4]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# The recorded run of this cell resolved transformers 4.16.2.
%pip install transformers
%pip install torch
# `sklearn` on PyPI is a deprecated alias; the real distribution is `scikit-learn`.
# StratifiedGroupKFold (imported below) requires scikit-learn 1.0 or newer.
%pip install "scikit-learn>=1.0"
%pip install numpy
%pip install pandas
%pip install seaborn
%pip install wandb
%pip install adamp
%pip install koeda


Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 6.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 57.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 76.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 69.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyy

In [5]:
# System packages needed by konlpy / Mecab. `-y` answers apt's confirmation
# prompt so the cell can run unattended.
!apt-get install -y g++ openjdk-8-jdk python3-dev python3-pip curl
!python3 -m pip install --upgrade pip
!python3 -m pip install konlpy
!apt-get install -y curl git
# NOTE: this downloads a script from konlpy's master branch and executes it.
# Review it (or pin a specific commit) before running in a sensitive environment.
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Reading package lists... Done
Building dependency tree       
Reading state information... Done
curl is already the newest version (7.58.0-2ubuntu3.16).
g++ is already the newest version (4:7.4.0-1ubuntu2.3).
g++ set to manually installed.
python3-dev is already the newest version (3.6.7-1~18.04).
python3-dev set to manually installed.
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libgail-common libgail18 libgtk2.0-0 libgtk2.0-bin
  libgtk2.0-common libxxf86dga1 openjdk-8-jdk-headless openjdk-8-jre
  openjdk-8-jre-headless python-pip-whl python3-asn1crypto
  python3-cffi-backend python3-crypto python3-cryptography python3-idna
  python3-keyring python3-keyrings.alt python3-pkg-resources
  python3-secretstorage python3-setuptools python3-six python3-wheel
  python

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, Trainer, TrainingArguments, set_seed
from transformers.modeling_outputs import SequenceClassifierOutput
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedGroupKFold
from torch.utils.data import Dataset
import os
import wandb
import re
from datetime import datetime
from pytz import timezone
from collections import Counter
from adamp import AdamP
from torch.optim.lr_scheduler import LambdaLR,CosineAnnealingLR
from koeda import EDA,AEDA
# Fix the random seed (transformers.set_seed) so runs are repeatable.
set_seed(2022)
# Base data directory: train() reads train_data.csv / test_data.csv from here
# and writes its outputs under <root>/runs/.
root='/content/drive/MyDrive/Data/'

In [10]:
class MainDataset(Dataset):
    """Tokenized premise/hypothesis pairs for three-way NLI classification.

    Both text columns are whitespace-normalized, optionally augmented with
    koeda's EDA, and then tokenized once, up front, as sentence pairs.

    Args:
        df: Table-like object indexable by column name, providing 'premise'
            and 'hypothesis' (and 'label' when ``mode`` is 'train').
        tokenizer: Callable invoked as ``tokenizer(premises, hypotheses, ...)``;
            its result must expose ``.items()`` yielding per-sample indexable
            values.
        mode: 'train' attaches an integer label to every item; any other value
            (e.g. 'test') produces unlabeled items.
        augment: Whether to run EDA over the texts. ``None`` (the default)
            keeps the historical behavior: augment exactly when
            ``mode == 'train'``. Pass ``False`` to build a labeled but
            un-augmented set, e.g. for validation.
    """

    # Settings forwarded to koeda.EDA when augmentation is first needed.
    EDA_KWARGS = dict(morpheme_analyzer="Mecab", alpha_sr=0.3, alpha_ri=0.1, alpha_rs=0.1, prob_rd=0.2)

    def __init__(self, df, tokenizer, mode, augment=None):
        premise = self.clean_text(df['premise'])
        hypothesis = self.clean_text(df['hypothesis'])

        self.len = len(premise)
        self.mode = mode
        # Built lazily by _preprocess: the augmenter needs Mecab and is
        # pointless for data that is never augmented (e.g. the test set).
        self.eda = None
        # Stays None unless mode == 'train', so get_data() never raises.
        self.labels = None

        if augment is None:
            augment = mode == 'train'
        if augment:
            premise = self.f_preprocessing(premise)
            hypothesis = self.f_preprocessing(hypothesis)

        if mode == 'train':
            label2idx = {label: idx for idx, label in enumerate(self.get_label_list())}
            self.labels = [label2idx[label] for label in df['label']]

        self.encodings = tokenizer(premise, hypothesis, add_special_tokens=True, return_tensors='pt', padding=True, truncation=True, max_length=256)

    def __len__(self):
        """Return the number of premise/hypothesis pairs."""
        return self.len

    def __getitem__(self, idx):
        """Return the encoded fields of sample ``idx`` (plus 'labels' in train mode)."""
        # token_type_ids are deliberately left out of every item
        # (presumably because the RoBERTa-style model does not use them -- confirm).
        item = {key: val[idx] for key, val in self.encodings.items() if key != 'token_type_ids'}
        if self.mode == 'train':
            item['labels'] = self.labels[idx]
        return item

    def f_preprocessing(self, series):
        """Augment every sentence in ``series`` and return the results as a list."""
        return [self._preprocess(sent) for sent in series]

    def clean_text(self, series):
        """Normalize whitespace in every sentence of ``series``; returns a list."""
        return [self._clean(sent) for sent in series]

    def _preprocess(self, sent):
        """Run EDA on one sentence, creating the augmenter on first use."""
        if getattr(self, 'eda', None) is None:
            self.eda = EDA(**self.EDA_KWARGS)
        return self.eda(sent)

    def _clean(self, sent):
        """Replace non-breaking spaces with spaces and collapse runs of spaces."""
        sent = re.sub('\u00A0', ' ', sent)
        sent = re.sub(' +', ' ', sent)
        return sent

    def get_data(self):
        """Return the list of integer labels, or None when mode is not 'train'."""
        return self.labels

    def get_label_list(self):
        """Return the class names; a name's position is its integer label id."""
        return ['entailment', 'contradiction', 'neutral']

def clean_df(df,splits):
    """Drop duplicate rows and describe how to split the rest into train/eval.

    Args:
        df: DataFrame with 'premise', 'hypothesis' and 'label' columns.
        splits: Number of cross-validation folds. Any value below 2 disables
            cross-validation.

    Returns:
        A tuple ``(deduplicated_df, folds)``. ``folds`` is an iterable of
        ``(train_positions, eval_positions)`` pairs of positional row indices
        into ``deduplicated_df``. With ``splits >= 2`` the folds come from
        StratifiedGroupKFold, stratified on 'label' and grouped by 'premise'
        so rows sharing a premise stay in the same fold. Otherwise a single
        pair is returned in which every row is used for both training and
        evaluation.
    """
    df = df.drop_duplicates(subset=['premise','hypothesis','label'], ignore_index=True)
    # 2 is the smallest valid fold count, so it must take the CV branch too.
    if splits >= 2:
        cv = StratifiedGroupKFold(n_splits=splits)
        # The first argument only supplies the number of samples, so the frame
        # itself is passed rather than requiring a dedicated 'index' column.
        return df, cv.split(df, df['label'], groups=df['premise'])
    # Customize the train/eval split strategy here: by default the whole
    # frame is used for both training and evaluation.
    all_rows = range(len(df))
    return df, [(all_rows, all_rows)]

def get_traindataset(path,tokenizer,splits=5):
    """Yield one (train_dataset, eval_dataset) pair per fold of the CSV at ``path``.

    The CSV is read and passed through clean_df, which supplies the
    de-duplicated frame and the positional row indices of every fold.

    NOTE: both halves are built with mode 'train' so that both carry labels.
    As a consequence the evaluation half goes through the same train-mode
    preprocessing in MainDataset as the training half.
    """
    frame, folds = clean_df(pd.read_csv(path), splits)
    for train_rows, eval_rows in folds:
        train_part = MainDataset(frame.iloc[train_rows], tokenizer, 'train')
        eval_part = MainDataset(frame.iloc[eval_rows], tokenizer, 'train')
        yield train_part, eval_part

def get_testdataset(path,tokenizer):
    """Read the CSV at ``path`` and wrap it as an unlabeled ('test' mode) MainDataset."""
    return MainDataset(pd.read_csv(path), tokenizer, 'test')


In [1]:
def confusion_matrix(true, pred, num):
    """Count (true, predicted) label pairs into a ``num`` x ``num`` integer array.

    Entry ``[t][p]`` is the number of positions ``i`` with ``true[i] == t`` and
    ``pred[i] == p``. Labels are used directly as indices, and exactly
    ``len(true)`` positions are read from both sequences.

    Note: this definition replaces the ``confusion_matrix`` name imported from
    sklearn.metrics at the top of the notebook.
    """
    counts = np.zeros((num, num), dtype=int)
    for pos in range(len(true)):
        actual, guessed = true[pos], pred[pos]
        counts[actual][guessed] += 1
    return counts

def draw_confusion_matrix(true, pred, num,save_dir):
    """Save a row-normalized confusion-matrix heatmap as a PNG in ``save_dir``.

    Args:
        true: Sequence of true integer labels.
        pred: Sequence of predicted integer labels, aligned with ``true``.
        num: Number of classes (the heatmap is ``num`` x ``num``).
        save_dir: Directory for the image; created if missing. The file is
            named ``confusion_matrix_<k>.png`` where ``k`` is the number of
            entries already in the directory. If falsy, nothing is done: the
            figure was never displayed, so drawing it would be wasted work.
    """
    if not save_dir:
        return

    cm = confusion_matrix(true, pred, num)
    # A class with no true samples has a zero row sum; silence the 0/0
    # warning and turn the resulting NaNs into zeros below.
    with np.errstate(divide='ignore', invalid='ignore'):
        normalized = cm / np.sum(cm, axis=1)[:, None]
    df = pd.DataFrame(normalized, index=list(range(num)), columns=list(range(num)))
    df = df.fillna(0)  # replace NaN values with 0

    os.makedirs(save_dir, exist_ok=True)
    fig, ax = plt.subplots(figsize=(16, 16))
    try:
        fig.suptitle('Confusion Matrix')
        sns.heatmap(df, annot=True, cmap=sns.color_palette("Blues"), ax=ax)
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True label")
        # tight_layout only has an effect once the axes content exists.
        fig.tight_layout()
        fig.savefig(save_dir+f"/confusion_matrix_{len(os.listdir(save_dir))}.png")
    finally:
        # Close only this figure (even if saving failed) instead of every
        # open figure in the session.
        plt.close(fig)

def get_compute_metrics(num,save_dir=None):
    """Build the metrics callback handed to the Trainer.

    Args:
        num: Number of classes; label ids 0..num-1 are passed to f1_score.
        save_dir: Forwarded to draw_confusion_matrix on every evaluation.

    Returns:
        A function taking a prediction object (with ``label_ids`` and
        ``predictions``) and returning a dict with 'micro f1 score'
        (scaled by 100) and 'accuracy' (unscaled).
    """
    class_ids = list(range(num))

    def compute_metrics(pred):
        """ Metrics function used for validation """
        y_true = pred.label_ids
        y_pred = pred.predictions.argmax(-1)

        scores = {}
        scores['micro f1 score'] = f1_score(y_true, y_pred, average="micro", labels=class_ids) * 100.0
        scores['accuracy'] = accuracy_score(y_true, y_pred)

        # Side effect: one confusion-matrix plot per evaluation call.
        draw_confusion_matrix(y_true, y_pred, num, save_dir)
        return scores

    return compute_metrics

def train():
    """Fine-tune the NLI classifier per fold and write test-set predictions.

    For every (train, eval) pair produced by get_traindataset this function:
      1. loads the pretrained model and forces a 3-label classification head,
      2. trains it with AdamP and an exponentially decaying learning rate,
         evaluating and checkpointing once per epoch and logging to wandb,
      3. saves the model held by the Trainer after training to <OUTPUT>/best
         (load_best_model_at_end is enabled),
      4. predicts on the test set and writes result.csv (label names) and
         result_logits.csv (raw prediction scores) into <OUTPUT>.

    <OUTPUT> is <root>/runs/<timestamp in KST>/<fold number>.

    With SPLITS = 1 no cross-validation is performed: clean_df returns the
    whole de-duplicated frame as both the training and the evaluation rows.

    Learning-rate schedule: the multiplier is 0.95 ** (elapsed epochs),
    computed from the actual number of optimizer steps per epoch. The previous
    expression, ``0.95 ** step*BATCH_SIZE*(SPLITS-1)/(SPLITS*25000)``, parsed
    as ``(0.95 ** step) * BATCH_SIZE * (SPLITS-1) / (SPLITS*25000)``, which is
    exactly 0 for SPLITS = 1, so the learning rate was always zero.
    """
    KST = timezone('Asia/Seoul')
    DATE = str(datetime.now().astimezone(KST))[:19]  # 'YYYY-MM-DD HH:MM:SS'
    MODEL = "ehdwns1516/klue-roberta-base-kornli"
    BATCH_SIZE = 128
    LEARNING_RATE = 1e-7
    EPOCHS = 10
    SPLITS = 1
    LR_DECAY_PER_EPOCH = 0.95
    increment = 0

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    set_seed(2022)

    tokenizer=AutoTokenizer.from_pretrained(MODEL)
    test_set=get_testdataset(root+'test_data.csv',tokenizer)
    for train_set,eval_set in get_traindataset(root+'train_data.csv',tokenizer,splits=SPLITS):
        increment+=1
        OUTPUT = os.path.join(root,'runs',DATE,str(increment))
        config = AutoConfig.from_pretrained(MODEL)
        config.num_labels=3
        model = AutoModelForSequenceClassification.from_pretrained(MODEL,config=config)
        model.to(device)
        optimizer = AdamP(model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.999), weight_decay=1e-2)

        # LambdaLR multiplies the base LR by lr_lambda(step), where step counts
        # scheduler.step() calls. Decay smoothly by LR_DECAY_PER_EPOCH per epoch.
        steps_per_epoch = max(1, -(-len(train_set) // BATCH_SIZE))  # ceil division
        scheduler = LambdaLR(
            optimizer=optimizer,
            lr_lambda=lambda step, spe=steps_per_epoch: LR_DECAY_PER_EPOCH ** (step / spe),
        )

        training_args = TrainingArguments(
        output_dir = OUTPUT,                    # output directory
        #save_total_limit=2,                        # number of total save model.
        #save_steps=SAVE_STEPS,                     # model saving step.
        num_train_epochs = EPOCHS,                   # total number of training epochs
        learning_rate = LEARNING_RATE,               # learning_rate
        per_device_train_batch_size = BATCH_SIZE,    # batch size per device during training
        per_device_eval_batch_size = BATCH_SIZE,     # batch size for evaluation
        dataloader_num_workers = 4,
        weight_decay = 0.01,                         # strength of weight decay
        logging_dir = OUTPUT+'/logs',                      # directory for storing logs
        logging_steps = 100,                         # log saving step.

        save_strategy = 'epoch',
        evaluation_strategy = 'epoch',               # evaluation strategy to adopt during training
                                                # `no`: No evaluation during training.
                                                # `steps`: Evaluate every `eval_steps`.
                                                # `epoch`: Evaluate every end of epoch.
        load_best_model_at_end = True,
        metric_for_best_model = "micro f1 score",
        greater_is_better = True,
        report_to = "wandb"
        )

        wandb_configs = {'runs':DATE,'model':MODEL,'bsz':BATCH_SIZE,'lr':LEARNING_RATE,'epochs':EPOCHS,}
        run = wandb.init(project='Sentence classification',name=DATE,config=wandb_configs)
        try:
            # exist_ok: the directory may already exist (e.g. a re-run within
            # the same second); that must not abort the run.
            os.makedirs(OUTPUT, exist_ok=True)
            trainer = Trainer(model=model,
                            args=training_args,
                            train_dataset=train_set,
                            eval_dataset=eval_set,
                            compute_metrics=get_compute_metrics(config.num_labels,OUTPUT+'/confusion'),
                            optimizers=(optimizer,scheduler)
                            )
            trainer.train()
            save_directory = OUTPUT+'/best'
            model.save_pretrained(save_directory)
        finally:
            # Always close the wandb run, even when training fails.
            run.finish()

        prediction=trainer.predict(test_set)
        idx2label=train_set.get_label_list()
        outs=pd.read_csv(root+'test_data.csv')
        outs['label']=[idx2label[i] for i in np.argmax(prediction.predictions,axis=1)]
        outs.to_csv(OUTPUT+'/result.csv')
        # Second file: same rows, but 'label' holds the raw per-class scores.
        outs['label']=prediction.predictions.tolist()
        outs.to_csv(OUTPUT+'/result_logits.csv')

In [2]:
# Ask PyTorch to release its cached GPU memory, then run the whole pipeline.
# NOTE(review): the recorded output of this cell is a NameError, and the
# execution counts (In [1]/In [2] after In [6]/In [10]) suggest the kernel was
# restarted without re-running the import and dataset cells -- confirm, and
# run the notebook top to bottom before this cell.
torch.cuda.empty_cache()
train()

NameError: ignored