In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Step 1: Load IMDB and keep the train and test splits (no other split is used below).
dataset = load_dataset('imdb')
train_data, test_data = dataset['train'], dataset['test']

# Step 2: Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Step 3: Tokenization helper applied to the dataset splits via `Dataset.map`
def tokenize_data(data):
    """Tokenize the ``'text'`` entry of ``data`` with the notebook-level ``tokenizer``.

    Sequences are truncated to at most 128 tokens and padded up to that same
    length, so every encoded example has an identical size.

    Args:
        data: Mapping with a ``'text'`` entry. It is called through
            ``map(..., batched=True)`` in this notebook, so ``data['text']``
            holds a batch of reviews rather than a single string.

    Returns:
        The tokenizer's output for the given text(s); the ``input_ids`` and
        ``attention_mask`` fields of it are the ones consumed later on.
    """
    texts = data['text']
    return tokenizer(texts, truncation=True, max_length=128, padding='max_length')

# Tokenize both splits in batches (train first, then test), rebinding the names
# to the tokenized datasets.
train_data, test_data = (
    split.map(tokenize_data, batched=True) for split in (train_data, test_data)
)

# Step 4: Return only the model inputs and the label, as PyTorch tensors.
for split in (train_data, test_data):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Step 5: Load the BERT model for sequence classification
# Select the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Instantiate the two-label classifier from the pretrained checkpoint, then move
# it onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)
print("GPU available:", torch.cuda.is_available())

# Step 6: Metric function handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores); the arg-max over axis 1 of the
            scores is taken as the predicted class.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. Precision, recall and F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    # The fourth element (support) of the returned tuple is not needed.
    precision, recall, f1 = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

# Step 7: Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # directory the Trainer writes its checkpoints to
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # keyword, which newer transformers releases no longer accept
    # (requires transformers >= 4.41). 'epoch' = evaluate after every epoch.
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Step 8: Wire the model, the arguments, both datasets and the metric function
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    # The IMDB test split is used as the evaluation set.
    eval_dataset=test_data,
)

# Step 9: Fine-tune the model with the configured Trainer.
trainer.train()

# Step 10: Run one more evaluation pass and print the resulting metrics.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Step 11: Write the trained model and the tokenizer to separate local folders.
model.save_pretrained(save_directory='./sentiment-analysis-model')
tokenizer.save_pretrained(save_directory='./sentiment-analysis-tokenizer')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Using device: cuda


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU available: True


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.339,0.343197,0.86808,0.859492,0.91934,0.80696
2,0.2353,0.418283,0.88848,0.888516,0.888232,0.8888
3,0.1129,0.55428,0.8906,0.891101,0.887039,0.8952


Evaluation results: {'eval_loss': 0.5542803406715393, 'eval_accuracy': 0.8906, 'eval_f1': 0.8911009356957993, 'eval_precision': 0.8870392390011891, 'eval_recall': 0.8952, 'eval_runtime': 184.9544, 'eval_samples_per_second': 135.168, 'eval_steps_per_second': 16.896, 'epoch': 3.0}


('./sentiment-analysis-tokenizer/tokenizer_config.json',
 './sentiment-analysis-tokenizer/special_tokens_map.json',
 './sentiment-analysis-tokenizer/vocab.txt',
 './sentiment-analysis-tokenizer/added_tokens.json',
 './sentiment-analysis-tokenizer/tokenizer.json')

In [None]:
from google.colab import drive

# Mount Google Drive, then write the model and the tokenizer into MyDrive so a
# copy exists outside the runtime's local working directory.
drive.mount(mountpoint='/content/drive')
model.save_pretrained(save_directory='/content/drive/MyDrive/sentiment-analysis-model')
tokenizer.save_pretrained(save_directory='/content/drive/MyDrive/sentiment-analysis-tokenizer')


Mounted at /content/drive


('/content/drive/MyDrive/sentiment-analysis-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/sentiment-analysis-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/sentiment-analysis-tokenizer/vocab.txt',
 '/content/drive/MyDrive/sentiment-analysis-tokenizer/added_tokens.json',
 '/content/drive/MyDrive/sentiment-analysis-tokenizer/tokenizer.json')

In [None]:
import os

from transformers.trainer_utils import get_last_checkpoint

# Reload the fine-tuned weights and the tokenizer that were saved to Google Drive.
model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/sentiment-analysis-model')
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/sentiment-analysis-tokenizer')

# A Trainer keeps a reference to the model object it was constructed with, so
# rebinding `model` alone never reaches it. Build the Trainer around the
# reloaded model so the two stay in sync.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

# `resume_from_checkpoint=True` raises when `output_dir` holds no checkpoint
# (for example on a fresh runtime), so look the checkpoint up explicitly.
# With no checkpoint, training starts from the weights reloaded above.
checkpoint_root = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None

# NOTE: when the latest checkpoint already covers all `num_train_epochs`, this
# call performs no optimisation steps; raise `num_train_epochs` in
# `training_args` to actually train further.
trainer.train(resume_from_checkpoint=last_checkpoint)


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=9375, training_loss=0.0, metrics={'train_runtime': 0.0106, 'train_samples_per_second': 7052253.06, 'train_steps_per_second': 881531.633, 'total_flos': 4933332288000000.0, 'train_loss': 0.0, 'epoch': 3.0})

In [None]:
cd /content/drive/MyDrive/Colab Notebooks


/content/drive/MyDrive/Colab Notebooks


In [55]:
cd /content/drive/MyDrive/Colab Notebooks/2024R0136COSE47402

/content/drive/MyDrive/Colab Notebooks/2024R0136COSE47402


In [49]:
# Set the global git author e-mail and name for this machine/runtime.
!git config --global user.email 'ghthd97@naver.com'
!git config --global user.name 'Freerider-song'

In [56]:
# Show the current branch and any staged, unstaged or untracked files.
!git status

On branch master
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m"HW2/HW2_2017170858_\341\204\200\341\205\265\341\206\267\341\204\222\341\205\251\341\204\211\341\205\251\341\206\274.ipynb"[m

nothing added to commit but untracked files present (use "git add" to track)


In [31]:
# Discard uncommitted changes to tracked files under the current directory, then
# delete untracked files.
# WARNING: both commands are destructive. `git clean -f` permanently removes
# untracked files (in the recorded run it deleted an untracked HW2 notebook).
# Preview with `git clean -n` before running this cell.
!git restore .
!git clean -f

Removing "HW2/HW2_2017170858_\341\204\200\341\205\265\341\206\267\341\204\222\341\205\251\341\204\211\341\205\251\341\206\274.ipynb"


In [38]:
!git add FinalProject/Final_Project.ipynb
!git commit --amend --no-edit


fatal: pathspec 'FinalProject/Final_Project.ipynb' did not match any files
[master 5734a41] Add FinalProject folder and final_project.py
 Date: Tue Nov 26 14:56:12 2024 +0900
 1 file changed, 73 insertions(+)
 create mode 100644 FinalProject/Final_project.py


In [29]:
# Start an interactive rebase over the last three commits.
# git refuses to start while the working tree has unstaged changes; commit or
# stash them first.
# NOTE(review): `-i` opens an editor for the todo list, which presumably cannot
# be driven from a notebook `!` cell — confirm, or run this from a terminal.
!git rebase -i HEAD~3

error: cannot rebase: You have unstaged changes.
error: Please commit or stash them.


In [53]:
!git push origin master --force

Enumerating objects: 30, done.
Counting objects:   3% (1/30)Counting objects:   6% (2/30)Counting objects:  10% (3/30)Counting objects:  13% (4/30)Counting objects:  16% (5/30)Counting objects:  20% (6/30)Counting objects:  23% (7/30)Counting objects:  26% (8/30)Counting objects:  30% (9/30)Counting objects:  33% (10/30)Counting objects:  36% (11/30)Counting objects:  40% (12/30)Counting objects:  43% (13/30)Counting objects:  46% (14/30)Counting objects:  50% (15/30)Counting objects:  53% (16/30)Counting objects:  56% (17/30)Counting objects:  60% (18/30)Counting objects:  63% (19/30)Counting objects:  66% (20/30)Counting objects:  70% (21/30)Counting objects:  73% (22/30)Counting objects:  76% (23/30)Counting objects:  80% (24/30)Counting objects:  83% (25/30)Counting objects:  86% (26/30)Counting objects:  90% (27/30)Counting objects:  93% (28/30)Counting objects:  96% (29/30)Counting objects: 100% (30/30)Counting objects: 100% (30/30), done.
Delta comp