Run this notebook on Kaggle with the "GPU T4 x2" accelerator enabled. (The code below places the model on a single CUDA device.)

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aclimdb/train.csv
/kaggle/input/aclimdb/test.csv


In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from tqdm import tqdm
import pandas as pd

# Load the pre-split train/test CSVs from the Kaggle input directory.
# Each file is expected to have a 'review' column (raw text) and a 'sentiment'
# column holding the string labels 'positive' / 'negative'.
df_train = pd.read_csv('/kaggle/input/aclimdb/train.csv')
df_test = pd.read_csv('/kaggle/input/aclimdb/test.csv')

# Pull the text and label columns out as plain Python lists
train_texts = df_train['review'].tolist()
test_texts = df_test['review'].tolist()
train_labels = df_train['sentiment'].tolist()
test_labels = df_test['sentiment'].tolist()

# Convert labels to numerical values (0 for negative, 1 for positive).
# This runs before tokenization so that an unexpected label value (typo,
# different casing, NaN, already-numeric labels, ...) fails fast instead of
# being silently treated as "negative" after the expensive tokenization step.
LABEL_TO_ID = {'negative': 0, 'positive': 1}
for split_name, split_labels in (('train', train_labels), ('test', test_labels)):
    unexpected = set(split_labels) - set(LABEL_TO_ID)
    if unexpected:
        raise ValueError(
            f"Unexpected sentiment label(s) in {split_name} split: "
            f"{sorted(unexpected, key=str)}; expected one of {sorted(LABEL_TO_ID)}"
        )
train_labels = [LABEL_TO_ID[label] for label in train_labels]
test_labels = [LABEL_TO_ID[label] for label in test_labels]

# Load pre-trained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the text data.
# truncation=True cuts reviews that exceed the model's maximum input length;
# padding=True pads every review to the longest sequence in its split, so each
# split becomes one rectangular array that can be turned into a tensor.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Convert tokenized data into torch tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)

test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(test_labels)

# Create DataLoader for training and testing sets.
# Only the training data is shuffled; evaluation order does not matter.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_loader = DataLoader(test_data, batch_size=16)

# Seed torch's random number generators so that the randomly initialised
# classification head and the training shuffle order are repeatable across
# "Restart & Run All" executions.
SEED = 42
torch.manual_seed(SEED)

# Load pre-trained DistilBERT with a two-class classification head
# (label ids 0 = negative, 1 = positive).
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2
)

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the optimizer (constant learning rate; no scheduler is used).
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set to that class's former defaults to keep the same
# update rule.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)

# Fine-tuning the model
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
        # Each batch is (input_ids, attention_mask, labels); move it to the model's device
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        # Passing 'labels' makes the model return the training loss in outputs.loss
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Mean of the per-batch losses over this epoch
    print(f'Average training loss: {total_loss/len(train_loader)}')

# Score the fine-tuned model on the test set and report plain accuracy
model.eval()  # switch the model to evaluation mode
total_correct = 0
total_samples = 0
with torch.no_grad():  # gradients are not needed for scoring
    for input_ids, attention_mask, labels in tqdm(test_loader, desc='Evaluating'):
        # Move this batch onto the same device as the model
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Labels are withheld from the forward pass; only the logits are needed
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

        # The predicted class is the index of the largest logit in each row
        hits = logits.argmax(dim=1).eq(labels)
        total_correct += hits.sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f'Accuracy on test set: {accuracy}')


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 1875/1875 [21:52<00:00,  1.43it/s]


Average training loss: 0.25175461526016396


Epoch 2/3: 100%|██████████| 1875/1875 [22:03<00:00,  1.42it/s]


Average training loss: 0.13518581109729905


Epoch 3/3: 100%|██████████| 1875/1875 [22:03<00:00,  1.42it/s]


Average training loss: 0.07230376831057171


Evaluating: 100%|██████████| 1250/1250 [05:13<00:00,  3.99it/s]

Accuracy on test set: 0.9185





In [3]:
# Save the tokenizer files next to the fine-tuned weights and config so the
# output directory can be reloaded on its own with from_pretrained().
# The model is saved last so the cell ends on a call that returns None and
# shows no output.
tokenizer.save_pretrained("distilbert_sentiment_model")
model.save_pretrained("distilbert_sentiment_model")