In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset

from transformers import CLIPTextModel, CLIPTokenizer, get_linear_schedule_with_warmup

In [2]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [3]:
def preprocessing(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "http", regex=True)
    df[text_field] = df[text_field].str.replace(r"http", "http", regex=True)
    df[text_field] = df[text_field].str.replace(r"@\S+", "@user", regex=True)
    df[text_field] = df[text_field].apply(lambda x: remove_emoji(x))
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"@", "at", regex=True)
    df[text_field] = df[text_field].str.lower()
    return df

In [5]:
class TwitterDataset(Dataset):
    def __init__(self, text_list, label_list):
        assert (len(text_list) == len(label_list))

        self.text_list = text_list
        self.label_list = label_list

    def __len__(self):
        return len(self.text_list)
    
    def __getitem__(self, idx):
        return self.text_list[idx], self.label_list[idx]

In [6]:
class Collator():
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        text_list, label_list = zip(*batch)
        
        batch = self.tokenizer(text_list, 
                            max_length=self.max_length, 
                            padding='max_length', 
                            return_tensors='pt')

        output = dict(input_ids=batch['input_ids'],
                      attention_mask=batch['attention_mask'],
                      labels=torch.tensor(label_list, dtype=torch.int64))
        
        return output

In [7]:
class ClassificationHead(nn.Module):
    def __init__(self, hidden_size, num_classes):
        super().__init__()
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, num_classes)

        nn.init.normal_(self.linear1.weight, std=0.02)
        nn.init.normal_(self.linear1.bias, 0)

        nn.init.normal_(self.linear2.weight, std=0.02)
        nn.init.normal_(self.linear2.bias, 0)
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        out = self.linear2(x)

        return out

In [8]:
class CLIPClassifier(nn.Module):
    def __init__(self, pretrained_model, num_labels):
        super(CLIPClassifier, self).__init__()
        self.clip = CLIPTextModel.from_pretrained(pretrained_model)
        self.classifier = ClassificationHead(self.clip.config.hidden_size, num_labels)

    def forward(self, input_ids, mask):
        output = self.clip(input_ids, mask)
        x = output.pooler_output
        x = self.classifier(x)
        return x

In [9]:
def train_model(net, dataloader_dict, criterion, optimizer, num_epochs):
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net.to(device)
    
    torch.backends.cudnn.benchmark = True

    print("-"*10, 'START', '-'*10)

    for epoch in range(num_epochs):
        # print('Epoch {}/{}'.format(epoch+1, num_epochs))
        # print('-' * 20)
    
        for phase in ["train", "val"]:
            if phase == 'train':
                net.train()
            else:
                net.eval()

            epoch_loss = 0.0
            epoch_corrects = 0

            if (epoch == 0) and (phase == "train"):
                continue

            for minibatch in dataloader_dict[phase]:
                inputs = minibatch['input_ids'].to(device)
                labels = minibatch['labels'].to(device)
                attention_mask = minibatch['attention_mask'].to(device)
                
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs, attention_mask)

                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)

            epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloader_dict[phase].dataset)

            print('Epoch {}/{} | {:^5} | Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, num_epochs, phase, epoch_loss, epoch_acc))

In [10]:
def get_dataloader(train_data, valid_data, batch_size, collate_fn):
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=4)
    valid_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=4)

    dataloader_dict = {"train": train_dataloader, "val": valid_dataloader}

    return dataloader_dict

In [11]:
def get_test_data(data_list, tokenizer):
    output = []
    for data in data_list:
        batch = tokenizer(data, 
                        max_length=MAX_LEN, 
                        padding='max_length', 
                        return_tensors='pt')

        
        temp = dict(input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'])
        output.append(temp)

    return output

In [12]:
def pred_test(net, dataloader, ids):
    data = pd.read_csv('./dataset/test.csv')
    data = preprocessing(data, 'text')
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net.to(device)

    net.eval()

    preds = []
    for minibatch in dataloader:
        inputs = minibatch['input_ids'].to(device)
        attention_mask = minibatch['attention_mask'].to(device)

        with torch.set_grad_enabled(False):
            outputs = net(inputs, attention_mask)
            _, pred = torch.max(outputs, 1)
            preds.append(pred.cpu().numpy()[0])

    submission_csv = pd.DataFrame({'id': ids, 'target': preds})

    return submission_csv

In [4]:
DATA_PATH = './dataset/train.csv'
TEST_PATH = './dataset/test.csv'
VALID_RATIO = 0.2
RANDOM_SEED = 119
BATCH_SIZE = 128
NUM_EPOCHS = 5
MAX_LEN = 54
LEARNING_RATE = 2e-5
WARMUP_RATIO = 0.1
PRETRAINED_MODEL_NAME = "openai/clip-vit-base-patch32"
MODEL_PATH = './model/tweet-clip.pth'

In [14]:
def main():
    data = pd.read_csv(DATA_PATH)
    test_data = pd.read_csv(TEST_PATH)
    data = preprocessing(data, 'text')
    test_data = preprocessing(test_data, 'text')

    test_text, test_id = test_data['text'].to_list(), test_data['id'].to_list()

    data_text, data_labels = data['text'].to_list(), data['target'].to_list()
    train_texts, val_texts, train_labels, val_labels = train_test_split(data_text, data_labels, test_size=VALID_RATIO, random_state=RANDOM_SEED)

    num_labels = len(set(train_labels))

    train_data = TwitterDataset(train_texts, train_labels)
    valid_data = TwitterDataset(val_texts, val_labels)


    tokenizer = CLIPTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    
    test_texts = get_test_data(test_text, tokenizer)

    dataloders_dict = get_dataloader(train_data, valid_data, BATCH_SIZE, Collator(tokenizer, MAX_LEN))

    model = CLIPClassifier(PRETRAINED_MODEL_NAME, num_labels=num_labels)
    optimizer = Adam(params=model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.999))
    loss_fn = nn.CrossEntropyLoss()

    train_model(model, dataloders_dict, loss_fn, optimizer, NUM_EPOCHS)

    submission_csv = pred_test(model, test_texts, test_id)
    submission_csv.to_csv('./submission.csv', index=False)
    
    torch.save(obj={"clip":model.state_dict(),
                    "tokenizer":tokenizer,
                    "num_labels":num_labels},
                    f=MODEL_PATH)    

In [15]:
main()

Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.3.mlp.fc1.weight', 'vision_model.encoder.layers.0.layer_norm1.bias', 'vision_model.encoder.layers.8.self_attn.out_proj.weight', 'vision_model.encoder.layers.2.self_attn.q_proj.bias', 'vision_model.encoder.layers.1.mlp.fc1.bias', 'vision_model.encoder.layers.0.self_attn.q_proj.weight', 'vision_model.encoder.layers.8.self_attn.k_proj.weight', 'vision_model.encoder.layers.2.self_attn.out_proj.bias', 'vision_model.encoder.layers.8.layer_norm1.weight', 'vision_model.encoder.layers.11.self_attn.out_proj.weight', 'text_projection.weight', 'vision_model.encoder.layers.9.mlp.fc2.bias', 'vision_model.encoder.layers.1.mlp.fc2.weight', 'vision_model.encoder.layers.6.mlp.fc2.bias', 'vision_model.encoder.layers.1.self_attn.k_proj.weight', 'vision_model.encoder.layers.3.mlp.fc1.bias', 'vision_model.encoder.layers.10.self_attn.k_proj.bias', 'vision_model.enc

---------- START ----------
Epoch 1/5 |  val  | Loss: 0.7567 Acc: 0.4360
Epoch 2/5 | train | Loss: 0.4556 Acc: 0.8013
Epoch 2/5 |  val  | Loss: 0.3982 Acc: 0.8319
Epoch 3/5 | train | Loss: 0.3072 Acc: 0.8801
Epoch 3/5 |  val  | Loss: 0.4090 Acc: 0.8339
Epoch 4/5 | train | Loss: 0.2026 Acc: 0.9258
Epoch 4/5 |  val  | Loss: 0.4480 Acc: 0.8326
Epoch 5/5 | train | Loss: 0.1221 Acc: 0.9576
Epoch 5/5 |  val  | Loss: 0.5348 Acc: 0.8260
