## CLIP with fine-tuning


In [None]:
%%capture



import sys
# Copy the CLIP source tree from the input dataset into /tmp
!cp -r ../input/openai-clip/CLIP/CLIP-main /tmp/

# Write a gzip-compressed copy of the BPE vocabulary next to the plain-text file
# (presumably the .txt.gz name is what CLIP's tokenizer looks for -- confirm)
!gzip -c /tmp/CLIP-main/clip/bpe_simple_vocab_16e6.txt > /tmp/CLIP-main/clip/bpe_simple_vocab_16e6.txt.gz
# Make the copied source tree importable from this notebook
sys.path.append('/tmp/CLIP-main')

# Install ftfy, torch, torchvision and faiss-gpu from local files under ../input
!pip install ../input/openai-clip/ftfy-5.9/ftfy-5.9 \
             ../input/openai-clip/torch-1.7.1+cu110-cp37-cp37m-linux_x86_64.whl \
             ../input/openai-clip/torchvision-0.8.2+cu110-cp37-cp37m-linux_x86_64.whl \
             ../input/faiss-163/faiss_gpu-1.6.3-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
import clip
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
import re
from clip.simple_tokenizer import SimpleTokenizer
import faiss
import matplotlib.pyplot as plt
from triplet_loss import TripletLoss

%matplotlib inline

In [None]:
# Test metadata, indexed by posting_id
df_test = pd.read_csv('../input/shopee-product-matching/test.csv', index_col='posting_id')

In [None]:
# True when test.csv has exactly 3 rows; the train-set evaluation cells only run in that case.
# NOTE(review): presumably 3 rows means the public placeholder test file -- confirm.
RUN_ON_TRAIN = len(df_test) == 3

### Utility classes and functions

In [None]:
_tokenizer = SimpleTokenizer()

# Adapted from https://github.com/openai/CLIP/blob/beba48f35392a73c6c47ae67ddffced81ad1916d/clip/clip.py#L164
def tokenize(texts, context_length: int = 77) -> torch.LongTensor:
    """Encode one string or a list of strings into fixed-width token id rows.

    Each text is wrapped in the tokenizer's start-of-text and end-of-text ids.

    Args:
        texts: A single string or an iterable of strings.
        context_length: Number of token slots per row.

    Returns:
        A long tensor of shape (number of texts, context_length). Rows shorter
        than `context_length` are zero-padded on the right; longer rows are
        truncated and their last slot is overwritten with the final token of
        the full sequence (the end-of-text id).
    """
    batch = [texts] if isinstance(texts, str) else texts

    start_id = _tokenizer.encoder["<|startoftext|>"]
    end_id = _tokenizer.encoder["<|endoftext|>"]

    encoded = []
    for text in batch:
        encoded.append([start_id] + _tokenizer.encode(text) + [end_id])

    result = torch.zeros(len(encoded), context_length, dtype=torch.long)

    for row_idx, ids in enumerate(encoded):
        overflow = len(ids) > context_length
        kept = context_length if overflow else len(ids)
        result[row_idx, :kept] = torch.tensor(ids)[:kept]
        if overflow:
            # Keep the end-of-text marker even though the sequence was cut short
            result[row_idx, -1] = ids[-1]

    return result

In [None]:
# Emoji show up in the titles as literal escape text: a backslash, an "x",
# then one or more letters, digits, dots or slashes.
RE_EMOJI = re.compile(r"\\x[A-Za-z0-9./]+", flags=re.UNICODE)


def strip_emoji(text):
    """Return `text` with every RE_EMOJI match removed."""
    return re.sub(RE_EMOJI, "", text)

In [None]:
class RollingMean():
    """Running arithmetic mean of every value passed to `update`.

    `n` is the number of values seen so far and `mean` their current average
    (0 before the first update).
    """

    def __init__(self):
        self.mean = 0
        self.n = 0

    def update(self, value):
        """Fold `value` into the running mean."""
        # Rebuild the previous sum from mean * count, add the new value,
        # then divide by the new count.
        running_total = self.mean * self.n + value
        self.n += 1
        self.mean = running_total / self.n

    def result(self):
        """Return the current mean."""
        return self.mean

### 数据集和采样器

确保在每个批次中始终存在同一组的两个样本。

In [None]:
class SameGroupSampler(Sampler):
    """Sampler that yields dataset positions in same-group pairs.

    Every draw picks one label group at random and then two distinct postings
    from it, yielding their positions back to back, so consecutive pairs of
    yielded positions always share a `label_group`.

    Label groups with fewer than two postings cannot form a pair (drawing two
    distinct members from them raises ValueError), so they are excluded.

    Note that `__len__` counts label groups while `__iter__` yields two
    positions per group drawn, i.e. `2 * len(self)` positions in total.

    Args:
        df: DataFrame indexed by `posting_id` with a `label_group` column.
        ds: The dataset being sampled; only forwarded to `Sampler.__init__`.
    """

    def __init__(self, df ,ds):
        super().__init__(ds)

        # posting_id -> row position in the dataset
        self.index_to_position = dict(zip(df.index, range(len(df))))

        # label_group -> sorted array of the posting_ids in that group
        members = df.reset_index().groupby('label_group')['posting_id'].apply(set).map(sorted).map(np.array)

        # Only groups with at least two postings can produce a pair
        self.label_group = members[members.map(len) >= 2]

    def __len__(self):
        return len(self.label_group)

    def __iter__(self):
        for _ in range(len(self)):
            # Sample one label_group
            label_group_sample = self.label_group.sample(1).iloc[0]

            # Sample two distinct posting_ids from it
            sample1, sample2 = np.random.choice(label_group_sample, 2, replace=False)

            yield self.index_to_position[sample1]
            yield self.index_to_position[sample2]

In [None]:
class MyDataset(Dataset):
    """Dataset of (preprocessed image, tokenized title, label) triples.

    Args:
        df: DataFrame with `image` and `title` columns and, optionally, a
            `label_group` column.
        images_path: Directory object joined with each row's `image` value
            via the `/` operator.
    """

    def __init__(self, df, images_path):
        super().__init__()
        self.images_path = images_path
        self.df = df
        # Without a label column the third item of every sample is 0
        self.has_target = 'label_group' in df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return (image, text, label) for the row at position `idx`."""
        record = self.df.iloc[idx]

        # `preprocess`, `tokenize` and `strip_emoji` are module-level names
        image_file = self.images_path / record['image']
        image = preprocess(Image.open(image_file))
        text = tokenize([strip_emoji(record['title'])])[0]

        target = record['label_group'] if self.has_target else 0
        return image, text, target

### 微调

In [None]:
# Load CLIP (on the GPU when one is available, otherwise on the CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("../input/openai-clip/ViT-B-32.pt", device=device, jit=False)

# Embedding size: second dimension of the model's text projection
embed_dim = model.text_projection.shape[1]
embed_dim

In [None]:
# Training data
train_images_path = Path('../input/shopee-product-matching/train_images')

# Training metadata, indexed by posting_id
df_train = pd.read_csv('../input/shopee-product-matching/train.csv', index_col='posting_id')

# Batches are drawn through SameGroupSampler, which yields positions in same-group pairs
dstrain = MyDataset(df_train, train_images_path)
dltrain = DataLoader(dstrain, batch_size=128, num_workers=2, sampler=SameGroupSampler(df_train, dstrain))

In [None]:
# Number of passes of the fine-tuning loop; also scales the scheduler's total_steps
n_epochs = 1

In [None]:
# Optimiser, learning-rate schedule and loss for fine-tuning.
# Alternative optimiser kept for reference:
# optim = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-8, weight_decay=1e-2)
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.2)
# NOTE(review): total_steps uses 2*len(dltrain)-1, presumably because SameGroupSampler
# yields two positions per item it counts in __len__, so the loader produces more
# batches than len(dltrain) reports -- confirm this still holds if the data or batch size change.
scheduler = torch.optim.lr_scheduler.OneCycleLR(optim, 1e-2, total_steps=n_epochs * (2*len(dltrain)-1),
                                               base_momentum=0.0, max_momentum=0.5, pct_start=0.1, div_factor=1e2, final_div_factor=1e4)
criterion = TripletLoss(device)

In [None]:
# Fine-tune CLIP: one optimiser step and one scheduler step per batch
for epoch in range(n_epochs):
    # The bar total mirrors the per-epoch step count used for the scheduler
    with tqdm(total=2*len(dltrain)-1) as bar:
        loss_mean = RollingMean()
        for images, texts, targets in dltrain:
            targets = targets.to(device)
            
            # Image and text features
            images_features = model.encode_image(images.to(device))
            texts_features = model.encode_text(texts.to(device))

            optim.zero_grad()

            # Fuse image and text features by stacking them side by side
            features = torch.hstack([images_features, texts_features])
            
            # L2 normalisation of every row
            features = features / features.norm(2, dim=1, keepdim=True)

            # Apply the triplet loss (described as semi-hard in the original comment)
            loss = criterion(features, targets)

            loss.backward()
            optim.step()
            scheduler.step()

            # Progress bar: show the running mean of the loss
            loss_mean.update(loss.item())
            bar.update()
            bar.set_description('{:.4f}'.format(loss_mean.result()))

## 在训练集上运行

使用 CLIP 生成特征，并执行相似性搜索以查找最接近的匹配。

去掉相似度低于阈值（约 0.7）的匹配，用剩余的匹配构成结果集。

In [None]:
def find_similarities_and_indexes(df, images_path, top_n=100, features_file=None):
    """Embed every row of `df` with CLIP and search its nearest neighbours.

    Image and text features are written into the first and second half of each
    feature row, L2-normalised, added to a faiss inner-product index, and the
    same rows are then used as queries against that index.

    Args:
        df: DataFrame accepted by `MyDataset`.
        images_path: Directory holding the images referenced by `df`.
        top_n: Number of neighbours requested per row.
        features_file: Optional path; when given, the features are saved there
            with `np.save` before normalisation.

    Returns:
        The result of `index.search(features, top_n)`, which callers unpack as
        `(similarities, indexes)`.
    """
    # Build the PyTorch Dataset/DataLoader; no shuffling, so output rows follow df order
    loader = DataLoader(MyDataset(df, images_path), batch_size=32, shuffle=False, num_workers=2)

    feature_dim = 2*embed_dim
    features = np.empty((len(df), feature_dim), dtype=np.float32)

    # Run inference batch by batch, filling the feature matrix in place
    start = 0
    for images, texts, _ in tqdm(loader):
        stop = start + len(images)
        with torch.no_grad():
            # Generate image and text features
            image_part = model.encode_image(images.to(device))
            text_part = model.encode_text(texts.to(device))

        # Concatenate: image features on the left, text features on the right
        features[start:stop, :embed_dim] = image_part.cpu()
        features[start:stop, embed_dim:] = text_part.cpu()

        start = stop

    # Save the raw (not yet normalised) features
    if features_file is not None:
        np.save(features_file, features)

    # L2 normalisation of every row
    row_norms = np.linalg.norm(features, 2, axis=1, keepdims=True)
    features /= row_norms

    # Build the index
    index = faiss.IndexFlatIP(feature_dim)
    index.add(features)

    # Query the index with the same features
    return index.search(features, top_n)

In [None]:
if RUN_ON_TRAIN:
    # Run the similar-product search on the training set
    similarities, indexes = find_similarities_and_indexes(df_train, train_images_path, features_file='features-no-norm.npy')
    
    # `similarities` (n, 100) holds the similarity scores of the closest matches
    # `indexes` (n, 100) holds the row indexes of the closest matches
    # Both arrays are aligned

    # Convert the indexes to label groups, shape (n, 100)
    found_groups = df_train['label_group'].values[indexes]

    # Check whether each match comes from the same group as its query row
    # (the query's own group is broadcast across that row's matches)
    is_same_group = (found_groups == df_train['label_group'].values.reshape(-1, 1))

    # Plot the similarity scores of same-group and different-group matches
    plt.figure(figsize=(10, 5))
    plt.hist([similarities[is_same_group], similarities[~is_same_group]], density=False, bins=51,
         label=['Same group', 'Different group'], histtype='stepfilled', alpha=0.75)
    plt.xlim(0, 1)
    plt.xlabel('Similarity score')
    plt.legend();

### Tune CUT

In this last step we sweep `cut_value` over a range of cutoffs to find the one that gives the best F1 score.

In [None]:
def row_wise_f1_score(y_true, y_pred):
    """Compute the F1 score of each (true set, predicted set) pair.

    Uses F1 = 2*tp / (len(true) + len(pred)), which equals
    2*precision*recall / (precision + recall) whenever that expression is
    defined, but yields 0.0 instead of NaN when a row has no true positives.

    Args:
        y_true: Iterable of sets with the expected items of each row.
        y_pred: Iterable of sets with the predicted items of each row,
            aligned with `y_true`.

    Returns:
        A 1-D float array with one F1 score per row. A row whose true and
        predicted sets are both empty scores 0.0.
    """
    # One (true positives, combined set size) pair per row
    counts = np.array(
        [(len(truth & pred), len(truth) + len(pred)) for truth, pred in zip(y_true, y_pred)],
        dtype=float,
    ).reshape(-1, 2)
    tp, combined_size = counts[:, 0], counts[:, 1]

    # Guarded division: rows with nothing to divide by keep the 0.0 default
    f1 = np.zeros(len(tp))
    np.divide(2 * tp, combined_size, out=f1, where=combined_size > 0)
    return f1


def calc_score(cut_value):
    """Return the mean row-wise F1 score obtained with `cut_value` as cutoff.

    Reads the notebook globals `similarities`, `indexes`, `df_train` and
    `df_answer`: for every row, the neighbours whose similarity exceeds
    `cut_value` form the predicted set, which is scored against `df_answer`.
    """
    # Apply the similarity cutoff
    keep_mask = similarities > cut_value

    # Build one set of predicted posting_ids per row
    matched = [
        set(df_train.index[neighbour_idx[keep_row]])
        for keep_row, neighbour_idx in zip(keep_mask, indexes)
    ]
    predictions = pd.Series(matched, index=df_answer.index)

    # Compute the F1 score
    return row_wise_f1_score(df_answer, predictions).mean()

In [None]:
if RUN_ON_TRAIN:
    # Ground truth: for every posting, the set of posting_ids sharing its label_group
    groups = df_train.reset_index().groupby('label_group')['posting_id'].apply(set)
    df_answer = df_train['label_group'].map(groups)


    # Score 51 evenly spaced cutoffs between 0.5 and 0.95
    cuts = np.linspace(0.5, 0.95, 51)
    scores = [calc_score(c) for c in tqdm(cuts)]


    # Plot the F1 score against the cutoff
    plt.plot(cuts, scores)
    plt.xlabel('Cutoff value')
    plt.ylabel('F1 score')

    # Report the cutoff with the highest score
    print('Best cutoff is {:.2f} with expected F1 score of {:.4f}'.format(cuts[np.argmax(scores)], max(scores)))

## Run on test

In [None]:
# Similarity cutoff applied to the test-set matches below
GROUP_CUT = 0.71 

In [None]:
# Directory holding the test images
test_images_path = Path('../input/shopee-product-matching/test_images')

In [None]:
# Find the closest matches of every test posting.
# This rebinds the `similarities` and `indexes` globals, which now refer to df_test rows.
similarities, indexes = find_similarities_and_indexes(df_test, test_images_path)

In [None]:
# Apply the similarity cutoff: keep only matches scoring above GROUP_CUT
test_are_same_groups = (similarities > GROUP_CUT)

In [None]:
# Build submission: one row per test posting with its space-separated matches
results = []

for i, (test_is_same_group, index_result) in enumerate(zip(test_are_same_groups, indexes)):
    # posting_ids of the neighbours that passed the cutoff for row i
    row_results = set(df_test.index[index_result[test_is_same_group]])
    
    results.append({
        'posting_id': df_test.index[i],
        'matches': ' '.join(row_results)
    })
    
df_sub = pd.DataFrame(results)

In [None]:
# Write the submission file without the DataFrame's integer index
df_sub.to_csv('submission.csv', index=False)