# 0. dependencies

In [1]:
%pip install git+https://github.com/openai/CLIP.git
%pip install tqdm

import re
import clip
import torch

import numpy as np
import pandas as pd
from io import StringIO

from PIL import Image
from tqdm.notebook import trange
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms

from sklearn.metrics import f1_score, recall_score, precision_score, classification_report

import warnings
warnings.filterwarnings("ignore")

# Pick the compute device (GPU when one is visible to torch, CPU otherwise)
# and load the CLIP backbone together with its matching image preprocessing.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device is", device)
print(clip.available_models())
model, preprocess = clip.load('ViT-L/14@336px', device=device)

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to c:\users\19824\appdata\local\temp\pip-req-build-_h0r9liy
  Resolved https://github.com/openai/CLIP.git to commit b46f5ac7587d2e1862f8b7b1573179d80dcdd620



  Running command git clone -q https://github.com/openai/CLIP.git 'C:\Users\19824\AppData\Local\Temp\pip-req-build-_h0r9liy'


Note: you may need to restart the kernel to use updated packages.
device is cuda
['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']


# 1. load data 

## 1.1 load data from csv

In [2]:
# Locations of the cleaned train / test CSV files.
train = "../Input/df_train_aug_clean.csv"
test = "../Input/df_test_aug_clean.csv"


def _load_clean_csv(path):
    """Read one cleaned CSV into a DataFrame, treating "/" as the escape character."""
    with open(path) as handle:
        return pd.read_csv(StringIO(handle.read()), escapechar="/")


# train data
df_train = _load_clean_csv(train)

# test data
df_test = _load_clean_csv(test)

## 1.2 self defined dataset

In [3]:
# trainset
class Trainset(Dataset):
    """Training dataset of pre-computed CLIP features with multi-hot labels.

    Every image is randomly augmented, encoded with the CLIP image encoder and
    L2-normalised; every caption is tokenised, encoded with the CLIP text
    encoder and L2-normalised. The two embeddings are concatenated along dim 1
    to form one feature tensor per sample.

    Args:
        df: DataFrame with ``ImageID`` (file name under ``../Input/data/``),
            ``Caption`` and ``Labels`` (whitespace-separated class indices).
        num_classes: width of the multi-hot label vector.

    Items are ``(combined_feature, label)`` pairs.
    """

    def __init__(self, df, num_classes=20):

        print("initializing trainset...")

        n_samples = len(df)

        # image to image features
        self.image_raw = df.ImageID
        self.image_train = [None]*n_samples
        # the augmenter holds no per-image state, so build it once
        augmenter = transforms.RandAugment()
        for i in trange(n_samples):
            # positional access so a non-default DataFrame index still works;
            # the context manager closes the file handle once the image is decoded
            with Image.open("../Input/data/{}".format(self.image_raw.iloc[i])) as image:
                image_input = preprocess(augmenter(image)).unsqueeze(0).to(device)
            with torch.no_grad():
                image_features = model.encode_image(image_input)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            self.image_train[i] = image_features

        # caption to text features
        self.caption_raw = df.Caption
        self.caption_train = [None]*n_samples
        for i in trange(n_samples):
            text_inputs = clip.tokenize(self.caption_raw.iloc[i]).to(device)
            with torch.no_grad():
                text_features = model.encode_text(text_inputs)
            text_features /= text_features.norm(dim=-1, keepdim=True)
            self.caption_train[i] = text_features

        # concatenate image and text features
        # A plain list is used on purpose: storing the tensors in df.ImageID
        # would overwrite the caller's ImageID column in place.
        self.combined_feature = [
            torch.cat((image_vec, text_vec), 1)
            for image_vec, text_vec in zip(self.image_train, self.caption_train)
        ]

        # multi-hot encode labels
        self.labels_train = torch.zeros([n_samples, num_classes], dtype=torch.float32)
        for i in range(n_samples):
            # split() without an argument tolerates repeated / trailing spaces
            for j in str(df.Labels.iloc[i]).split():
                self.labels_train[i][int(j)] = 1

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.combined_feature)

    def __getitem__(self, idx):
        """Return the ``(combined_feature, label)`` pair at position ``idx``."""
        if idx >= len(self): raise IndexError
        return self.combined_feature[idx], self.labels_train[idx]

In [4]:
# testset
class Testset(Dataset):
    """Test dataset of pre-computed CLIP features (no labels).

    Every image is encoded with the CLIP image encoder and every caption with
    the CLIP text encoder; both embeddings are L2-normalised and concatenated
    along dim 1 to form one feature tensor per sample. No augmentation is
    applied.

    Args:
        df: DataFrame with ``ImageID`` (file name under ``../Input/data/``)
            and ``Caption`` columns.

    Items are the combined feature tensors.
    """

    def __init__(self, df):

        print("initializing testset...")

        n_samples = len(df)

        # image to image features
        self.image_raw = df.ImageID
        self.image_test = [None]*n_samples
        for i in trange(n_samples):
            # positional access so a non-default DataFrame index still works;
            # the context manager closes the file handle once the image is decoded
            with Image.open("../Input/data/{}".format(self.image_raw.iloc[i])) as image:
                image_input = preprocess(image).unsqueeze(0).to(device)
            with torch.no_grad():
                image_features = model.encode_image(image_input)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            self.image_test[i] = image_features

        # caption to text features
        self.caption_raw = df.Caption
        self.caption_test = [None]*n_samples
        for i in trange(n_samples):
            text_inputs = clip.tokenize(self.caption_raw.iloc[i]).to(device)
            with torch.no_grad():
                text_features = model.encode_text(text_inputs)
            text_features /= text_features.norm(dim=-1, keepdim=True)
            self.caption_test[i] = text_features

        # concatenate image and text features
        # A plain list is used on purpose: storing the tensors in df.ImageID
        # would overwrite the caller's ImageID column in place.
        self.combined_feature = [
            torch.cat((image_vec, text_vec), 1)
            for image_vec, text_vec in zip(self.image_test, self.caption_test)
        ]

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.combined_feature)

    def __getitem__(self, idx):
        """Return the combined feature tensor at position ``idx``."""
        if idx >= len(self): raise IndexError
        return self.combined_feature[idx]

## 1.3 load into dataloader and encode with clip

In [5]:
# Encode the full training frame once, then hold out a validation subset.
full_train_set = Trainset(df_train)

# Derive the split sizes from the data instead of hard-coding 29000/1000, which
# raises unless the frame has exactly 30000 rows. For 30000 rows this still
# yields the original 29000/1000 split.
val_size = min(1000, len(full_train_set) // 10)
train_set, val_set = torch.utils.data.random_split(
    full_train_set, [len(full_train_set) - val_size, val_size]
)
print(len(train_set), len(val_set))

test_set = Testset(df_test)
batch_size = 16
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

initializing trainset...


  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

29000 1000
initializing testset...


  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

# 2. classification model

## 2.1 model structure

In [6]:
class Net(nn.Module):
    """Three-layer MLP classification head over concatenated features.

    Args:
        in_features: size of the input feature vector (default ``768*2``).
        num_classes: number of output logits (default ``18``).

    ``forward`` returns raw logits; no sigmoid is applied here.
    """

    def __init__(self, in_features=768*2, num_classes=18):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 2048)
        self.fc2 = nn.Linear(2048, 1024)
        self.fc = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Map features to logits.

        Input ``(B, 1, in_features)`` gives ``(B, num_classes)``; input
        ``(1, in_features)`` gives ``(num_classes,)``.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc(x)
        # Squeeze only the singleton feature axis. A bare torch.squeeze(x) would
        # also drop the batch axis when B == 1, so a final batch of one sample
        # would no longer match its (1, num_classes) target.
        if x.dim() >= 2:
            x = x.squeeze(-2)
        return x

## 2.2 model initialization & optimizer

In [7]:
# Training hyper-parameters.
lr = 1e-3
n_epochs = 10

# Classification head, placed on the same device as the CLIP features.
net = Net().to(device)

# Multi-label objective applied to raw logits, optimised with Adam.
# (alternative that was considered: torch.nn.MultiLabelSoftMarginLoss())
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)

# 3. training

## 3.1 make prediction

In [8]:
def predict(threshold=0.53):
    """Predict a label string for every sample in ``test_set``.

    The network emits 18 logits that stand for labels 1-11 and 13-19 (labels 0
    and 12 are never predicted). A label is selected when its sigmoid
    probability exceeds ``threshold``. If no label passes, the most probable
    label other than label 1 is used instead.

    Args:
        threshold: probability cut-off for a positive label.

    Returns:
        A list with one space-separated label string per test sample.
    """

    def to_label(position):
        # output positions 0..10 -> labels 1..11, positions 11..17 -> labels 13..19
        return position + 1 if position <= 10 else position + 2

    result = []
    with torch.no_grad():
        for features in test_set:
            # calculate outputs by running the features through the network
            probs = torch.sigmoid(net(features.float())).reshape(-1)
            positives = torch.nonzero(probs > threshold).flatten().tolist()

            if positives:
                result.append(" ".join(str(to_label(k)) for k in positives))
            else:
                # when no positive predictions, take the most probable label,
                # excluding label 1 (position 0)
                probs[0] = -100
                result.append(str(to_label(int(torch.argmax(probs)))))

        print("Finished predicting")
        return result

## 3.2 save result to csv

In [9]:
def save_result(result):
    """Write the predicted labels next to their image ids in '../Output/result.csv'.

    The test CSV is read again from disk; double quotes that sit inside a field
    are prefixed with "/" so that pandas parses them as escaped characters.

    Args:
        result: one label string per row of the test CSV, in file order.
    """
    with open(test) as file:
        escaped_text = ''.join(
            re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file
        )
    test_frame = pd.read_csv(StringIO(escaped_text), escapechar="/")

    submission = pd.DataFrame({"ImageID": test_frame.ImageID, 'Labels': result})
    submission.to_csv('../Output/result.csv', index=False)

    print("successfully saved to 'result.csv'")

## 3.3 validation function

In [10]:
# best samples-averaged F1 seen so far on the validation split
best_acc = 0
def validate():
    """Score the network on ``val_set`` and save test predictions on a new best.

    Prints the samples-averaged F1, recall and precision plus the per-class
    classification report. When the F1 is at least as high as the best value
    seen so far, ``predict()`` is run and its output is written with
    ``save_result()``.
    """
    global best_acc

    threshold = 0.53  # probability cut-off for a positive label
    y_true = []
    y_pred = []
    with torch.no_grad():
        # labels 0 and 12 are never predicted; pad them back in as zeros so the
        # prediction lines up with the 20-wide label vector
        pad = torch.zeros(1).to(device)
        for features, labels in val_set:
            # calculate outputs by running the features through the network
            probs = torch.sigmoid(net(features.float())).reshape(-1)
            hits = (probs > threshold).float()
            padded = torch.cat((pad, hits[:11], pad, hits[11:]))
            y_true.append(labels.tolist())
            y_pred.append(padded.tolist())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # compute each metric once and reuse it
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average='samples')
    recall = recall_score(y_true=y_true, y_pred=y_pred, average='samples')
    precision = precision_score(y_true=y_true, y_pred=y_pred, average='samples')

    print('Average F1 score of the network: {:.4f}'.format(f1))
    print('Average recall_score score of the network: {:.4f}'.format(recall))
    print('Average precision_score score of the network: {:.4f}'.format(precision))
    print(classification_report(y_true=y_true, y_pred=y_pred))

    # save prediction result to csv if this evaluation performs the best
    if f1 >= best_acc:
        best_acc = f1
        final_result = predict()
        save_result(final_result)
        print("Yet the Best epoch, saving result. BEST F1:",best_acc,"\n")

## 3.4 training process
After running this section, the final prediction results are saved to result.csv.

In [11]:
# number of mini-batches between loss reports / validation runs
LOG_EVERY = 100

for epoch in trange(n_epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, (features, labels) in enumerate(train_loader):
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # drop label columns 0 and 12 so the 20-wide target matches the
        # 18 logits produced by the network
        label_renewed = torch.cat((labels[:, 1:12], labels[:, 13:]), 1)

        # forward + backward + optimize
        # reshape keeps logits and targets aligned even when the last batch
        # holds a single sample and the network returns a 1-D tensor
        outputs = net(features.float()).reshape(label_renewed.shape)
        loss = criterion(outputs, label_renewed)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if (i + 1) % LOG_EVERY == 0:
            print(f'[{epoch + 1}/{i + 1:5d}] loss: {running_loss / LOG_EVERY:.3f}')
            validate()
            running_loss = 0.0

print('Finished Training')

  0%|          | 0/10 [00:00<?, ?it/s]

[1/  100] loss: 0.198
Average F1 score of the network: 0.6629
Average recall_score score of the network: 0.6175
Average precision_score score of the network: 0.7690
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.89      0.96      0.93       761
           2       0.00      0.00      0.00        35
           3       0.60      0.27      0.38       153
           4       0.00      0.00      0.00        42
           5       1.00      0.13      0.24        30
           6       0.00      0.00      0.00        38
           7       0.00      0.00      0.00        39
           8       0.00      0.00      0.00        77
           9       0.00      0.00      0.00        44
          10       0.00      0.00      0.00        48
          11       0.00      0.00      0.00        13
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00        19
          14       0.00 

In [None]:
# Run all of the cells above to produce the result.

# The following sections are for exploratory purposes only. Running them might affect the final result.

# 4. machine learning approach

## 4.1 dependencies

In [None]:
%pip install scikit-multilearn
import warnings
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# suppress warnings
warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"

## 4.2 prepare data

In [None]:
# `train_set` may be the Subset returned by random_split, which exposes the
# wrapped dataset as `.dataset` and the chosen rows as `.indices`; fall back to
# the object itself when it is still the full Trainset.
base_set = getattr(train_set, "dataset", train_set)
row_ids = list(getattr(train_set, "indices", range(len(base_set))))

# Stack the per-sample feature tensors into one (n_samples, n_features) matrix.
# The width is taken from the data rather than hard-coded, and tensors are
# moved to the CPU first because numpy cannot read device memory.
train_x = np.stack(
    [base_set.combined_feature[i].squeeze().cpu().numpy() for i in row_ids]
).astype(np.float64)
train_y = base_set.labels_train[row_ids].cpu().numpy()
print(train_x.shape, train_y.shape)

## 4.3 methods tryout

### 4.3.1 knn

In [None]:
# k-nearest-neighbours: grid-search the neighbourhood size,
# scored with the samples-averaged F1
score = 'f1_samples'
parameters = {'n_neighbors': (3, 5, 10, 20)}
clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parameters,
    scoring=score,
    verbose=10,
)
clf.fit(train_x, train_y)
print(clf.cv_results_)

### 4.3.2 random forest

In [None]:
# random forest: grid-search tree depth and ensemble size,
# scored with the samples-averaged F1
score = 'f1_samples'
parameters = {'max_depth': (1, 3, 5, 10), "n_estimators": (50, 100 ,200)}
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parameters,
    scoring=score,
    verbose=10,
)
clf.fit(train_x, train_y)
print(clf.cv_results_)

## 4.4 make predictions

In [None]:
# make predictions
# Gather every test feature into one (n_samples, n_features) matrix. Tensors
# are moved to the CPU first because numpy cannot read device memory.
test_x = np.concatenate(
    [
        test_set.combined_feature[i].cpu().numpy().reshape(1, -1)
        for i in trange(len(test_set))
    ]
).astype(np.float64)

# one batched call instead of one predict() call per sample
predicted_rows = clf.predict(test_x)

# turn each multi-hot row into a space-separated string of label indices
predictions = [
    " ".join(str(j) for j in np.flatnonzero(row == 1))
    for row in predicted_rows
]

## 4.5 save result

In [None]:
# Re-read the test CSV, prefixing in-field double quotes with "/" so pandas
# treats them as escaped characters, then write the image ids next to the
# predicted labels.
with open(test) as file:
    escaped_text = ''.join(
        re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file
    )
df2 = pd.read_csv(StringIO(escaped_text), escapechar="/")

df = pd.DataFrame({"ImageID": df2.ImageID, 'Labels': predictions})
df.to_csv('result_ml.csv', index=False)
print("successfully saved to 'result_ml.csv'")

# 5. car detection

In [None]:
%pip install --upgrade tencentcloud-sdk-python

import json
import base64
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.tiia.v20190529 import tiia_client, models

def get_result(image_base64):
    """Send one base64-encoded image to the Tencent Cloud RecognizeCarPro API.

    Credentials are read from the ``TENCENTCLOUD_SECRET_ID`` and
    ``TENCENTCLOUD_SECRET_KEY`` environment variables (empty strings when
    unset) so that secrets never have to be written into the notebook.

    Args:
        image_base64: the image content as a base64 string.

    Returns:
        True when the request succeeded (the decoded response is printed),
        False when the SDK raised a ``TencentCloudSDKException``.
    """
    import os

    try:
        cred = credential.Credential(
            os.environ.get("TENCENTCLOUD_SECRET_ID", ""),
            os.environ.get("TENCENTCLOUD_SECRET_KEY", ""),
        )
        httpProfile = HttpProfile()
        httpProfile.endpoint = "tiia.tencentcloudapi.com"

        clientProfile = ClientProfile()
        clientProfile.httpProfile = httpProfile
        client = tiia_client.TiiaClient(cred, "ap-guangzhou", clientProfile)

        req = models.RecognizeCarProRequest()
        params = {"ImageBase64": image_base64}
        req.from_json_string(json.dumps(params))

        resp = json.loads(client.RecognizeCarPro(req).to_json_string())
        # NOTE(review): the response is only printed; callers receive a bool
        print(resp)

        return True

    except TencentCloudSDKException as err:
        # report the failure instead of swallowing it silently
        print("RecognizeCarPro request failed:", err)
        return False



In [None]:
# Run the car-recognition API over the first 30 test images and keep the
# (image id, result) pairs for which the call succeeded.
api_result = []
with open(test) as file:
    df_test_api = pd.read_csv(StringIO(file.read()), escapechar="/")[:30]

    image_raw = df_test_api.ImageID
    for i in trange(len(df_test_api)):
        image_path = "./data/{}".format(image_raw[i])
        with open(image_path, "rb") as image_file:
            image_bytes = image_file.read()
            image_base64 = base64.encodebytes(image_bytes).decode("utf-8")
            result = get_result(image_base64)
            if not result:
                continue
            api_result.append((image_raw[i], result))