In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
from torchvision import models
from torchvision import transforms as tt
import numpy as np
import pandas as pd
import random
from PIL import Image

In [None]:
import gc

In [None]:
use_cuda = torch.cuda.is_available()
device = 'cuda' if use_cuda else 'cpu'

In [None]:
import pandas as pd

def get_file_paths(path, base_path, image_path_postfix, data_type = "Train"):
  df = pd.read_csv(path)
  image_id_list, image_path_list, eye_lvl_list = list(df["image_id"]), list(df["image_path"]), list(df["eye_DR_Level"])
  image_path_list = [s.replace('\\', '/') for s in image_path_list]

  image_path_list_final = []
  for s in image_path_list:
    splits = s.split('/')
    sf = '/' + splits[1] + '/' + image_path_postfix
    postfix = '/'.join(splits[2:])
    sf = sf + '/' + postfix
    image_path_list_final.append(sf)

  if data_type == "Train":
    n = len(image_path_list)
    counts = [0 for i in range(5)]
    thresh = 150 # max(counts)
    for it in eye_lvl_list:
      counts[it] += 1 #

    diff = [max(thresh, counts[i]) - counts[i] for i in range(len(counts))]
    make_per_image = [max(1,diff[i]//counts[i]) for i in range(len(counts))]

    image_id_list_final, image_path_list_train, eye_lvl_list_final = [], [], []

    for i in range(n):
      # Impliment early stopping as well
      to_make = make_per_image[eye_lvl_list[i]]
      d = diff[eye_lvl_list[i]]
      image_id_list_final.append(image_id_list[i])
      image_path_list_train.append(image_path_list_final[i])
      eye_lvl_list_final.append(eye_lvl_list[i])

      if d > 0:
        if d < to_make:
          to_make = 1
        d -= 1
        for j in range(to_make):
          image_path_list_train.append(image_path_list_final[i])
          eye_lvl_list_final.append(eye_lvl_list[i])
          name = image_id_list[i].split('.')[0]
          name += "_{}.jpg".format(j+1)
          image_id_list_final.append(name)

        diff[eye_lvl_list[i]] = d
    file_paths = [(image_id_list_final[i], base_path + image_path_list_train[i], eye_lvl_list_final[i]) for i in range(len(image_id_list_final))]

  else:
    file_paths = [(image_id_list[i], base_path + image_path_list_final[i], eye_lvl_list[i]) for i in range(len(image_path_list_final))]
  return file_paths

In [None]:
def make_transform(pattern, flag):
  random.seed(0)
  tsfm = tt.RandomChoice(
                  [tt.RandomHorizontalFlip(),
                  tt.RandomVerticalFlip(),
                  tt.RandomRotation(30)
                  ]
                )
  pattern = (pattern - np.min(pattern))/(np.max(pattern) - np.min(pattern))
  pattern = np.moveaxis(pattern,2,0)
  pattern_tensor = torch.from_numpy(pattern).float()
  pattern_tensor = tt.Resize((512, 512))(pattern_tensor)
  if flag:
    pattern_tensor = tsfm(pattern_tensor)
  return pattern_tensor

In [None]:
class EmbeddingDataset(torch.utils.data.Dataset):
    def __init__(self, file_paths, file_ids):
        super(EmbeddingDataset, self,).__init__()

        self.file_paths = file_paths
        self.transform = make_transform
        self.file_ids = file_ids

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        img = np.array(Image.open(self.file_paths[idx]))
        n = len(self.file_ids[idx].split('_'))
        aug_flag = False
        if n == 3:
          aug_flag = True
        return self.transform(img, aug_flag)

In [None]:
base_path = "/content/drive/MyDrive/Fundus"

In [None]:
drid_path = base_path + "/DeepDRiD"
train_csv_path = drid_path + "/training_data.csv"
valid_csv_path = drid_path + "/validation_data.csv"
image_path_prefix = "Images"


In [None]:
file_items_train = get_file_paths(train_csv_path, drid_path, image_path_prefix)

In [None]:
print(len(file_items_train))
print(file_items_train[:3])

806
[('5_l1', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/5/5_l1.jpg', 0), ('5_l2', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/5/5_l2.jpg', 0), ('7_l2', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/7/7_l2.jpg', 0)]


In [None]:
print(file_items_train[300:311])

[('207_l1_1.jpg', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/207/207_l1.jpg', 1), ('215_l1', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/215/215_l1.jpg', 1), ('215_l1_1.jpg', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/215/215_l1.jpg', 1), ('226_l1', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/226/226_l1.jpg', 1), ('226_l1_1.jpg', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/226/226_l1.jpg', 1), ('226_l2', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/226/226_l2.jpg', 1), ('226_l2_1.jpg', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/226/226_l2.jpg', 1), ('278_l2', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/278/278_l2.jpg', 1), ('278_l2_1.jpg', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/278/278_l2.jpg', 1), ('282_r1', '/content/drive/MyDrive/Fundus/

In [None]:
file_items_valid = get_file_paths(valid_csv_path, drid_path, image_path_prefix, data_type = "Valid")

batch_size = 10

In [None]:
print(len(file_items_valid))

58


In [None]:
file_paths_train = [i[1] for i in file_items_train]
file_ids_train = [i[0] for i in file_items_train]

file_paths_valid = [i[1] for i in file_items_valid]
file_ids_valid = [i[0] for i in file_items_valid]

In [None]:
print(len(file_ids_train))

806


In [None]:
ckpt_path = base_path + '/EyePACS/resnet50_128_08.pt'

weights = torch.load(ckpt_path)
model = models.resnet50()
# Weights of fully connected layer are removed in the file, so set strict to be False.
model.load_state_dict(weights, strict=False)
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
model = model.to(device)

In [None]:
train_data = EmbeddingDataset(file_paths_train, file_ids_train)


train_loader = torch.utils.data.DataLoader(train_data,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          num_workers=2,
                                          pin_memory=True)

del train_data
gc.collect()

138

In [None]:
patterns = []
with torch.no_grad():
  for data in train_loader:
    data = data.to(device)
    pattern_out = model(data)
    pattern_out = pattern_out.cpu().numpy()
    patterns.append(pattern_out)



In [None]:
print(len(patterns))
print(len(patterns[0]))

81
10


In [None]:
del train_loader
gc.collect()

patterns = np.vstack(patterns)

In [None]:
print(len(patterns))

806


In [None]:
print(patterns.shape)

(806, 1000)


In [None]:
np.save(drid_path + '/train_patterns_augmented.npy', patterns)

In [None]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-2.2.2-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.1/179.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting loguru>=0.5.0 (from pinecone-client)
  Downloading loguru-0.7.0-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting dnspython>=2.0.0 (from pinecone-client)
  Downloading dnspython-2.4.0-py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.0/300.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore>=0.17.3 (from dnspython>=2.0.0->pinecone-client)
  Downloading httpcore-0.17.3-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.5/74.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting h11<0.15,>=0.13 (from httpcore>=0.17.3->dnspython>=2.0.0->pinecone-client)
  Downloadin

In [None]:
import os
import requests

import tqdm
import httpimport
import pinecone
import numpy as np

  from tqdm.autonotebook import tqdm


In [None]:
os.environ["PINECONE_EXAMPLE_API_KEY"] = "5d09425a-8d43-4833-8d44-9113d452c3ed"

In [None]:
DATA_DIRECTORY = 'pinecone'
INDEX_NAME = 'aug-1'
INDEX_DIMENSION = 1000
BATCH_SIZE=20

In [None]:
pinecone.init(os.environ["PINECONE_EXAMPLE_API_KEY"], environment='us-west1-gcp-free')

if INDEX_NAME not in pinecone.list_indexes():
  pinecone.create_index(name=INDEX_NAME, dimension=INDEX_DIMENSION)

index = pinecone.Index(INDEX_NAME)

In [None]:
base_path = "/content/drive/MyDrive/Fundus"
drid_path = base_path + "/DeepDRiD"

In [None]:
train_dataset_path = drid_path + "/train_patterns_augmented.npy"
train_ds = np.load(train_dataset_path)
print(train_ds.shape)

(806, 1000)


In [None]:
def make_batches(list_obj, batch_size):
  j = 0
  i = batch_size
  if i > len(list_obj):
    return [list_obj]
  toret = []
  for i in range(batch_size, len(list_obj), batch_size):
    toret.append(list_obj[j : i])
    j = i
  return toret

In [None]:
l_obj = []
for i in range(len(file_items_train)):
  embedding = train_ds[i]
  embedding = embedding.tolist()
  id = file_items_train[i][0]
  label = file_items_train[i][2]
  l_obj.append((id, embedding, {'label' : label}))

batches = make_batches(l_obj, BATCH_SIZE)

In [None]:
print(len(batches))
print(len(batches[0]))

40
20


In [None]:
for batch in tqdm.tqdm(batches):
  index.upsert(batch)

100%|██████████| 40/40 [00:07<00:00,  5.54it/s]


In [None]:
valid_dataset_path = drid_path + '/valid_patterns.npy'
valid_ds = np.load(valid_dataset_path)
print(valid_ds.shape)

(58, 1000)


In [None]:
q1 = valid_ds[0].tolist()
response = index.query(q1, top_k=5, include_metadata=True)
print(response)

{'matches': [{'id': '20_l2',
              'metadata': {'label': 0.0},
              'score': 0.0917797163,
              'values': []},
             {'id': '272_l2',
              'metadata': {'label': 0.0},
              'score': 0.0846818462,
              'values': []},
             {'id': '48_r2',
              'metadata': {'label': 0.0},
              'score': 0.0837313682,
              'values': []},
             {'id': '123_l2',
              'metadata': {'label': 3.0},
              'score': 0.0832095519,
              'values': []},
             {'id': '123_l2_1.jpg',
              'metadata': {'label': 3.0},
              'score': 0.0799304619,
              'values': []}],
 'namespace': ''}


In [None]:
print(file_items_valid[0])

('60_r1', '/content/drive/MyDrive/Fundus/DeepDRiD/regular-fundus-training/Images/60/60_r1.jpg', 0)


In [None]:
from sklearn.metrics import classification_report

In [None]:
top_1 = []
top_5 = []
top_1_ids = []
top_5_ids = []

nbr_ids = []
nbr_labels = []
nbr_scores = []

for i in range(len(valid_ds)):
  q = valid_ds[i].tolist()
  resp = index.query(q, top_k = 5, include_metadata = True)
  nbrs_list = resp["matches"]
  top_1.append(int(nbrs_list[0]["metadata"]["label"]))
  top_1_ids.append(nbrs_list[0]["id"])

  gnd_val = file_items_valid[i][2]
  top_5_flag = True

  tmp_labels, tmp_ids, tmp_scores = [],[],[]
  for nbr in nbrs_list:
    if top_5_flag and int(nbr["metadata"]["label"]) == gnd_val:
      top_5.append(int(nbr["metadata"]["label"]))
      top_5_ids.append(nbr["id"])
      top_5_flag = False
    tmp_labels.append(int(nbr["metadata"]["label"]))
    tmp_ids.append(nbr["id"])
    tmp_scores.append(nbr["score"])

  nbr_labels.append(tmp_labels)
  nbr_ids.append(tmp_ids)
  nbr_scores.append(tmp_scores)

gnd_items = [file_items_valid[i][2] for i in range(len(file_items_valid))]

In [None]:
print(classification_report(gnd_items, top_1, digits = 4))

              precision    recall  f1-score   support

           0     0.4286    0.9545    0.5915        22
           1     0.0000    0.0000    0.0000         8
           2     0.5000    0.1333    0.2105        15
           3     0.5000    0.1000    0.1667        10
           4     0.0000    0.0000    0.0000         3

    accuracy                         0.4138        58
   macro avg     0.2857    0.2376    0.1937        58
weighted avg     0.3781    0.4138    0.3076        58



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
classification_report(gnd_items, top_5, digits = 4)

ValueError: ignored