1. Get all file list in bucket

In [2]:
from google.cloud import storage
from tqdm import tqdm

bucket = 'gs://efiss/data/product_images/'

def list_blobs(location):
    bucket = location.replace("gs://", "").split("/")[0]
    prefix = "/".join(location.replace("gs://", "").split("/")[1:])
    # Initialize a client
    client = storage.Client()

    # Get the bucket
    bucket = client.get_bucket(bucket)

    # List blobs in the bucket with the given prefix
    blobs = bucket.list_blobs(prefix=prefix)

    print("Blobs:", blobs)

    # return [blob.name for blob in blobs]
    for blob in blobs:
        yield blob.name


In [2]:
files_list = list_blobs(bucket)

with open('files_list_efiss.txt', 'w') as f:
    for item in tqdm(files_list):
        f.write("%s\n" % item)



Blobs: <google.api_core.page_iterator.HTTPIterator object at 0x7f94f44f0ed0>


6682310it [24:33, 4533.48it/s]


2. Get all file list in thumbnail bucket

In [3]:
thumbnail_list = list_blobs('gs://efiss/data/thumbnail/')

with open('thumbnail_list_efiss.txt', 'w') as f:
    for item in tqdm(thumbnail_list):
        f.write("%s\n" % item)



Blobs: <google.api_core.page_iterator.HTTPIterator object at 0x7f94d3339090>


3788746it [14:12, 4446.15it/s]


3. Get all file list in Database

In [3]:
from pymongo import MongoClient
from bson import ObjectId

# Connection string
connection_string = "mongodb+srv://efiss:...c@efiss.0ziet.mongodb.net/efiss"

# Connect to the MongoDB database
client = MongoClient(connection_string)
db = client.efiss

# Access the "products" collection
collection = db.products

# Retrieve the list of ObjectIDs from the collection
imgs: list[list[str]] = (doc['images'] for doc in collection.find({}, {'images': 1}))

with open('product_images_efiss.txt', 'w') as f:
    for item in tqdm(imgs):
        for i in item:
            f.write("%s\n" % i)

944400it [01:45, 8944.10it/s] 


Load back

In [18]:
# load
files_list = []
with open('files_list_efiss.txt', 'r') as f:
    for line in f:
        files_list.append(line.strip())

thumbnail_list = []
with open('thumbnail_list_efiss.txt', 'r') as f:
    for line in f:
        thumbnail_list.append(line.strip())
        
product_images = []
with open('product_images_efiss.txt', 'r') as f:
    for line in f:
        product_images.append(line.strip()[37:])

In [19]:
files_list = set(files_list)  # GCS
thumbnail_list = set(thumbnail_list)  # GCS thumbnail
product_images = set(product_images) # MongoDB

In [20]:
thumbnail_list = {i.replace("thumbnail", "product_images") for i in thumbnail_list}

In [21]:
len(thumbnail_list)

3788746

In [22]:
# get list of needed images to be thumbnail
to_be_thumbnail = files_list - thumbnail_list

In [23]:
len(to_be_thumbnail)

2893564

In [17]:
from tqdm import tqdm
with open('to_be_thumbnail.txt', 'w') as f:
    for item in tqdm(to_be_thumbnail):
        f.write("%s\n" % item)

  0%|          | 0/2893564 [00:00<?, ?it/s]

100%|██████████| 2893564/2893564 [00:01<00:00, 1874515.82it/s]


In [24]:
list(files_list)[:5]

['data/product_images/shopee-WYB99.vn/64dd6660e424a3f432851da5_6_shopee_WYB99_vn.jpeg',
 'data/product_images/shopee-okeydokey.vn/64dda91a02950026f341d41b_7_shopee_okeydokey_vn.jpeg',
 'data/product_images/shopee-Tie_Dye_Boutique/64db6eaf58970237c234b171_2_shopee_Tie_Dye_Boutique.jpeg',
 'data/product_images/shopee-Qun_o_Nam_Pon.mens/64d5205efc6c377e4ea1e3a8_7_shopee_Qun_o_Nam_Pon_mens.jpeg',
 'data/product_images/shopee-SKYMENFASHION_SHOP/64dbb7c8deede18b72f0f850_1_shopee_SKYMENFASHION_SHOP.jpeg']

In [25]:
len(files_list - product_images) # -> se ra anh tren GCS va k trong product nafo -> xoa anh tren GCS

47113

In [26]:
len(product_images - files_list) # -> se ra anh trong product nhung k co tren GCS -> xoa anh trong product MongoDB
# xong thi product [] de cho DA

40145

In [27]:
remove_from_GCS = files_list - product_images
remove_from_MongoDB = product_images - files_list

In [36]:
list(remove_from_MongoDB)[:5]

['',
 'data/product_images/www.muji.com/64494c4a49cef98e49779bb0_0_www_muji_com.jpg',
 'data/product_images/www.muji.com/64494c4b49cef98e4977a21c_2_www_muji_com.jpg',
 'data/product_images/www.muji.com/64494c4b49cef98e4977a6d5_8_www_muji_com.jpg',
 'data/product_images/www2.hm.com/644c5a311a1373b05d9c4e8c_10_www2_hm_com.jpg']

In [37]:
len(remove_from_GCS)

47113

In [34]:
to_be_remove_from_GCS = [ img for img in remove_from_GCS if img.strip() ]
len(to_be_remove_from_GCS)

47113

In [38]:
to_be_remove_from_MongoDB = { img for img in remove_from_MongoDB if img.strip() }
len(to_be_remove_from_MongoDB)

40144

In [29]:
list(remove_from_GCS)[:5]

['data/product_images/shopee-DOUMA.vn/64d4d2795fe046b1fb69d4b8_7_shopee_DOUMA_vn.jpeg',
 'data/product_images/shopee-Mt_knh_Vit_Thnh/64d3fb8a82f1ff9ead2fbb74_0_shopee_Mt_knh_Vit_Thnh.jpeg',
 'data/product_images/www2.hm.com/644c387d1a1373b05d9c49fd_2_www2_hm_com.webp',
 'data/product_images/www2.hm.com/644c1f651a1373b05d9c440f_6_www2_hm_com.webp',
 'data/product_images/shopee-Mt_knh_Vit_Thnh/64d3faec82f1ff9ead2fb823_7_shopee_Mt_knh_Vit_Thnh.jpeg']

In [39]:
with open('to_be_remove_from_GCS.txt', 'w') as f:
    for item in tqdm(to_be_remove_from_GCS):
        f.write("%s\n" % item)
        
with open('to_be_remove_from_MongoDB.txt', 'w') as f:
    for item in tqdm(to_be_remove_from_MongoDB):
        f.write("%s\n" % item)

100%|██████████| 47113/47113 [00:00<00:00, 773040.84it/s]
100%|██████████| 40144/40144 [00:00<00:00, 656411.60it/s]


Remove from MongoDB

In [50]:
from pymongo import MongoClient
from bson import ObjectId

# Connection string
connection_string = "mongodb+srv://efiss:...@efiss.0ziet.mongodb.net/efiss"

# Connect to the MongoDB database
client = MongoClient(connection_string)
db = client.efiss

# Access the "products" collection
collection = db.products

# Retrieve the list of ObjectIDs from the collection
imgs: list[list[str]] = (doc for doc in collection.find({}, {}))
img = next(imgs)

In [51]:
img

{'_id': ObjectId('64494c4949cef98e49779181'),
 'title': 'ÁO THUN THẤM HÚT MỒ HÔI NHANH KHÔ KHÔNG TAY NỮ XS VÀNG',
 'url': 'https://www.muji.com/vn/products/cmdty/detail/4550512291770',
 'price': 294000,
 'description': 'Được may từ vật liệu đặc trưng bởi khả năng co dãn và tạo cảm giác mềm mại. Phù hợp mặc hàng ngày. ',
 'images': ['https://storage.googleapis.com/efiss/data/product_images/www.muji.com/64494c4949cef98e49779181_0_www_muji_com.jpg',
  'https://storage.googleapis.com/efiss/data/product_images/www.muji.com/64494c4949cef98e49779181_1_www_muji_com.jpg',
  'https://storage.googleapis.com/efiss/data/product_images/www.muji.com/64494c4949cef98e49779181_2_www_muji_com.jpg',
  'https://storage.googleapis.com/efiss/data/product_images/www.muji.com/64494c4949cef98e49779181_3_www_muji_com.jpg',
  'https://storage.googleapis.com/efiss/data/product_images/www.muji.com/64494c4949cef98e49779181_4_www_muji_com.jpg',
  'https://storage.googleapis.com/efiss/data/product_images/www.muji.com/

In [61]:
import pymongo
from bson import ObjectId

# Connection string
connection_string = "mongodb+srv://efiss:...@efiss.0ziet.mongodb.net/efiss"

# Connect to MongoDB
client = pymongo.MongoClient(connection_string)
db = client.efiss
products_collection = db.products

# Remove images from documents
for image_url in remove_from_MongoDB:
    # Start a MongoDB session
    with client.start_session() as session:
        # Start a transaction
        session.start_transaction()

        try:
            # Extract the _oid from the URL
            product_oid = image_url.split("/")[-1].split("_")[0]

            image_url = "https://storage.googleapis.com/efiss/" + image_url

            if not product_oid:
                continue

            # Convert the _oid string to ObjectId
            product_oid = ObjectId(product_oid)

            print(f"Removing image {image_url} from product {product_oid}...")

            # Get the index of the image URL in the document field "images"
            obj = products_collection.find_one(
                {"_id": product_oid}, {"images": 1, "originalImages": 1}
            )
            print(obj)
            print(f"Current images: {obj['images']}")
            print(f"Original images: {obj['originalImages']}")
            
            
            image_index = obj["images"].index(image_url)
            print(f"Image index: {image_index}")
            
            # Remove the image URL from the document
            products_collection.update_one(
                {"_id": product_oid},
                {"$unset": {f"images.{image_index}": 1}},
                {"$pull": {"images": None}}
            )

            # if originalImages is not empty, remove the image URL from the document
            if obj["originalImages"]:
                products_collection.update_one(
                    {"_id": product_oid},
                    {"$unset": {f"originalImages.{image_index}": 1}},
                    {"$pull": {"originalImages": None}}
                )
            
            obj = products_collection.find_one(
                {"_id": product_oid}, {"images": 1}
            )

            # Remove the image from the document
            print(f"After removing image: {obj['images']}")
            print(f"After removing original image: {obj['originalImages']}")

            # Abort the transaction
            session.abort_transaction()

        except Exception as e:
            print(f"Error removing image {image_url}: {e}")
            session.abort_transaction()
    break
            
session.end_session()

Removing image https://storage.googleapis.com/efiss/data/product_images/www2.hm.com/644c1fb61a1373b05d9c4452_5_www2_hm_com.jpg from product 644c1fb61a1373b05d9c4452...
{'_id': ObjectId('644c1fb61a1373b05d9c4452'), 'images': ['https://storage.googleapis.com/efiss/data/product_images/www2.hm.com/644c1fb61a1373b05d9c4452_0_www2_hm_com.jpg', 'https://storage.googleapis.com/efiss/data/product_images/www2.hm.com/644c1fb61a1373b05d9c4452_4_www2_hm_com.jpg', 'https://storage.googleapis.com/efiss/data/product_images/www2.hm.com/644c1fb61a1373b05d9c4452_5_www2_hm_com.jpg'], 'originalImages': []}
Current images: ['https://storage.googleapis.com/efiss/data/product_images/www2.hm.com/644c1fb61a1373b05d9c4452_0_www2_hm_com.jpg', 'https://storage.googleapis.com/efiss/data/product_images/www2.hm.com/644c1fb61a1373b05d9c4452_4_www2_hm_com.jpg', 'https://storage.googleapis.com/efiss/data/product_images/www2.hm.com/644c1fb61a1373b05d9c4452_5_www2_hm_com.jpg']
Original images: []
Image index: 2
Error remo

In [None]:
thumbnail_list = list_blobs('gs://efiss/data/thumbnail/')

with open('thumbnail_list_efiss2.txt', 'w') as f:
    for item in tqdm(thumbnail_list):
        f.write("%s\n" % item)

0it [00:00, ?it/s]

Blobs: <google.api_core.page_iterator.HTTPIterator object at 0x7fed50113050>


6592202it [48:09, 2281.08it/s]


In [None]:
files_list = []
with open('files_list_efiss.txt', 'r') as f:
    for line in f:
        files_list.append(line.strip())

thumbnail_list = []
with open('thumbnail_list_efiss3.txt', 'r') as f:
    for line in f:
        thumbnail_list.append(line.strip())

In [None]:
files_list = set(files_list)  # GCS
thumbnail_list = set(thumbnail_list)  # GCS thumbnail

In [None]:
thumbnail_list = {i.replace("thumbnail", "product_images") for i in thumbnail_list}
print(len(thumbnail_list))
# get list of needed images to be thumbnail
to_be_thumbnail = files_list - thumbnail_list
print(len(to_be_thumbnail))

from tqdm import tqdm
with open('to_be_thumbnail2.txt', 'w') as f:
    for item in tqdm(to_be_thumbnail):
        f.write("%s\n" % item)

5734089
948221


100%|██████████| 948221/948221 [00:00<00:00, 2164481.83it/s]


In [None]:
!gsutil -m cp to_be_thumbnail2.txt gs://efiss/queue/

Copying file://to_be_thumbnail2.txt [Content-Type=text/plain]...
/ [1/1 files][ 85.0 MiB/ 85.0 MiB] 100% Done                                    
Operation completed over 1 objects/85.0 MiB.                                     


In [None]:
# danh Index -> loc ra filter

In [3]:
thumbnail_list = list_blobs('gs://efiss/data/thumbnail/')

with open('thumbnail_list_efiss3.txt', 'w') as f:
    for item in tqdm(thumbnail_list):
        f.write("%s\n" % item)

0it [00:00, ?it/s]

Blobs: <google.api_core.page_iterator.HTTPIterator object at 0x7fed50113050>


6592202it [48:09, 2281.08it/s]


In [4]:
files_list = []
with open('files_list_efiss.txt', 'r') as f:
    for line in f:
        files_list.append(line.strip())

thumbnail_list = []
with open('thumbnail_list_efiss3.txt', 'r') as f:
    for line in f:
        thumbnail_list.append(line.strip())

In [5]:
files_list = set(files_list)  # GCS
thumbnail_list = set(thumbnail_list)  # GCS thumbnail

In [6]:
thumbnail_list = {i.replace("thumbnail", "product_images") for i in thumbnail_list}
print(len(thumbnail_list))
# get list of needed images to be thumbnail
to_be_thumbnail = files_list - thumbnail_list
print(len(to_be_thumbnail))

from tqdm import tqdm
with open('to_be_thumbnail3.txt', 'w') as f:
    for item in tqdm(to_be_thumbnail):
        f.write("%s\n" % item)

6592202
90108


100%|██████████| 90108/90108 [00:00<00:00, 1611507.23it/s]


In [7]:
!gsutil -m cp to_be_thumbnail3.txt gs://efiss/queue/

Copying file://to_be_thumbnail3.txt [Content-Type=text/plain]...
- [1/1 files][  8.1 MiB/  8.1 MiB] 100% Done                                    
Operation completed over 1 objects/8.1 MiB.                                      


In [8]:
print(len(to_be_thumbnail))

90108


In [9]:
!wc -l *.txt

   6682310 files_list_efiss.txt
     11644 non_exist_imgs.txt
      2238 out_imgs.txt
   6675521 product_images_efiss.txt
         0 size.txt
   5734089 thumbnail_list_efiss2.txt
   6592202 thumbnail_list_efiss3.txt
   3788746 thumbnail_list_efiss.txt
    948221 to_be_thumbnail2.txt
     90108 to_be_thumbnail3.txt
   2893564 to_be_thumbnail.txt
  33418643 total


In [7]:
thumbnail_list = []
with open('to_be_thumbnail4.txt', 'r') as f:
    for line in f:
        thumbnail_list.append(line.strip())

In [8]:
thumbnail_list = set(thumbnail_list)
len(thumbnail_list)

46099

In [9]:
done_thumbnail = []
# with open('./thumbnailed3.txt', 'r') as f:
#     for line in f:
#         done_thumbnail.append(line.strip())
# print(len(done_thumbnail))
with open('./thumbnailed5.txt', 'r') as f:
    for line in f:
        done_thumbnail.append(line.strip())
print(len(done_thumbnail))

46156


In [10]:
done_thumbnail = set(done_thumbnail)
len(done_thumbnail)

46081

In [11]:
list(thumbnail_list)[:3]

['data/product_images/shopee-phuongmyt/64dc9ac3e0cc244fc9798d35_2_shopee_phuongmyt.jpeg',
 'data/product_images/shopee-lovito.vn/64626a28614eb2054535261a_7_shopee_lovito_vn.jpeg',
 'data/product_images/shopee-Mt_knh_Vit_Thnh/64d3faec82f1ff9ead2fb823_7_shopee_Mt_knh_Vit_Thnh.jpeg']

In [12]:
list(done_thumbnail)[:3]

['data/product_images/shopee-phuongmyt/64dc9ac3e0cc244fc9798d35_2_shopee_phuongmyt.jpeg',
 'data/product_images/shopee-lovito.vn/64626a28614eb2054535261a_7_shopee_lovito_vn.jpeg',
 'data/product_images/shopee-Mt_knh_Vit_Thnh/64d3faec82f1ff9ead2fb823_7_shopee_Mt_knh_Vit_Thnh.jpeg']

In [13]:
"data/product_images/shopee-MsMona.VN/64dc1245d0fdbcfe0ec504e6_5_shopee_MsMona_VN.jpeg" in thumbnail_list

False

In [15]:
to_be_thumbnail6 = thumbnail_list - done_thumbnail
len(to_be_thumbnail6)

18

In [17]:
from tqdm import tqdm
with open('to_be_thumbnail6.txt', 'w') as f:
    for item in tqdm(to_be_thumbnail6):
        f.write("%s\n" % item)

100%|██████████| 18/18 [00:00<00:00, 137268.13it/s]


In [None]:
# danh Index -> loc ra filter