In [2]:
import re
import constants
import os
import pandas as pd
import multiprocessing
import time
from tqdm import tqdm
from pathlib import Path
from functools import partial
import urllib.request
from PIL import Image

def common_mistake(unit):
    """Map a commonly misspelled unit name onto its allowed spelling.

    Returns ``unit`` unchanged when it is already in
    ``constants.allowed_units``. Otherwise two substitutions are tried in
    order ('ter' -> 'tre', then 'feet' -> 'foot') and the first result that
    is an allowed unit is returned. If neither helps, ``unit`` is returned
    as given.
    """
    if unit in constants.allowed_units:
        return unit
    # Ordered (wrong, right) substring pairs; the first hit wins.
    for wrong, right in (('ter', 'tre'), ('feet', 'foot')):
        candidate = unit.replace(wrong, right)
        if candidate in constants.allowed_units:
            return candidate
    return unit

def parse_string(s):
    """Parse a ``"<number> <unit>"`` string into a ``(number, unit)`` pair.

    ``None``, any value whose ``str()`` is ``'nan'``, and blank strings give
    ``(None, None)``. Otherwise the stripped text must be an optionally
    negative integer or decimal, then whitespace, then letters/whitespace.
    The unit part is passed through ``common_mistake`` and must end up in
    ``constants.allowed_units``.

    Raises:
        ValueError: if the text does not have that shape, or the unit is not
            an allowed unit.
    """
    missing = s is None or str(s) == 'nan'
    text = "" if missing else s.strip()
    if text == "":
        return None, None

    if re.match(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$', text) is None:
        raise ValueError(f"Invalid format in {s}")

    # Split once: the first token is the number, the remainder is the unit.
    pieces = text.split(maxsplit=1)
    number = float(pieces[0])
    unit = common_mistake(pieces[1])
    if unit in constants.allowed_units:
        return number, unit
    raise ValueError(f"Invalid unit [{unit}] found in {s}. Allowed units: {constants.allowed_units}")

def create_placeholder_image(image_save_path):
    """Save a 100x100 solid black RGB image at ``image_save_path``.

    Any exception raised while creating or saving the image is caught and
    reported with ``print``; nothing is re-raised and nothing is returned.
    """
    try:
        Image.new('RGB', (100, 100), color='black').save(image_save_path)
    except Exception as err:
        print(f"Error creating placeholder image: {err}")

def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into ``save_folder``, falling back to a placeholder.

    The target file name is the last path component of ``image_link``. Links
    that are not non-blank strings are skipped, and a file that already
    exists at the target path is left untouched.

    Each attempt downloads to a temporary ``.part`` file next to the target
    and moves it into place only after the transfer completed. A failed or
    interrupted transfer therefore never leaves a truncated file at the
    target path, where a later run would mistake it for a finished download.

    Args:
        image_link: URL of the image to fetch.
        save_folder: Directory the image is written to.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        None. Failures are reported with ``print`` rather than raised. When
        every attempt fails, ``create_placeholder_image`` is called for the
        target path.
    """
    if not isinstance(image_link, str) or not image_link.strip():
        print(f"Skipping invalid image link: {image_link}")
        return

    try:
        filename = Path(image_link).name
        image_save_path = os.path.join(save_folder, filename)

        if os.path.exists(image_save_path):
            print(f"Image already exists: {image_save_path}")
            return

        # The process id keeps concurrent worker processes that were handed
        # the same link from writing to the same temporary file.
        temp_path = f"{image_save_path}.{os.getpid()}.part"

        for attempt in range(retries):
            try:
                urllib.request.urlretrieve(image_link, temp_path)
                # Publish the file only once it is complete.
                os.replace(temp_path, image_save_path)
                return
            except Exception as e:
                print(f"Error downloading {image_link}: {e}")
                # Waiting is only useful when another attempt follows.
                if attempt != retries - 1:
                    time.sleep(delay)
            finally:
                # Runs on failure and on interruption (e.g. KeyboardInterrupt);
                # after a successful os.replace there is nothing left to remove.
                if os.path.exists(temp_path):
                    try:
                        os.remove(temp_path)
                    except OSError:
                        # Best-effort cleanup; a stray .part file is never
                        # treated as a downloaded image.
                        pass

        # Create a placeholder image if all download attempts fail
        create_placeholder_image(image_save_path)
    except Exception as e:
        print(f"Unhandled error with link {image_link}: {e}")

def download_images(image_links, download_folder, allow_multiprocessing=True):
    """Download every link in ``image_links`` into ``download_folder``.

    Args:
        image_links: Sized collection of image URLs; ``len()`` is used for
            the progress-bar total.
        download_folder: Destination directory; created if it is missing.
        allow_multiprocessing: When true, the links are handed to a
            ``multiprocessing.Pool``; otherwise they are processed one by one
            in the current process.

    Returns:
        None.
    """
    # exist_ok replaces the racy "check, then create" sequence.
    os.makedirs(download_folder, exist_ok=True)

    # One per-link callable shared by both execution modes, so the retry
    # settings cannot drift apart between them.
    download_one = partial(
        download_image, save_folder=download_folder, retries=3, delay=3)

    if allow_multiprocessing:
        # Set the number of workers to 60 or lower to avoid the issue on Windows
        # NOTE(review): presumably the Windows limit on handles a process can
        # wait on at once — confirm.
        num_workers = min(60, multiprocessing.cpu_count())

        with multiprocessing.Pool(num_workers) as pool:
            # list() drains the lazy imap iterator so every link is processed.
            list(tqdm(pool.imap(download_one, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_one(image_link)

# Step 1: Read cleaned data from CSV
cleaned_csv_path = r"D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\Weightt_cleaned.csv"

# Load the CSV file into a DataFrame, ensuring low_memory is set to False to handle mixed data types properly
df = pd.read_csv(cleaned_csv_path, low_memory=False)

# Step 2: Extract image links from the 'image_link' column
# Drop missing values BEFORE converting to str: astype(str) turns NaN into the
# literal string 'nan', which dropna() no longer recognises and which would
# then be passed on as if it were a link. Afterwards strip surrounding whitespace.
image_links = df['image_link'].dropna().astype(str).str.strip().tolist()

# Step 3: Set the folder where images will be downloaded
download_folder = r"D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images"

# Step 4: Download images using the provided function
download_images(image_links, download_folder, allow_multiprocessing=False)


  5%|▍         | 6197/131187 [20:06<6:26:32,  5.39it/s] 

Error downloading https://m.media-amazon.com/images/I/DzP2RMRQO0.jpg: HTTP Error 400: Bad Request
Error downloading https://m.media-amazon.com/images/I/DzP2RMRQO0.jpg: HTTP Error 400: Bad Request
Error downloading https://m.media-amazon.com/images/I/DzP2RMRQO0.jpg: HTTP Error 400: Bad Request


 14%|█▍        | 19021/131187 [1:10:35<6:56:14,  4.49it/s] 


KeyboardInterrupt: 

In [1]:
import re
import constants
import os
import pandas as pd
import multiprocessing
import time
from tqdm import tqdm
from pathlib import Path
from functools import partial
import urllib.request
from PIL import Image

def common_mistake(unit):
    """Map a commonly misspelled unit name onto its allowed spelling.

    Returns ``unit`` unchanged when it is already in
    ``constants.allowed_units``. Otherwise two substitutions are tried in
    order ('ter' -> 'tre', then 'feet' -> 'foot') and the first result that
    is an allowed unit is returned. If neither helps, ``unit`` is returned
    as given.
    """
    if unit in constants.allowed_units:
        return unit
    # Ordered (wrong, right) substring pairs; the first hit wins.
    for wrong, right in (('ter', 'tre'), ('feet', 'foot')):
        candidate = unit.replace(wrong, right)
        if candidate in constants.allowed_units:
            return candidate
    return unit

def parse_string(s):
    """Parse a ``"<number> <unit>"`` string into a ``(number, unit)`` pair.

    ``None``, any value whose ``str()`` is ``'nan'``, and blank strings give
    ``(None, None)``. Otherwise the stripped text must be an optionally
    negative integer or decimal, then whitespace, then letters/whitespace.
    The unit part is passed through ``common_mistake`` and must end up in
    ``constants.allowed_units``.

    Raises:
        ValueError: if the text does not have that shape, or the unit is not
            an allowed unit.
    """
    missing = s is None or str(s) == 'nan'
    text = "" if missing else s.strip()
    if text == "":
        return None, None

    if re.match(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$', text) is None:
        raise ValueError(f"Invalid format in {s}")

    # Split once: the first token is the number, the remainder is the unit.
    pieces = text.split(maxsplit=1)
    number = float(pieces[0])
    unit = common_mistake(pieces[1])
    if unit in constants.allowed_units:
        return number, unit
    raise ValueError(f"Invalid unit [{unit}] found in {s}. Allowed units: {constants.allowed_units}")

def create_placeholder_image(image_save_path):
    """Save a 100x100 solid black RGB image at ``image_save_path``.

    Prints a confirmation on success. Any exception raised along the way is
    caught and reported with ``print``; nothing is re-raised and nothing is
    returned.
    """
    try:
        Image.new('RGB', (100, 100), color='black').save(image_save_path)
        print(f"Placeholder image created at {image_save_path}")
    except Exception as err:
        print(f"Error creating placeholder image: {err}")

def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into ``save_folder``, logging every step.

    The target file name is the last path component of ``image_link``. Links
    that are not non-blank strings are skipped, and a file that already
    exists at the target path is left untouched.

    Each attempt downloads to a temporary ``.part`` file next to the target
    and moves it into place only after the transfer completed. A failed or
    interrupted transfer therefore never leaves a truncated file at the
    target path, where a later run would mistake it for a finished download.

    Args:
        image_link: URL of the image to fetch.
        save_folder: Directory the image is written to.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        None. Progress and failures are reported with ``print`` rather than
        raised. When every attempt fails, ``create_placeholder_image`` is
        called for the target path.
    """
    if not isinstance(image_link, str) or not image_link.strip():
        print(f"Skipping invalid image link: {image_link}")
        return

    try:
        filename = Path(image_link).name
        image_save_path = os.path.join(save_folder, filename)

        if os.path.exists(image_save_path):
            print(f"Image already exists: {image_save_path}")
            return

        print(f"Attempting to download image from {image_link}")

        # The process id keeps concurrent worker processes that were handed
        # the same link from writing to the same temporary file.
        temp_path = f"{image_save_path}.{os.getpid()}.part"

        for attempt in range(retries):
            try:
                urllib.request.urlretrieve(image_link, temp_path)
                # Publish the file only once it is complete.
                os.replace(temp_path, image_save_path)
                print(f"Downloaded {image_link} to {image_save_path}")
                return
            except Exception as e:
                print(f"Error on attempt {attempt+1} to download {image_link}: {e}")
                # Waiting is only useful when another attempt follows.
                if attempt != retries - 1:
                    time.sleep(delay)
            finally:
                # Runs on failure and on interruption (e.g. KeyboardInterrupt);
                # after a successful os.replace there is nothing left to remove.
                if os.path.exists(temp_path):
                    try:
                        os.remove(temp_path)
                    except OSError:
                        # Best-effort cleanup; a stray .part file is never
                        # treated as a downloaded image.
                        pass

        # Create a placeholder image if all download attempts fail
        create_placeholder_image(image_save_path)
    except Exception as e:
        print(f"Unhandled error with link {image_link}: {e}")

def download_images(image_links, download_folder, allow_multiprocessing=True):
    """Download every link in ``image_links`` into ``download_folder``.

    Args:
        image_links: Sized collection of image URLs; ``len()`` is used for
            the progress-bar total.
        download_folder: Destination directory; created if it is missing.
        allow_multiprocessing: When true, the links are handed to a
            ``multiprocessing.Pool``; otherwise they are processed one by one
            in the current process.

    Returns:
        None.
    """
    # exist_ok replaces the racy "check, then create" sequence.
    os.makedirs(download_folder, exist_ok=True)

    # One per-link callable shared by both execution modes, so the retry
    # settings cannot drift apart between them.
    download_one = partial(
        download_image, save_folder=download_folder, retries=3, delay=3)

    if allow_multiprocessing:
        # Worker count is capped at 60 regardless of how many CPUs are reported.
        num_workers = min(60, multiprocessing.cpu_count())

        with multiprocessing.Pool(num_workers) as pool:
            # list() drains the lazy imap iterator so every link is processed.
            list(tqdm(pool.imap(download_one, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_one(image_link)

# Step 1: Read cleaned data from CSV
cleaned_csv_path = r"D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\Weightt_cleaned.csv"

# Load the CSV file into a DataFrame, ensuring low_memory is set to False to handle mixed data types properly
df = pd.read_csv(cleaned_csv_path, low_memory=False)

# Step 2: Extract image links from the 'image_link' column
# Drop missing values BEFORE converting to str: astype(str) turns NaN into the
# literal string 'nan', which dropna() no longer recognises and which would
# then be passed on as if it were a link. Afterwards strip surrounding whitespace.
image_links = df['image_link'].dropna().astype(str).str.strip().tolist()

# Limit to the first 30 image links
image_links = image_links[:30]

# Step 3: Set the folder where images will be downloaded
download_folder = r"D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images"

# Step 4: Download images using the provided function
download_images(image_links, download_folder, allow_multiprocessing=False)


  0%|          | 0/30 [00:00<?, ?it/s]

Attempting to download image from https://m.media-amazon.com/images/I/61I9XdN6OFL.jpg


  7%|▋         | 2/30 [00:00<00:08,  3.12it/s]

Downloaded https://m.media-amazon.com/images/I/61I9XdN6OFL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\61I9XdN6OFL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/61BZ4zrjZXL.jpg
Downloaded https://m.media-amazon.com/images/I/61BZ4zrjZXL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\61BZ4zrjZXL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/612mrlqiI4L.jpg


 13%|█▎        | 4/30 [00:00<00:05,  5.16it/s]

Downloaded https://m.media-amazon.com/images/I/612mrlqiI4L.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\612mrlqiI4L.jpg
Attempting to download image from https://m.media-amazon.com/images/I/617Tl40LOXL.jpg
Downloaded https://m.media-amazon.com/images/I/617Tl40LOXL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\617Tl40LOXL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/61QsBSE7jgL.jpg


 17%|█▋        | 5/30 [00:01<00:04,  5.15it/s]

Downloaded https://m.media-amazon.com/images/I/61QsBSE7jgL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\61QsBSE7jgL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/81xsq6vf2qL.jpg


 20%|██        | 6/30 [00:01<00:05,  4.68it/s]

Downloaded https://m.media-amazon.com/images/I/81xsq6vf2qL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\81xsq6vf2qL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/71DiLRHeZdL.jpg


 23%|██▎       | 7/30 [00:01<00:05,  4.25it/s]

Downloaded https://m.media-amazon.com/images/I/71DiLRHeZdL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\71DiLRHeZdL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/91Cma3RzseL.jpg


 30%|███       | 9/30 [00:02<00:05,  3.99it/s]

Downloaded https://m.media-amazon.com/images/I/91Cma3RzseL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\91Cma3RzseL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/71jBLhmTNlL.jpg
Downloaded https://m.media-amazon.com/images/I/71jBLhmTNlL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\71jBLhmTNlL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/81N73b5khVL.jpg


 37%|███▋      | 11/30 [00:02<00:04,  4.66it/s]

Downloaded https://m.media-amazon.com/images/I/81N73b5khVL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\81N73b5khVL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/61oMj2iXOuL.jpg
Downloaded https://m.media-amazon.com/images/I/61oMj2iXOuL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\61oMj2iXOuL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/91LPf6OjV9L.jpg


 43%|████▎     | 13/30 [00:03<00:04,  4.00it/s]

Downloaded https://m.media-amazon.com/images/I/91LPf6OjV9L.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\91LPf6OjV9L.jpg
Attempting to download image from https://m.media-amazon.com/images/I/81fOxWWWKYL.jpg
Downloaded https://m.media-amazon.com/images/I/81fOxWWWKYL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\81fOxWWWKYL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/81dzao1Ob4L.jpg


 47%|████▋     | 14/30 [00:03<00:04,  3.95it/s]

Downloaded https://m.media-amazon.com/images/I/81dzao1Ob4L.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\81dzao1Ob4L.jpg
Attempting to download image from https://m.media-amazon.com/images/I/91-iahVGEDL.jpg


 50%|█████     | 15/30 [00:03<00:03,  4.11it/s]

Downloaded https://m.media-amazon.com/images/I/91-iahVGEDL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\91-iahVGEDL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/81S2+GnYpTL.jpg


 53%|█████▎    | 16/30 [00:03<00:03,  4.17it/s]

Downloaded https://m.media-amazon.com/images/I/81S2+GnYpTL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\81S2+GnYpTL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/81e2YtCOKvL.jpg


 57%|█████▋    | 17/30 [00:04<00:03,  4.04it/s]

Downloaded https://m.media-amazon.com/images/I/81e2YtCOKvL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\81e2YtCOKvL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/81RNsNEM1EL.jpg


 60%|██████    | 18/30 [00:04<00:02,  4.04it/s]

Downloaded https://m.media-amazon.com/images/I/81RNsNEM1EL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\81RNsNEM1EL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/91prZeizZnL.jpg


 67%|██████▋   | 20/30 [00:04<00:02,  4.24it/s]

Downloaded https://m.media-amazon.com/images/I/91prZeizZnL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\91prZeizZnL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/31EvJszFVfL.jpg
Downloaded https://m.media-amazon.com/images/I/31EvJszFVfL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\31EvJszFVfL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/61sQ+qAKr4L.jpg


 73%|███████▎  | 22/30 [00:05<00:01,  4.94it/s]

Downloaded https://m.media-amazon.com/images/I/61sQ+qAKr4L.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\61sQ+qAKr4L.jpg
Attempting to download image from https://m.media-amazon.com/images/I/81x77l2T5NL.jpg
Downloaded https://m.media-amazon.com/images/I/81x77l2T5NL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\81x77l2T5NL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/71nywfWZUwL.jpg


 80%|████████  | 24/30 [00:05<00:01,  4.83it/s]

Downloaded https://m.media-amazon.com/images/I/71nywfWZUwL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\71nywfWZUwL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/51WsuKKAVrL.jpg
Downloaded https://m.media-amazon.com/images/I/51WsuKKAVrL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\51WsuKKAVrL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/61XGDKap+JL.jpg


 87%|████████▋ | 26/30 [00:06<00:00,  5.69it/s]

Downloaded https://m.media-amazon.com/images/I/61XGDKap+JL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\61XGDKap+JL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/715vVcWJxGL.jpg
Downloaded https://m.media-amazon.com/images/I/715vVcWJxGL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\715vVcWJxGL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/613v+2W4UwL.jpg


 93%|█████████▎| 28/30 [00:06<00:00,  5.60it/s]

Downloaded https://m.media-amazon.com/images/I/613v+2W4UwL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\613v+2W4UwL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/71+fn9TWQmL.jpg
Downloaded https://m.media-amazon.com/images/I/71+fn9TWQmL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\71+fn9TWQmL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/71aKgRRQ2wL.jpg


100%|██████████| 30/30 [00:06<00:00,  4.40it/s]

Downloaded https://m.media-amazon.com/images/I/71aKgRRQ2wL.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\71aKgRRQ2wL.jpg
Attempting to download image from https://m.media-amazon.com/images/I/71rKXZJrh4L.jpg
Downloaded https://m.media-amazon.com/images/I/71rKXZJrh4L.jpg to D:\Hackathon\66e31d6ee96cd_student_resource_33\student_resource 3\dataset\images\71rKXZJrh4L.jpg



