# Crawl data from Google Images

## Make the dataset connected into Azure

In [None]:
# Install required packages
# %pip install beautifulsoup4
# %pip install openpyxl
# !pip install selenium pillow
# !pip install webdriver_manager

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/Github/Dermatology_LLM

/content/drive/MyDrive/Github/Dermatology_LLM


In [None]:
# %restart_python

In [5]:
# Import necessary packages
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [6]:
# Load Excel data
ethnic_df = pd.read_excel("Datasets/Ethnic_Origin_Facial_Issues.xlsx")
issue_df = pd.read_excel("Datasets/Frequent_Facial_Issues.xlsx")

In [7]:
# Add a constant key and perform a cross join
ethnic_df['key'] = 1
issue_df['key'] = 1
cross_joined_df = pd.merge(ethnic_df, issue_df, on='key').drop('key', axis=1)

In [8]:
# Check the first few rows
cross_joined_df.head()

Unnamed: 0,Ethnic Origin,Common Facial Features,Potential Issues and Solutions,Issue,Reason,Solution
0,Caucasian,"Lighter skin, more prone to sunburn, fine line...","Acne (due to genetics, skin type), Hyperpigmen...",Acne and Breakouts,"Hormonal fluctuations, clogged pores, stress, ...","Gentle cleansing, salicylic acid, benzoyl pero..."
1,Caucasian,"Lighter skin, more prone to sunburn, fine line...","Acne (due to genetics, skin type), Hyperpigmen...",Hyperpigmentation,"Sun exposure, acne scars, hormonal changes (me...","Sunscreen, vitamin C, niacinamide, chemical pe..."
2,Caucasian,"Lighter skin, more prone to sunburn, fine line...","Acne (due to genetics, skin type), Hyperpigmen...",Dryness and Dehydration,"Weather changes, over-exfoliation, harsh clean...","Hydrating ingredients (hyaluronic acid, cerami..."
3,Caucasian,"Lighter skin, more prone to sunburn, fine line...","Acne (due to genetics, skin type), Hyperpigmen...",Oily Skin and Shine,Overactive sebaceous glands or genetic predisp...,"Oil-control products, clay masks, niacinamide,..."
4,Caucasian,"Lighter skin, more prone to sunburn, fine line...","Acne (due to genetics, skin type), Hyperpigmen...",Fine Lines and Wrinkles,"Aging, loss of collagen, sun exposure","Retinoids, peptides, sunscreen, and anti-aging..."


In [9]:
# Check the shape of the DataFrame
cross_joined_df.shape

(70, 6)

In [10]:
# Extract column names
col_name = list(cross_joined_df.columns)
col_name

['Ethnic Origin',
 'Common Facial Features',
 'Potential Issues and Solutions',
 'Issue',
 'Reason',
 'Solution']

In [11]:
# Create a new DataFrame with selected columns
df = cross_joined_df[['Ethnic Origin', 'Issue']]
df.head()

Unnamed: 0,Ethnic Origin,Issue
0,Caucasian,Acne and Breakouts
1,Caucasian,Hyperpigmentation
2,Caucasian,Dryness and Dehydration
3,Caucasian,Oily Skin and Shine
4,Caucasian,Fine Lines and Wrinkles


In [12]:
# Add a new column for search keywords
df['key_word'] = df['Ethnic Origin'] + "'s " + df['Issue']
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['key_word'] = df['Ethnic Origin'] + "'s " + df['Issue']


Unnamed: 0,Ethnic Origin,Issue,key_word
0,Caucasian,Acne and Breakouts,Caucasian's Acne and Breakouts
1,Caucasian,Hyperpigmentation,Caucasian's Hyperpigmentation
2,Caucasian,Dryness and Dehydration,Caucasian's Dryness and Dehydration
3,Caucasian,Oily Skin and Shine,Caucasian's Oily Skin and Shine
4,Caucasian,Fine Lines and Wrinkles,Caucasian's Fine Lines and Wrinkles


In [13]:
# Iterate over rows to check keywords
for index, row in df.iterrows():
    print(row['key_word'])

Caucasian's Acne and Breakouts
Caucasian's Hyperpigmentation
Caucasian's Dryness and Dehydration
Caucasian's Oily Skin and Shine
Caucasian's Fine Lines and Wrinkles
Caucasian's Dark Circles
Caucasian's Redness and Sensitivity
Caucasian's Blackheads and Pores
Caucasian's Uneven Skin Tone
Caucasian's Sun Damage
East Asian's Acne and Breakouts
East Asian's Hyperpigmentation
East Asian's Dryness and Dehydration
East Asian's Oily Skin and Shine
East Asian's Fine Lines and Wrinkles
East Asian's Dark Circles
East Asian's Redness and Sensitivity
East Asian's Blackheads and Pores
East Asian's Uneven Skin Tone
East Asian's Sun Damage
South Asian's Acne and Breakouts
South Asian's Hyperpigmentation
South Asian's Dryness and Dehydration
South Asian's Oily Skin and Shine
South Asian's Fine Lines and Wrinkles
South Asian's Dark Circles
South Asian's Redness and Sensitivity
South Asian's Blackheads and Pores
South Asian's Uneven Skin Tone
South Asian's Sun Damage
African's Acne and Breakouts
African'

In [None]:
# Set up directory for downloading images
import os
os.mkdir("/content/drive/MyDrive/Github/Dermatology_LLM/final_image")
os.chdir("/content/drive/MyDrive/Github/Dermatology_LLM/final_image")
print(os.getcwd())

# Crawl images from Google
for index, row in df.iterrows():
    search_query = row['key_word']
    url = f"https://www.google.com/search?hl=en&tbm=isch&q={search_query}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    images = soup.find_all('img')
    image_urls = [img['src'] for img in images if img['src'].startswith('http')]

    # Create a folder for each issue
    if not os.path.exists(row['Issue']):
        os.makedirs(search_query)

    # Download first 10 images
    count = 0
    for img_url in image_urls:
        if count >= 10:
            break
        try:
            img_data = requests.get(img_url).content
            img_size = len(img_data)
            if img_size > 50000:
                with open(f"{search_query}/image_{count+1}.jpg", 'wb') as f:
                    f.write(img_data)
                print(f"Downloaded {search_query}/image_{count+1}.jpg")
                count += 1
        except Exception as e:
            print(f"Failed to download image: {e}")

In [21]:
download_dir = "/content/drive/MyDrive/Github/Dermatology_LLM/final_image"
os.makedirs(download_dir, exist_ok=True)  # Ensure the directory exists
os.chdir(download_dir)
print(os.getcwd())


# Crawl images from Google
for index, row in df.iterrows():
    search_query = row['key_word']
    name = row['Issue']

    url = f"https://www.google.com/search?hl=en&tbm=isch&q={search_query}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    images = soup.find_all('img')
    image_urls = [img['src'] for img in images if img['src'].startswith('http')]

    # Download first 10 images with unique naming
    count = 0
    for img_url in image_urls:
        if count >= 10:
            break
        try:
            img_data = requests.get(img_url).content
            img_size = len(img_data)
            if img_size > 5000:  # Only download if image size is sufficient
                # Use search query and index for unique image naming
                img_filename = f"{download_dir}/{name.replace(' ', '_')}_{count+1}.jpg"
                with open(img_filename, 'wb') as f:
                    f.write(img_data)
                print(f"Downloaded {img_filename}")
                count += 1
        except Exception as e:
            print(f"Failed to download image: {e}")

/content/drive/MyDrive/Github/Dermatology_LLM/final_image
Downloaded /content/drive/MyDrive/Github/Dermatology_LLM/final_image/Acne_and_Breakouts_1.jpg
Downloaded /content/drive/MyDrive/Github/Dermatology_LLM/final_image/Redness_and_Sensitivity_1.jpg
Downloaded /content/drive/MyDrive/Github/Dermatology_LLM/final_image/Oily_Skin_and_Shine_1.jpg
Downloaded /content/drive/MyDrive/Github/Dermatology_LLM/final_image/Oily_Skin_and_Shine_2.jpg
Downloaded /content/drive/MyDrive/Github/Dermatology_LLM/final_image/Fine_Lines_and_Wrinkles_1.jpg
Downloaded /content/drive/MyDrive/Github/Dermatology_LLM/final_image/Uneven_Skin_Tone_1.jpg
Downloaded /content/drive/MyDrive/Github/Dermatology_LLM/final_image/Sun_Damage_1.jpg
Failed to download image: [Errno 2] No such file or directory: '/content/drive/MyDrive/Github/Dermatology_LLM/final_image/Dryness_and_Dehydration_1.jpg'
Failed to download image: [Errno 2] No such file or directory: '/content/drive/MyDrive/Github/Dermatology_LLM/final_image/Oily_Sk

In [17]:
search_query

"Indigenous/Native American's Sun Damage"

In [20]:
for index, row in df.iterrows():
  print(row)
  print("!!!!!!!!!!!!!!!!!!!!!!!")
  print(row["key_word"])
  break

Ethnic Origin                         Caucasian
Issue                        Acne and Breakouts
key_word         Caucasian's Acne and Breakouts
Name: 0, dtype: object
!!!!!!!!!!!!!!!!!!!!!!!
Caucasian's Acne and Breakouts


In [22]:
name

'Sun Damage'

In [23]:
import os
import requests
from bs4 import BeautifulSoup

download_dir = "/content/drive/MyDrive/Github/Dermatology_LLM/final_image"
os.makedirs(download_dir, exist_ok=True)  # Ensure the directory exists
os.chdir(download_dir)
print(os.getcwd())


# Crawl images from Google with improved structure
for index, row in df.iterrows():
    search_query = row['key_word']
    name = row['Issue']

    # Create Google Image search URL
    url = f"https://www.google.com/search?hl=en&tbm=isch&q={search_query}"

    try:
        # Perform request to Google image search
        response = requests.get(url)
        response.raise_for_status()  # Check if request succeeded
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract image URLs
        images = soup.find_all('img')
        image_urls = [img['src'] for img in images if img.get('src') and img['src'].startswith('http')]

        # Download first 10 images
        count = 0
        for img_url in image_urls:
            if count >= 10:
                break
            try:
                img_data = requests.get(img_url).content
                if len(img_data) > 5000:  # Only download if image is sufficiently large
                    # Create a unique file name
                    img_filename = f"{name.replace(' ', '_')}_{index}_{count+1}.jpg"
                    with open(img_filename, 'wb') as f:
                        f.write(img_data)
                    print(f"Downloaded {img_filename}")
                    count += 1
            except Exception as e:
                print(f"Failed to download image from {img_url}: {e}")
    except Exception as e:
        print(f"Failed to process search query '{search_query}': {e}")


/content/drive/MyDrive/Github/Dermatology_LLM/final_image
Downloaded Acne_and_Breakouts_0_1.jpg
Downloaded Redness_and_Sensitivity_6_1.jpg
Downloaded Oily_Skin_and_Shine_13_1.jpg
Downloaded Oily_Skin_and_Shine_13_2.jpg
Downloaded Fine_Lines_and_Wrinkles_14_1.jpg
Downloaded Uneven_Skin_Tone_18_1.jpg
Downloaded Sun_Damage_19_1.jpg
Downloaded Dryness_and_Dehydration_22_1.jpg
Downloaded Oily_Skin_and_Shine_23_1.jpg
Downloaded Redness_and_Sensitivity_26_1.jpg
Downloaded Uneven_Skin_Tone_28_1.jpg
Downloaded Uneven_Skin_Tone_28_2.jpg
Downloaded Dryness_and_Dehydration_32_1.jpg
Downloaded Oily_Skin_and_Shine_33_1.jpg
Downloaded Oily_Skin_and_Shine_33_2.jpg
Downloaded Blackheads_and_Pores_37_1.jpg
Downloaded Uneven_Skin_Tone_38_1.jpg
Downloaded Uneven_Skin_Tone_38_2.jpg
Downloaded Sun_Damage_39_1.jpg
Downloaded Sun_Damage_39_2.jpg
Downloaded Oily_Skin_and_Shine_43_1.jpg
Downloaded Oily_Skin_and_Shine_43_2.jpg
Downloaded Fine_Lines_and_Wrinkles_44_1.jpg
Downloaded Dark_Circles_45_1.jpg
Downloade