WEB SCRAPING OF ALL SKY IMAGES

In [None]:
import os
import requests
# from google.colab import drive

# Mount Google Drive
# drive.mount('/content/drive')

# Download the daily all-sky image archives (<YYYYMMDD>.zip) for 2021-2023
# into one Google Drive folder per year.
import calendar  # stdlib; used to iterate only over dates that actually exist

# Define the base URL
base_url = "https://midcdmz.nrel.gov/tsi/SRRLASI/"

# Google Drive locations. The Drive has to be mounted before this cell runs,
# otherwise the files would silently land on the local Colab disk.
drive_mount_point = '/content/drive/MyDrive'
images_root = '/content/drive/MyDrive/All Sky Images/'

# Seconds to wait on the server before giving up on a single request
request_timeout = 60
# Archives are streamed to disk in chunks of this many bytes
download_chunk_size = 1024 * 1024

if not os.path.isdir(drive_mount_point):
    raise FileNotFoundError(
        f"{drive_mount_point} does not exist; mount Google Drive with "
        "drive.mount('/content/drive') before running this cell."
    )

# One session keeps the HTTP connection alive across the many requests
with requests.Session() as session:
    # Iterate over years (2021 to 2023)
    for year in range(2021, 2024):
        year_directory_drive = images_root + str(year)

        # Iterate over months (1 to 12)
        for month in range(1, 13):
            # Only visit days that exist in this month (no Feb 30, Apr 31, ...)
            days_in_month = calendar.monthrange(year, month)[1]

            for day in range(1, days_in_month + 1):
                # Format the date as YYYYMMDD
                date_str = f"{year}{month:02d}{day:02d}"

                # Construct the download URL and the matching target path
                download_url = f"{base_url}{year}/{date_str}.zip"
                file_name_drive = os.path.join(year_directory_drive, f"{date_str}.zip")

                # Make re-runs cheap: keep archives that are already present.
                # Empty files are treated as failed downloads and fetched again.
                # Delete a file by hand to force a fresh download.
                if os.path.exists(file_name_drive) and os.path.getsize(file_name_drive) > 0:
                    print(f"Already downloaded: {file_name_drive}")
                    continue

                # Data is written here first and renamed on success, so an
                # interrupted transfer never leaves a truncated .zip behind.
                partial_path = file_name_drive + ".part"

                try:
                    # Check if the URL exists and the file can be downloaded
                    response = session.head(download_url, timeout=request_timeout)
                    if response.status_code != 200:
                        # If the URL does not exist, move to the next date
                        print(download_url)
                        continue

                    # Create the year folder (and any missing parents) on demand
                    os.makedirs(year_directory_drive, exist_ok=True)

                    # Stream the archive to disk instead of buffering it in memory
                    with session.get(download_url, stream=True,
                                     timeout=request_timeout) as download:
                        # Never store an HTTP error page under a .zip name
                        download.raise_for_status()
                        with open(partial_path, 'wb') as file:
                            for chunk in download.iter_content(chunk_size=download_chunk_size):
                                file.write(chunk)

                    os.replace(partial_path, file_name_drive)
                except requests.RequestException as error:
                    # A network problem on one date must not abort the whole run
                    if os.path.exists(partial_path):
                        os.remove(partial_path)
                    print(f"Failed: {download_url} ({error})")
                    continue

                print(f"Downloaded: {file_name_drive}")


Extracting zip files and filtering the images

In [None]:
import os
import zipfile
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Year whose archives are processed. It drives the source folder, the output
# sub-folder and the final status message, so the three can no longer disagree.
extraction_year = '2023'

# Specify the source and destination folders
source_folder = f'/content/drive/MyDrive/All Sky Images/{extraction_year}'  # Source folder in Google Drive
destination_folder = '/content/drive/MyDrive/All Sky Images/Extracted Images'  # Desired destination folder

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# List of file extensions to keep
extensions_to_keep = ['.jpg']

# File names to keep (matched against the end of each archive member's name)
filenames_to_keep = ['11_NE.jpg', '12_UE.jpg']

# str.endswith needs a tuple when testing several suffixes; build them once
extension_suffixes = tuple(extensions_to_keep)
filename_suffixes = tuple(filenames_to_keep)

# Create the folder for the extracted files once, up front
extracted_folder = os.path.join(destination_folder, extraction_year)
os.makedirs(extracted_folder, exist_ok=True)
print(extracted_folder)

# Iterate through the ZIP files in the source folder (sorted for a stable order)
for filename in sorted(os.listdir(source_folder)):
    if not filename.endswith('.zip'):
        continue

    zip_filepath = os.path.join(source_folder, filename)
    print(zip_filepath)

    try:
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            for file_info in zip_ref.infolist():
                member_name = file_info.filename
                # Keep only members with a wanted extension AND a wanted name suffix
                if member_name.endswith(extension_suffixes) and \
                   member_name.endswith(filename_suffixes):
                    zip_ref.extract(file_info, extracted_folder)
    except zipfile.BadZipFile as error:
        # One corrupt or truncated archive should not abort the whole batch
        print(f"Skipping unreadable archive {zip_filepath}: {error}")

print(f"Extraction completed. Extracted files are in the '{extraction_year}' folder within the destination folder.")

Sample Images Display

In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt

# Show the first few extracted JPG images side by side as a quick visual check.

# Define the folder path where your JPG images are located
folder_path = '/content/drive/MyDrive/All Sky Images/Extracted Images/2021'

# List all JPG files in the folder; sorted because os.listdir returns entries
# in arbitrary order, which would make the displayed sample change between runs
jpg_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.jpg'))

print(len(jpg_files))
# Number of sample images to display
num_samples = 5
sample_files = jpg_files[:num_samples]

if not sample_files:
    print(f"No JPG images found in {folder_path}")
else:
    # One panel per image actually shown; squeeze=False keeps `axes` 2-D even
    # when there is a single image, so the loop below works for any count
    fig, axes = plt.subplots(
        1, len(sample_files),
        figsize=(4 * len(sample_files), 4),
        squeeze=False,
    )

    # Display sample images
    for ax, jpg_file in zip(axes[0], sample_files):
        img_path = os.path.join(folder_path, jpg_file)
        # The context manager closes the underlying file once it has been drawn
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.axis('off')
        ax.set_title(jpg_file, fontsize=8)

    fig.tight_layout()
    plt.show()