In [None]:
import os
import sys

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from mpl_toolkits.basemap import Basemap

%matplotlib inline

sys.path.append("..")
from pangaea_downloader import checker, utilz

## Load downloaded files

In [None]:
# Load files from downloads directory
TEST_DIR = "../query-outputs/"
# Keep only CSV files and sort them by name: os.listdir() returns entries in
# arbitrary order and may include entries pd.read_csv cannot parse (hidden
# files, sub-directories). A fixed order keeps positional lookups on the
# per-file tables reproducible between runs.
files = sorted(f for f in os.listdir(TEST_DIR) if f.lower().endswith(".csv"))
# One dataframe per file, in the same order as `files`
df_list = [pd.read_csv(os.path.join(TEST_DIR, f)) for f in files]

# List of valid image file extensions
valid_img_extensions = (".jpg", ".jpeg", ".png", ".tif", ".tiff")

## 1. Count images in each file
- Count values in URL column
- Number of valid URLs
- Number of invalid URLs
- URLs with image file extensions

In [None]:
# Tally URL statistics for every dataset, using only its first URL column
count_columns = [
    "file",
    "column",
    "n_rows",
    "count",
    "valid_url",
    "invalid_url",
    "valid_img_ext",
    "missing",
]
img_counts = []
for file, df in zip(files, df_list):
    url_cols = utilz.get_url_cols(df)
    if len(url_cols) == 0:
        # Indexing [0] on an empty result would raise IndexError and abort
        # the whole tally, so datasets without a URL column are skipped.
        print(f"[{file}] No URL column found; skipping.")
        continue
    # Count only the first url column
    col = url_cols[0]
    urls = df[col]
    # Count stuff
    n_rows = len(df)
    count = urls.count()  # non-null entries
    # presumably checker.is_url / checker.is_img_url return booleans, so the
    # sums below are counts of matching entries -- TODO confirm
    valid_url = urls.apply(checker.is_url).sum()
    invalid_url = count - valid_url
    valid_img_ext = urls.apply(checker.is_img_url).sum()
    missing = urls.isna().sum()
    # Keep record of counts
    img_counts.append(
        {
            "file": file,
            "column": col,
            "n_rows": n_rows,
            "count": count,
            "valid_url": valid_url,
            "invalid_url": invalid_url,
            "valid_img_ext": valid_img_ext,
            "missing": missing,
        }
    )
# Make a dataframe; explicit columns keep the summary below working even when
# every dataset was skipped and the record list is empty
img_counts = pd.DataFrame(img_counts, columns=count_columns)

# Show results
print(f"Raw image count in all files: {img_counts['count'].sum()}")
print(f"Total number of valid urls: {img_counts['valid_url'].sum()}")
print(f"Total number of valid image urls: {img_counts['valid_img_ext'].sum()}")
img_counts

## 2. Campaigns
- How many campaigns (Why 33 campaigns for 290+ files?)
- Distribution of images across campaigns
- How many sites
- Distribution of images across sites

### 2.1 Check if each file has only one campaign

In [None]:
# Gather the campaign labels of every dataset that mixes several campaigns.
# The expectation is that no dataset does, i.e. the list stays empty.
a = []
for df in df_list:
    campaign_col = df["Campaign"]
    if campaign_col.nunique() > 1:
        a.append(campaign_col.unique())
if len(a) == 0:
    print("Each files has one associated campaign.")

### 2.2 Number of datasets per campaign
Many of the datasets are from the same campaign.

In [None]:
# Per-dataset summary: source file, campaign label and missing-value totals.
# The NaN total of each dataframe is computed once and reused for the ratio.
nan_totals = [df.isna().sum().sum() for df in df_list]
nan_ratios = [round(total / df.size, 4) for total, df in zip(nan_totals, df_list)]
# First value of the Campaign column (the first unique value is always the
# first value, since unique() keeps order of appearance)
first_campaigns = [df["Campaign"].iloc[0] for df in df_list]

datasets = pd.DataFrame(
    {
        "file": files,
        "campaign": first_campaigns,
        "total_nans": nan_totals,
        # Stored as a fraction of all cells (rows x columns), rounded to 4 places
        "nan_percent": nan_ratios,
    }
)
# Preview a slice of the summary table
datasets.loc[35:45]

In [None]:
# Headline numbers: how many datasets, and how many distinct campaigns
campaign_labels = datasets["campaign"]
print(f"Total datasets: {campaign_labels.count()}")
print(f"Total number of campaigns in all files: {campaign_labels.nunique()}")

# One horizontal bar per campaign, bar length = number of datasets
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Number of datasets from each campaign")
sns.countplot(y="campaign", data=datasets, edgecolor="black", linewidth=1, ax=ax)
plt.show()

### 2.3 Number of images per campaign

In [None]:
# Start every known campaign at zero images
camps = dict.fromkeys(datasets["campaign"].unique(), 0)
for df in df_list:
    campaign = df["Campaign"].unique()[0]
    img_cols = utilz.get_url_cols(df)
    if len(img_cols) == 0:
        # Dataset has no URL column: it contributes no images
        continue
    # Only the first URL column is counted; count() ignores missing entries
    img_col = img_cols[0]
    camps[campaign] = camps[campaign] + df[img_col].count()
# Campaign -> image count, as a Series for plotting
camps = pd.Series(camps)

In [None]:
# Horizontal bars: one per campaign, bar length = number of images
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Number of images per campaign")
sns.barplot(x=camps.values, y=camps.index, edgecolor="black", linewidth=1, ax=ax)
plt.show()

## 3. Analyze missing values
- Raw total missing values
- How many missing values in mandatory columns

### 3.1 How many datasets have missing values

In [None]:
# Two side-by-side histograms over datasets: absolute NaN totals (left) and
# the NaN share of each dataset (right)
fig, ax = plt.subplots(ncols=2, figsize=(20, 5))
fig.suptitle("Missing values across datasets")
panels = [
    ("total_nans", "Number of missing values"),
    ("nan_percent", "Percentage of missing values"),
]
for axis, (column, xlabel) in zip(ax, panels):
    axis.hist(datasets[column], bins=20, edgecolor="black", linewidth=1)
    axis.set_xlabel(xlabel)
    axis.set_ylabel("Count")
    # Optional: axis.set_yscale("log") makes the sparse upper bins easier to see
plt.show()

As we can see, most datasets have close to 0 missing values. There are a few datasets with 5,000 or close to 40,000 missing values. The percentage plot shows a similar picture: most datasets have below 10% missing values, while a few have 20-25% missing. Percentages are calculated by dividing the total number of missing values in a dataset by the size (rows × cols) of the dataset.

### 3.2 Detailed breakdown of missing values
Let us now examine each dataset and check which columns have how many missing values.

In [None]:
# For every dataset that has missing values, report the total and list the
# affected columns with their NaN counts
for i, (df, file) in enumerate(zip(df_list, files)):
    missing_by_col = df.isna().sum()
    n_missing = missing_by_col.sum()
    if n_missing <= 0:
        # Complete dataset: nothing to report
        continue
    print(f"[{i}][{file}] Total {n_missing} null values in dataframe.")
    display(missing_by_col.loc[missing_by_col > 0])

## 4. Spatial distribution of samples

In [None]:
# Join all data into a single frame.
# ignore_index=True gives the combined frame a fresh, unique 0..N-1 index.
# Without it the row labels of the individual files are carried over and
# repeat, which breaks label-based lookup and alignment on the combined frame.
all_dfs = pd.concat(df_list, ignore_index=True)

### 4.1 Explore Latitude, Longitude metadata

In [None]:
# Extent of the coordinates over all datasets (min/max skip missing values)
latitudes = all_dfs["Latitude"]
longitudes = all_dfs["Longitude"]
print(f" Latitude range: {latitudes.min()} to {latitudes.max()}")
print(f"Longitude range: {longitudes.min()} to {longitudes.max()}")

In [None]:
# Number of non-missing entries in each coordinate column
all_dfs.loc[:, ["Latitude", "Longitude"]].count()

In [None]:
# Number of rows lacking a value in each coordinate column
all_dfs.loc[:, ["Latitude", "Longitude"]].isna().sum()

### 4.2 Plot

In [None]:
# Joint KDE of the sample coordinates, with marginal densities on each axis
sns.jointplot(x="Longitude", y="Latitude", data=all_dfs, kind="kde")

In [None]:
def make_basemap(df: pd.DataFrame, full_map=True):
    """Build a Basemap covering either the whole globe or the extent of the data.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``Latitude`` and ``Longitude`` columns. It is only read
        when ``full_map`` is false.
    full_map : bool, default True
        If true, the map corners are fixed at -90..90 latitude and
        -180..180 longitude. Otherwise the corners are the min/max of the
        ``Latitude`` and ``Longitude`` columns of ``df``.

    Returns
    -------
    Basemap
        Map created with ``projection="mill"`` and ``resolution="c"``.
    """
    if full_map:
        # Whole globe
        south, north = -90, 90
        west, east = -180, 180
    else:
        # Crop to the bounding box of the data
        south, north = df.Latitude.min(), df.Latitude.max()
        west, east = df.Longitude.min(), df.Longitude.max()

    return Basemap(
        projection="mill",
        resolution="c",
        llcrnrlat=south,
        urcrnrlat=north,
        llcrnrlon=west,
        urcrnrlon=east,
    )

In [None]:
# Coordinates to plot, in row order of the combined frame
lon_x = all_dfs["Longitude"].tolist()
lat_y = all_dfs["Latitude"].tolist()

# Prepare a whole-globe map with a grey-land / white-ocean background
fig = plt.figure(figsize=(12, 9))
m = make_basemap(all_dfs, full_map=True)
m.drawlsmask(land_color="grey", ocean_color="white", lakes=True)
# Other backgrounds that can be used instead of the land-sea mask:
# m.drawcoastlines(), m.etopo(), m.bluemarble(), m.shadedrelief()

# Graticule: parallels every 10 degrees (labelled on the left), meridians
# every 30 degrees (labelled at the bottom)
parallels = np.arange(-90, 90, 10)
meridians = np.arange(-180, 180, 30)
m.drawparallels(parallels, labels=[1, 0, 0, 0])
m.drawmeridians(meridians, labels=[0, 0, 0, 1])

# Plot data: latlon=True tells Basemap the inputs are degrees, not map coordinates
m.scatter(lon_x, lat_y, latlon=True, marker="o", c="red", s=20, alpha=0.25)

plt.title("Spatial distribution of Pangaea dataset images", fontsize=20)
plt.show()

## 5. Plot Sample images

### 5.1 Take a sample of image urls

In [None]:
# Sample one image URL from each file/dataset's first url column
sample_imgs = []
for df in df_list:
    url_cols = utilz.get_url_cols(df)
    if len(url_cols) == 0:
        # No URL column: indexing [0] would raise IndexError
        continue
    # Drop missing entries before sampling: a NaN can never pass the checks
    # below, and Series.sample() raises on an empty selection
    urls = df[url_cols[0]].dropna()
    if urls.empty:
        continue
    sample = urls.sample().iloc[0]
    # Check if it is string and is valid url with an image file extension
    if (
        isinstance(sample, str)
        and checker.is_url(sample)
        and sample.lower().endswith(valid_img_extensions)
    ):
        sample_imgs.append(sample)

# Keep a subset of at most 12 samples. Drawing without replacement avoids
# showing the same image twice; capping the size at the number of candidates
# keeps np.random.choice from raising when fewer than 12 (or none) were found.
n_keep = min(12, len(sample_imgs))
if n_keep > 0:
    sample_imgs = np.random.choice(sample_imgs, size=n_keep, replace=False)

### 5.2 Retrieve sampled images

In [None]:
# TODO: Add function to utilz.py
def img_from_url(
    url: str, verbose=False, max_retries: int = 3, timeout: float = 30.0
) -> np.ndarray:
    """Download the image at ``url`` and return it as an RGB array.

    Parameters
    ----------
    url : str
        Address of the image to retrieve.
    verbose : bool, default False
        If true, print the HTTP status code of every attempt.
    max_retries : int, default 3
        Maximum number of requests to make before giving up. Without this
        bound a URL that never answers 200 would be requested forever.
    timeout : float, default 30.0
        Seconds passed to ``requests.get`` as its ``timeout`` so that a
        stalled server cannot block the call indefinitely.

    Returns
    -------
    np.ndarray
        Decoded image, converted from OpenCV's BGR channel order to RGB.

    Raises
    ------
    ValueError
        If ``max_retries`` is smaller than 1, or the downloaded payload
        cannot be decoded as an image.
    RuntimeError
        If no attempt returned HTTP status 200.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for _attempt in range(max_retries):
        # No stream=True: the body is read in full below anyway, and a
        # streamed response that is never read keeps its connection open.
        resp = requests.get(url, timeout=timeout)
        if verbose:
            print(f"status code: {resp.status_code}")
        if resp.status_code == 200:
            break
    else:
        # Loop finished without a single successful response
        raise RuntimeError(
            f"Could not retrieve {url} after {max_retries} attempt(s); "
            f"last status code: {resp.status_code}"
        )

    arr = np.asarray(bytearray(resp.content), dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imdecode signals an undecodable buffer by returning None
        raise ValueError(f"Content retrieved from {url} is not a decodable image")
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

### 5.3 Plot sampled images

In [None]:
ncols = 4
# Round up so a final, partially filled row is still drawn, and keep at least
# one row so plt.subplots() never receives nrows=0 when nothing was sampled.
nrows = max(1, -(-len(sample_imgs) // ncols))
# squeeze=False always yields a 2-D array of axes, whatever the grid shape
fig, axes = plt.subplots(
    nrows=nrows, ncols=ncols, figsize=(16, nrows * 4), squeeze=False
)
for idx, ax in enumerate(axes.flat):
    if idx >= len(sample_imgs):
        # Grid cell without an image: hide the empty axes
        ax.axis("off")
        continue
    url = sample_imgs[idx]
    # Retrieve image from url
    print(f"Retrieving: {url} ...")
    img = img_from_url(url, verbose=True)
    # Plot image
    ax.imshow(img)
    ax.set_title(f"Shape: {img.shape}")
fig.tight_layout()
fig.set_facecolor("w")
plt.show()