In [None]:
import os
import sys

import cartopy.crs as ccrs
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.neighbors import KernelDensity

sys.path.append("..")
from pangaea_downloader import checker, eda, utilz

## Load downloaded files

In [None]:
# Load files from downloads directory
TEST_DIR = "../query-outputs/"
# os.listdir returns entries in arbitrary order, so sort them: several later
# cells rely on the position of a dataset (e.g. positional slices of the
# summary tables) and must see the same order on every run.
# Only regular files are kept; directories such as ".ipynb_checkpoints" would
# make pd.read_csv fail.
files = sorted(
    f for f in os.listdir(TEST_DIR) if os.path.isfile(os.path.join(TEST_DIR, f))
)
# One DataFrame per file, in the same order as `files`
df_list = [pd.read_csv(os.path.join(TEST_DIR, f)) for f in files]
print(f"Total {len(df_list)} datasets loaded.")

## 1. Count images in each file
- Count values in URL column
- Number of valid URLs
- Number of invalid URLs
- URLs with image file extensions

In [None]:
# Column order of the per-dataset summary table (also used when there are no
# datasets at all, so the totals below still work on an empty table)
COUNT_COLUMNS = [
    "file",
    "column",
    "n_rows",
    "count",
    "valid_url",
    "invalid_url",
    "valid_img_ext",
    "missing",
]

# One summary record per dataset; turned into a DataFrame after the loop
count_records = []
for file, df in zip(files, df_list):
    n_rows = len(df)
    url_cols = utilz.get_url_cols(df)
    if len(url_cols) == 0:
        # Dataset without any URL column: keep it in the table with zero
        # counts instead of failing on url_cols[0]. Every row counts as
        # missing so that count + missing == n_rows still holds.
        count_records.append(
            {
                "file": file,
                "column": None,
                "n_rows": n_rows,
                "count": 0,
                "valid_url": 0,
                "invalid_url": 0,
                "valid_img_ext": 0,
                "missing": n_rows,
            }
        )
        continue
    # Count only the first url column
    col = url_cols[0]
    urls = df[col]
    # Count stuff
    count = urls.count()  # non-null entries
    valid_url = urls.apply(checker.is_url).sum()
    valid_img_ext = urls.apply(checker.is_img_url).sum()
    # Keep record of counts
    count_records.append(
        {
            "file": file,
            "column": col,
            "n_rows": n_rows,
            "count": count,
            "valid_url": valid_url,
            "invalid_url": count - valid_url,
            "valid_img_ext": valid_img_ext,
            "missing": urls.isna().sum(),
        }
    )
# Make a dataframe
img_counts = pd.DataFrame(count_records, columns=COUNT_COLUMNS)

# Show results
print(f"Raw image count in all files: {img_counts['count'].sum()}")
print(f"Total number of valid urls: {img_counts['valid_url'].sum()}")
print(f"Total number of valid image urls: {img_counts['valid_img_ext'].sum()}")
img_counts

## 2. Campaigns
- How many campaigns (Why 33 campaigns for 290+ files?)
- Distribution of images across campaigns

### 2.1 Check if each file has only one campaign

In [None]:
# Datasets with more than one campaign (should be empty).
# `a` holds, for each offending dataset, the array of its distinct campaigns.
a = [df["Campaign"].unique() for df in df_list if df["Campaign"].nunique() > 1]
if len(a) == 0:
    print("Each file has one associated campaign.")
else:
    # Report the violation explicitly: the following cells assume a single
    # campaign per dataset and only look at the first one.
    print(f"{len(a)} dataset(s) have more than one campaign:")
    for campaigns in a:
        print(list(campaigns))

### 2.2 Number of datasets per campaign
Many of the datasets are from the same campaign

In [None]:
# Per-dataset summary: first campaign label and amount of missing data
first_campaigns = [df["Campaign"].unique()[0] for df in df_list]
# Total NaN cells per dataset (summed over all columns)
nan_totals = [df.isna().sum().sum() for df in df_list]
# Fraction of NaN cells relative to the number of cells (rows x columns)
nan_fractions = [
    round(n_nans / df.size, 4) for n_nans, df in zip(nan_totals, df_list)
]

datasets = pd.DataFrame(
    {
        "file": files,
        "campaign": first_campaigns,
        "total_nans": nan_totals,
        "nan_percent": nan_fractions,
    }
)
# Preview a slice of the table
datasets.loc[35:45]

In [None]:
# Headline numbers
n_datasets = datasets["campaign"].count()
n_campaigns = datasets["campaign"].nunique()
print(f"Total datasets: {n_datasets}")
print(f"Total number of campaigns in all files: {n_campaigns}")

# One horizontal bar per campaign, bar length = number of datasets
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Number of datasets per campaign")
sns.countplot(data=datasets, y="campaign", edgecolor="black", linewidth=1, ax=ax)
ax.grid()
plt.show()

### 2.3 Number of images per campaign

In [None]:
# Running total of non-null image URLs, keyed by campaign
campaign_totals = dict.fromkeys(datasets["campaign"].unique(), 0)
for df in df_list:
    # Each dataset is attributed to its first campaign label
    campaign = df["Campaign"].unique()[0]
    img_cols = utilz.get_url_cols(df)
    if len(img_cols) == 0:
        # Dataset has no URL column: contributes nothing
        continue
    # Only the first URL column is counted
    campaign_totals[campaign] += df[img_cols[0]].count()
camps = pd.Series(campaign_totals)

In [None]:
# Horizontal bars: one per campaign, length = number of images
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Number of images per campaign")
sns.barplot(y=camps.index, x=camps.values, edgecolor="black", linewidth=1, ax=ax)
ax.grid()
plt.show()

## 3. Site/deployment/event
### 3.1 Number of sites per dataset

In [None]:
# Number of distinct (non-null) sites found in each dataset
site_counts = [df["Site"].nunique() for df in df_list]
num_site_per_file = pd.DataFrame({"file": files, "n_sites": site_counts})

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(16, 5))
# The same histogram is drawn twice: raw counts on the left, log-scaled
# frequency axis on the right. Each entry is (axes, title, y-scale or None).
panels = [
    (ax[0], "Number of sites in downloaded datasets\n", None),
    (ax[1], "Number of sites in downloaded datasets\n(log scale)", "log"),
]
for panel, title, yscale in panels:
    panel.set_title(title)
    panel.hist(num_site_per_file.n_sites, bins=30, edgecolor="black", linewidth=1)
    panel.set_xlabel("Number of sites")
    panel.set_ylabel("Frequency")
    if yscale is not None:
        panel.set_yscale(yscale)
    panel.grid()
plt.show()

In [None]:
# Split the datasets by how many distinct sites they contain
single_site = num_site_per_file[num_site_per_file.n_sites == 1]
multi_site = num_site_per_file[num_site_per_file.n_sites > 1]

# The value reported here is a count of datasets, not a count of unique sites,
# so the label says exactly that.
print("Number of datasets/files with exactly 1 site:", len(single_site))
print("\nDatasets/files with more than 1 site:")
multi_site

- Most datasets have images from 1 site
- A few datasets have images from several sites (shown in table above)

### 3.2 Number of datasets per site/event/deployment

In [None]:
# Long-format table with one row per (dataset file, distinct site) pair
site_datasets = pd.DataFrame(
    [
        {"file": file, "site": site}
        for file, df in zip(files, df_list)
        for site in df["Site"].unique()
    ]
)

In [None]:
# Headline numbers
n_unique_sites = site_datasets["site"].nunique()
print(f"Total datasets: {len(df_list)}")
print(f"Total number of sites in all files: {n_unique_sites}")

# Tall figure: one horizontal bar per site
fig, ax = plt.subplots(figsize=(5, 40))
ax.set_title("Number of datasets from each site")
sns.countplot(data=site_datasets, y="site", edgecolor="black", linewidth=1, ax=ax)
fig.tight_layout()
ax.grid()
plt.show()

### 3.3 Number of images per site/event/deployment

In [None]:
# Running total of non-null image URLs, keyed by site. Keys are seeded from
# every site seen in any dataset so sites without images still appear with 0.
site_totals = {site: 0 for site in site_datasets.site.unique()}
for df in df_list:
    url_cols = utilz.get_url_cols(df)
    if len(url_cols) == 0:
        # Dataset has no URL column: contributes nothing
        continue
    # Count only the first url column. A single groupby replaces one boolean
    # mask per site; rows whose Site is missing are excluded by groupby, which
    # also avoids looking up a NaN key in the dict.
    per_site = df.groupby("Site", sort=False)[url_cols[0]].count()
    for site, n_imgs in per_site.items():
        site_totals[site] = site_totals.get(site, 0) + n_imgs
site_img_counts = pd.Series(site_totals)

In [None]:
# Tall layout: one horizontal bar per site
fig, ax = plt.subplots(figsize=(8, 45))
ax.set_title("Number of images per site/deployment/event")
sns.barplot(
    y=site_img_counts.index,
    x=site_img_counts.values,
    edgecolor="black",
    linewidth=1,
    ax=ax,
)
ax.grid()
plt.show()

In [None]:
# Wide layout of the same data: one vertical bar per site
fig, ax = plt.subplots(figsize=(45, 8))
ax.set_title("Number of images per site/deployment/event", fontsize=35)
sns.barplot(
    x=site_img_counts.index,
    y=site_img_counts.values,
    edgecolor="black",
    linewidth=1,
    ax=ax,
)
# Site names are rotated so they do not overlap; y tick labels are enlarged to
# stay readable on the very wide figure
plt.xticks(rotation=90)
plt.yticks(fontsize=30)
ax.grid()
plt.show()

## 4. Analyze missing values
- Raw total missing values
- How many missing values in mandatory columns

### 4.1 How many datasets have missing values

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(20, 5))
fig.suptitle("Missing values across datasets")
# Left: absolute number of missing cells; right: share of missing cells.
# Each entry is (axes, column of `datasets`, x-axis label).
panels = [
    (ax[0], "total_nans", "Number of missing values"),
    (ax[1], "nan_percent", "Percentage of missing values"),
]
for panel, column, xlabel in panels:
    panel.hist(datasets[column], bins=20, edgecolor="black", linewidth=1)
    panel.set_xlabel(xlabel)
    panel.set_ylabel("Count")
    panel.grid()
    # A log y-scale (panel.set_yscale("log")) can be enabled here if the
    # tallest bin hides the rest.
plt.show()

As we can see, most datasets have close to 0 missing values. There are a few datasets with around 5,000 or close to 40,000 missing values. The percentage plot shows a similar picture: most datasets have below 10% missing values, while a few have 20-25% missing. Percentages are calculated by dividing the total number of missing values in a dataset by the size (rows × columns) of the dataset; note that the values are stored and plotted as fractions between 0 and 1.

### 4.2 Detailed breakdown of missing values
Let us now examine each dataset and check which columns have how many missing values.

In [None]:
# For every dataset containing missing values, list the affected columns
for idx, (df, file) in enumerate(zip(df_list, files)):
    col_nans = df.isna().sum()  # NaN count per column
    n_missing = col_nans.sum()
    if n_missing <= 0:
        # Nothing missing in this dataset
        continue
    print(f"[{idx}][{file}] Total {n_missing} null values in dataframe.")
    display(col_nans[col_nans > 0])

## 5. Spatial distribution of samples

In [None]:
# Join all data
all_dfs = pd.concat(df_list)

# Values for plotting.
# Drop incomplete coordinates row-wise: dropping NaNs from each column on its
# own would pair longitudes with latitudes from different rows (or produce
# arrays of different lengths) whenever only one of the two is missing.
coords = all_dfs[["Longitude", "Latitude"]].dropna()
x = coords["Longitude"].to_numpy()
y = coords["Latitude"].to_numpy()
print("x.shape:", x.shape, "y.shape:", y.shape)

# Projection used to draw the maps
projection = ccrs.EqualEarth()
# Coordinate system of the input data passed as `transform` when plotting
transform = ccrs.Geodetic()

### 5.1 Scatter plot

In [None]:
# Map axes in the chosen projection with the stock background image
fig, ax = plt.subplots(figsize=(25, 8), subplot_kw={"projection": projection})
ax.stock_img()
ax.set_title("Spatial distribution of image samples")

# One translucent red marker per sample; overlapping samples appear darker
marker_style = {"color": "r", "alpha": 0.15}
ax.scatter(x, y, transform=transform, **marker_style)

plt.show()

### 5.2 Kernel Density Estimate

In [None]:
# Transparent colormap for plotting kernel density.
# Built by the project helper `eda.make_transparent_cmap`; it is passed as
# `cmap` to the contour plots below, which are drawn on top of the stock map
# image (presumably low densities are transparent so the map stays visible —
# TODO confirm against the helper).
my_cmap = eda.make_transparent_cmap()

#### 5.2.1 KDE on a sample of all coordinates

In [None]:
# Perform KDE on every 100th coordinate pair only
X2_, Y2_, Z2_ = eda.kde_sklearn(x[::100], y[::100], metric="haversine", bw_factor=0.1)

# The returned grid is converted with np.degrees before plotting
grid_lon = np.degrees(X2_)
grid_lat = np.degrees(Y2_)
plate_carree = ccrs.PlateCarree()

# Grid scatter: one marker per grid point, marker size taken from Z2_
# NOTE(review): the contour plot below exponentiates Z2_, so Z2_ is presumably
# a log-density; using it directly as marker size may be unintended — confirm.
fig, ax = plt.subplots(figsize=(25, 8), subplot_kw={"projection": projection})
ax.stock_img()
ax.scatter(grid_lon, grid_lat, color="r", s=Z2_, transform=plate_carree)
ax.set_title("KDE of image sample spatial distribution (grid scatter)")
plt.show()

# Contour plot (KDE): 25 evenly spaced levels from 0 up to the peak of exp(Z2_)
density = np.exp(Z2_)
contour_levels = np.linspace(0, np.exp(Z2_.max()), 25)
fig, ax = plt.subplots(figsize=(25, 8), subplot_kw={"projection": projection})
ax.stock_img()
ax.contourf(
    grid_lon,
    grid_lat,
    density,
    cmap=my_cmap,
    levels=contour_levels,
    transform=plate_carree,
)
ax.set_title("KDE of image sample spatial distribution (densities)")
plt.show()

#### 5.2.2 KDE on all coordinates

In [None]:
# Perform KDE on the full set of coordinate pairs
X2_, Y2_, Z2_ = eda.kde_sklearn(x, y, metric="haversine", bw_factor=0.5)

# The returned grid is converted with np.degrees before plotting
grid_lon = np.degrees(X2_)
grid_lat = np.degrees(Y2_)
plate_carree = ccrs.PlateCarree()

# Grid scatter: one marker per grid point, marker size taken from Z2_
# NOTE(review): the contour plot below exponentiates Z2_, so Z2_ is presumably
# a log-density; using it directly as marker size may be unintended — confirm.
fig, ax = plt.subplots(figsize=(25, 8), subplot_kw={"projection": projection})
ax.stock_img()
ax.scatter(grid_lon, grid_lat, color="r", s=Z2_, transform=plate_carree)
ax.set_title("KDE of image sample spatial distribution (grid scatter)")
plt.show()

# Contour plot (KDE): 25 evenly spaced levels from 0 up to the peak of exp(Z2_)
density = np.exp(Z2_)
contour_levels = np.linspace(0, np.exp(Z2_.max()), 25)
fig, ax = plt.subplots(figsize=(25, 8), subplot_kw={"projection": projection})
ax.stock_img()
ax.contourf(
    grid_lon,
    grid_lat,
    density,
    cmap=my_cmap,
    levels=contour_levels,
    transform=plate_carree,
)
ax.set_title("KDE of image sample spatial distribution (densities)")
plt.show()

## 6. Plot Sample images

### 6.1 Take a sample of image urls

In [None]:
# Sample each file/dataset's url column
N_SAMPLE_IMGS = 12  # number of images shown in the grid below
SAMPLE_SEED = 42  # fixed seed so the same images are drawn on every run
sample_rng = np.random.RandomState(SAMPLE_SEED)

candidate_urls = []
for file, df in zip(files, df_list):
    url_cols = utilz.get_url_cols(df)
    if len(url_cols) == 0:
        # No URL column in this dataset
        continue
    # Take a sample from the first url column. Missing values are removed
    # first so a dataset is not skipped just because the draw hit a NaN.
    urls = df[url_cols[0]].dropna()
    if urls.empty:
        continue
    sample = urls.sample(random_state=sample_rng).iloc[0]
    # Check if it is string and is valid url with an image file extension
    if (
        isinstance(sample, str)
        and checker.is_url(sample)
        and (sample.lower().endswith(checker.VALID_IMG_EXTENSIONS))
    ):
        candidate_urls.append(sample)

# Keep a subset of samples. Draw without replacement so no image appears
# twice, and never ask for more URLs than are available.
n_keep = min(N_SAMPLE_IMGS, len(candidate_urls))
if n_keep > 0:
    sample_imgs = sample_rng.choice(candidate_urls, size=n_keep, replace=False)
else:
    sample_imgs = np.array([], dtype=str)

### 6.2 Retrieve sampled images

In [None]:
# TODO: Add function to utilz.py
def img_from_url(
    url: str, verbose=False, max_retries: int = 3, timeout: float = 30.0
) -> np.ndarray:
    """
    Download an image and return it as an RGB array.

    The request is repeated until the server answers with status 200, up to
    ``max_retries`` attempts. The number of attempts is bounded so that a
    permanently failing URL cannot block the notebook forever.

    Parameters
    ----------
    url : str
        Address of the image to retrieve.
    verbose : bool, optional
        If True, print the HTTP status code of every attempt.
    max_retries : int, optional
        Maximum number of requests to make. Must be at least 1.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    numpy.ndarray
        Decoded image converted from OpenCV's BGR channel order to RGB.

    Raises
    ------
    ValueError
        If ``max_retries`` is smaller than 1, or the downloaded bytes cannot
        be decoded as an image.
    RuntimeError
        If no attempt returned status code 200.
    requests.RequestException
        Propagated unchanged from ``requests.get`` (connection errors,
        timeouts, ...).
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")

    status = None
    for _ in range(max_retries):
        resp = requests.get(url, timeout=timeout)
        status = resp.status_code
        if verbose:
            print(f"status code: {status}")
        if status != 200:
            continue
        # Wrap the raw bytes in a uint8 buffer for OpenCV's decoder
        arr = np.asarray(bytearray(resp.content), dtype=np.uint8)
        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        if img is None:
            # imdecode signals failure by returning None, not by raising
            raise ValueError(f"Could not decode image data from {url}")
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    raise RuntimeError(
        f"Failed to retrieve {url} after {max_retries} attempts "
        f"(last status code: {status})"
    )

### 6.3 Plot sampled images

In [None]:
ncols = 4
n_imgs = len(sample_imgs)
# Ceiling division so a partially filled last row is still drawn; at least
# one row so plt.subplots also works when there are fewer than `ncols` images.
nrows = max(1, -(-n_imgs // ncols))
# squeeze=False keeps `axes` two-dimensional even for a single row
fig, axes = plt.subplots(
    nrows=nrows, ncols=ncols, figsize=(16, nrows * 4), squeeze=False
)
panels = axes.ravel()
for ax, url in zip(panels, sample_imgs):
    print(f"Retrieving: {url} ...")
    img = img_from_url(url, verbose=True)
    ax.imshow(img)
    ax.set_title(f"Shape: {img.shape}")
# Hide the panels that did not receive an image
for ax in panels[n_imgs:]:
    ax.axis("off")
fig.tight_layout()
fig.set_facecolor("w")
plt.show()