In [None]:
import os
import re
import sys

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

%matplotlib inline

# sys.path.append("..")
# from pangaea_downloader import utilz

## 1. Load downloaded data
Load the data we downloaded from Pangaea. Presumably these should be datasets of "*seabed photographs*", which was the search query that produced the downloaded data.

**\# Mandatory Columns**<br>
- image URL
- longitude
- latitude
- campaign name (ID)
- deployment/site/dive

**\# Optional Columns**<br>
  - depth (optional)
  - altitude (optional)
  - timestamp (optional)
  - web/media source (optional)
  - name of provider (optional)
  - salinity (optional)
  - chlorophyll concentration (optional)
  - temperature (optional)

In [None]:
TEST_DIR = "../query-outputs"
# List of CSV files in directory.
# `os.listdir` returns entries in arbitrary order, so the names are sorted to
# keep positional lookups into `df_list` reproducible between runs. Entries
# that are not CSV files (e.g. sub-directories or hidden files) are skipped so
# that `pd.read_csv` is only ever handed a CSV file.
files = sorted(f for f in os.listdir(TEST_DIR) if f.lower().endswith(".csv"))
N_FILES = len(files)
print(f"[INFO] Total {N_FILES} files in directory.")
# Load data: one DataFrame per file, in the same order as `files`
df_list = [pd.read_csv(os.path.join(TEST_DIR, f)) for f in files]

## 2. Check one dataframe

In [None]:
# Pick one dataframe (position 16 in `df_list`) as the sample inspected in this section
df = df_list[16]
print(f"[INFO] {df.shape[0]} rows X {df.shape[1]} columns")
# Preview the first two rows
df.head(2)

### 2.1 Check null values

In [None]:
# Count the missing entries column by column
nans_per_column = df.isnull().sum()
# Add the per-column counts up to get the dataset-wide total
total_nans = nans_per_column.sum()
print(f"[INFO] Total {total_nans} null values in dataframe.")
# Display only those columns that actually contain missing entries
has_nans = nans_per_column.gt(0)
nans_per_column[has_nans]

**Note:** Although there are quite a few null values in the 'Course' and 'Speed' column, the dataset isn't missing values in the mandatory columns such as Image URL, Latitude, Longitude, campaign name or site.

### 2.2 Number of Images (with valid urls)
- Since the name for the image URL column varies with each file/dataset we have to first identify which column(s) have URLs.

- We should also check if the values in the URL columns have valid URLs.

In [None]:
def get_url_cols(df: pd.DataFrame) -> list:
    """Return the names of the columns whose name contains "url".

    The match is case-insensitive and the original column order is kept.
    """
    matches = []
    for name in df.columns:
        if "url" in name.lower():
            matches.append(name)
    return matches

In [None]:
# Test: list the URL columns detected in the sample dataframe
get_url_cols(df)

In [None]:
# Compiled once when the cell runs instead of on every call.
# src: https://stackoverflow.com/questions/7160737/how-to-validate-a-url-in-python-malformed-or-not
_URL_REGEX = re.compile(
    r"^(?:http|ftp)s?://"  # http://, https://, ftp:// or ftps://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
    r"localhost|"  # localhost...
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",
    re.IGNORECASE,
)


def is_url(string: str) -> bool:
    """Return True if `string` is a well-formed http(s)/ftp(s) URL.

    Only the syntax is checked; no request is made.

    Parameters
    ----------
    string
        Candidate value. Anything that is not a `str` (for example the float
        NaN pandas uses for a missing cell, or None) is reported as not a URL
        instead of raising a TypeError.

    Returns
    -------
    bool
        True when the whole string matches the URL pattern, False otherwise.
    """
    if not isinstance(string, str):
        return False
    return _URL_REGEX.match(string) is not None

In [None]:
# Test: the first candidate is malformed (expected False),
# the second is well-formed (expected True)
for candidate in ("http://www..com", "http://www.google.com"):
    print(is_url(candidate))

**Number of images in sample dataset**

In [None]:
# Get the image URL columns
url_cols = get_url_cols(df)
for col in url_cols:
    # Validate the first non-null value of the column. Looking the value up
    # positionally after dropping nulls avoids crashing on a missing first
    # row or a non-default index, and the value reported below is exactly
    # the one that was checked (so the result is reproducible).
    non_null = df[col].dropna()
    first_value = non_null.iloc[0] if len(non_null) else None
    # Check if the URL is valid
    if isinstance(first_value, str) and is_url(first_value):
        # Number of non-null entries in the column
        print(f"{col} : {df[col].count()}")
    else:
        print(f"{col} value not url! {first_value}")

### 2.3 Number of Campaigns

In [None]:
# Distinct values of the "Campaign" column in the sample dataframe
df["Campaign"].unique()

### 2.4 Number of Sites

In [None]:
# Distinct values of the "Site" column in the sample dataframe
df["Site"].unique()

### 2.5 Plot sample image

In [None]:
def img_from_bytes(resp):
    """Decode the body of an HTTP response into an RGB image array.

    Parameters
    ----------
    resp
        Response-like object whose `content` attribute holds the encoded
        image bytes (e.g. the object returned by `requests.get`).

    Returns
    -------
    numpy.ndarray
        The decoded image, converted from OpenCV's BGR channel order to RGB.

    Raises
    ------
    ValueError
        If the body is empty or cannot be decoded as an image.
    """
    if not resp.content:
        raise ValueError("Response body is empty; nothing to decode.")
    arr = np.frombuffer(resp.content, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    # `imdecode` signals undecodable data by returning None rather than
    # raising; fail here with a clear message instead of inside `cvtColor`.
    if img is None:
        raise ValueError("Response body could not be decoded as an image.")
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
def plot_sample_img(df, verbose=False, max_tries=3, timeout=30):
    """Fetch and plot one randomly chosen image from a dataframe.

    A row is drawn at random and the value of the first URL column (as
    returned by `get_url_cols`) is requested. If the server does not answer
    with status 200, another random row is tried, up to `max_tries` times.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with at least one URL column.
    verbose : bool, default False
        If True, print the sampled index, column, URL and response status.
    max_tries : int, default 3
        Maximum number of rows to try before giving up.
    timeout : float, default 30
        Timeout in seconds passed to `requests.get`.

    Returns
    -------
    None
        The image is shown with matplotlib; problems are reported by printing
        an "[ERROR] ..." line and returning early.
    """
    url_cols = get_url_cols(df)
    if len(df) == 0 or not url_cols:
        print("[ERROR] Dataframe has no rows or no URL column to sample from.")
        return
    # Select the first url column
    col = url_cols[0]

    resp = None
    for _ in range(max_tries):
        # Random sample, looked up by position so any index type works
        idx = np.random.randint(0, len(df))
        url = df[col].iloc[idx]
        if verbose:
            print(f"Index: {idx}, Column: '{col}', URL: '{url}'")
        # Check if URL is valid (missing values are not strings)
        if not isinstance(url, str) or not is_url(url):
            print(f"[ERROR] Invalid URL: '{url}'")
            return
        # Fetch image if valid URL
        resp = requests.get(url, timeout=timeout)
        if verbose:
            print("Response status code:", resp.status_code)
        if resp.status_code == 200:
            break
    else:
        # Loop finished without a successful response
        print(f"[ERROR] No image could be fetched after {max_tries} attempts.")
        return

    # Decode and plot only once a successful response is in hand
    img = img_from_bytes(resp)
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(img)
    plt.show()

In [None]:
# Plot one random image from the sample dataframe, printing the details of the sampled row
plot_sample_img(df, verbose=True)

## 3. Check all dataframes

### 3.1 Check null values

In [None]:
# Report, for every file that has any, the number of null values per column.
# The loop uses its own names so that the sample `df`, `nans_per_column` and
# `total_nans` defined earlier in the notebook are not overwritten.
for file_idx, (frame, file) in enumerate(zip(df_list, files)):
    frame_nans = frame.isna().sum()
    frame_total_nans = frame_nans.sum()
    if frame_total_nans > 0:
        print(f"[{file_idx}][{file}] Total {frame_total_nans} null values in dataframe.")
        # Show only the columns with null values
        display(frame_nans[frame_nans > 0])

**Issue:** Many files are missing Latitude and Longitude values: 
- 873995.csv, 873996.csv, 873997.csv, 873998.csv, 873999.csv, 874000.csv, 874001.csv, 874002.csv, 875071.csv, 875073.csv, 875080.csv, 875084.csv, 878001.csv, 878003.csv, 878004.csv, 878006.csv, 878007.csv, 878008.csv, 878009.csv, 878013.csv, 878014.csv, 878016.csv, 878019.csv, 894732.csv, 914155.csv, 914192.csv, 918924.csv, 928814.csv
    
**Issue:** Files missing image URL values:
- 914212.csv, 918925.csv, 919836.csv, 928814.csv

### 3.2 Number of Images (with valid urls)

In [None]:
d = {}  # file name -> number of non-null values in its valid URL column
valid_cols = []  # names of URL columns whose values are valid URLs
invalid_cols = []  # names of URL columns whose values are not valid URLs
# `frame` / `file_idx` keep the notebook-level `df` and the file index intact
for file_idx, (frame, file) in enumerate(zip(df_list, files)):
    # Get the image URL columns
    for col in get_url_cols(frame):
        # Validate the first non-null value of the column. Looking it up
        # positionally after dropping nulls avoids crashing on a missing
        # first row or a non-default index, and keeps the result reproducible.
        non_null = frame[col].dropna()
        first_value = non_null.iloc[0] if len(non_null) else None
        # Check if the URL is valid
        if isinstance(first_value, str) and is_url(first_value):
            n_images = frame[col].count()
            print(f"[{file_idx}] [{file}]")
            print(f"\t{col} : {n_images}")
            # NOTE: when a file has several valid URL columns, the count of
            # the last one is kept (one entry per file).
            d[file] = n_images
            valid_cols.append(col)
        else:
            invalid_cols.append(col)

In [None]:
# Sum of the per-file counts collected in `d`
pd.Series(d).sum()

#### 3.2.1. URL columns with valid URLs

In [None]:
# De-duplicate the column names collected in `valid_cols`
print("Unique image URL columns names (valid):")
set(valid_cols)

**Various names of image URL column**:
- URL
- URL file
- URL image
- URL raw (also URL thumb: lower res version)
- URL ref
- URL source

**Unexpected:**
- URL movie (839384.csv, 839386.csv, 839387.csv, 839388.csv, 839389.csv, 839390.csv, 839391.csv, 839392.csv, 839393.csv, 839394.csv, 839395.csv, 839396.csv, 839397.csv, 839398.csv, 839399.csv)

#### 3.2.2. URL columns with invalid URLs

In [None]:
# De-duplicate the column names collected in `invalid_cols`
print("Unique image URL columns names (invalid):")
set(invalid_cols)

'IMAGE', 'IMAGE (Size)', 'Image' these columns were also being identified as URL columns but their values are not valid URLs.

### Number of unique campaigns

In [None]:
campaigns = []  # every campaign value seen, file by file (may repeat)
n_campaigns = []  # number of distinct non-null campaigns in each file
# `frame` is used as loop variable so the notebook-level `df` is not overwritten
for frame in df_list:
    if "Campaign" in frame.columns:
        # Add to list of campaigns
        campaigns.extend(frame["Campaign"].unique())
        n_campaigns.append(frame["Campaign"].nunique())
    else:
        # No campaign column at all: record zero instead of raising KeyError
        n_campaigns.append(0)

# Check if all files have a campaign: every file must contribute at least one
# non-null campaign value. Comparing the length of `campaigns` with the number
# of files would give the wrong answer as soon as one file holds several
# campaigns.
print("All files have campaign:", all(n > 0 for n in n_campaigns))

In [None]:
# Keep only unique entries
campaigns = set(campaigns)
len(campaigns)

### 3.3 Total number of Campaigns in all files

In [None]:
# First campaign value of every file, de-duplicated across files
unique_campaigns = {frame["Campaign"].unique()[0] for frame in df_list}
print("Total number of campaigns:", len(unique_campaigns), end="\n\n")
print(unique_campaigns)

### 3.4 Total number of Sites in all files

In [None]:
# First site value of every file, de-duplicated across files
unique_sites = {frame["Site"].unique()[0] for frame in df_list}
print("Total number of sites:", len(unique_sites), end="\n\n")
print(unique_sites)

### 3.5 How many images per campaign

In [None]:
# Start every known campaign at zero so campaigns without images still appear
camps = {campaign: 0 for campaign in unique_campaigns}
# `frame` is used as loop variable so the notebook-level `df` is not overwritten
for frame in df_list:
    # A file is attributed to its first campaign value
    campaign = frame["Campaign"].unique()[0]
    img_cols = get_url_cols(frame)
    if img_cols:
        # Count the non-null entries of the first URL column
        camps[campaign] += frame[img_cols[0]].count()

In [None]:
# Convert the counts to a Series sorted from largest to smallest
# (rebinds `camps` from a dict to a Series)
camps = pd.Series(camps).sort_values(ascending=False)
camps

In [None]:
# Bar chart of the image count per campaign, with vertical tick labels
fig, ax = plt.subplots(figsize=(16, 5))
ax.set_title("Number of images per campaign")
sns.barplot(x=camps.index, y=camps.values, ax=ax)
ax.tick_params(axis="x", labelrotation=90);

### 3.6 Plot random samples

In [None]:
# Plot one random image from each of the first 12 datasets.
# Slicing states the number of plots directly, and `frame` is used as loop
# variable so the notebook-level `df` is not overwritten.
for frame in df_list[:12]:
    plot_sample_img(frame)

## 4. Dataset metadata table
This table will contain data about each dataset/file. Each row will contain information such as the number of nans in that dataset, the number of images, etc.

In [None]:
# Dataset metadata table
datasets = pd.DataFrame(
    {
        "filename": files,
        "n_rows": [len(df) for df in df_list],
        "size": [df.size for df in df_list],  # size = rows x columns
        "total_nans": [df.isna().sum().sum() for df in df_list],
    }
)
# Calculate the percentage of null values in each dataframe
datasets["percent_nans"] = (datasets["total_nans"] / datasets["size"] * 100).round(2)

print("Number of datasets downloaded:", len(datasets))
datasets.head()

In [None]:
# Totals over all datasets
all_rows = datasets["n_rows"].sum()
all_nans = datasets["total_nans"].sum()
all_cells = datasets["size"].sum()
# Express the share of null cells as a real percentage (x100), consistent
# with the per-dataset `percent_nans` column; guard against an empty table.
all_percent_nans = all_nans / all_cells * 100 if all_cells else 0.0

print(f"Total datapoints in all datasets combined: {all_rows}")
print(f"Total null values in all datasets combined: {all_nans}")
print(f"% of null values in all datasets combined: {all_percent_nans:.2f}%")

In [None]:
# Showing Files with null values
nan_datasets = datasets[datasets["total_nans"] > 0]
print("Number of files with null values:", len(nan_datasets))
nan_datasets.head()

## 5. Visualizations

In [None]:
# Use seaborn's "whitegrid" style for the plots that follow
sns.set_style("whitegrid")

### 5.1 Null values

In [None]:
# Largest `total_nans` value over all datasets
datasets.total_nans.max()

In [None]:
# Distribution of the null-value count over all datasets
fig, ax = plt.subplots(figsize=(16, 4))
ax.hist(datasets["total_nans"], bins=20)
ax.set(
    title="Total null values in each dataframe",
    ylabel="Frequency",
    xlabel="Number of null values",
)
plt.show()

In [None]:
# Distribution of the null-value percentage, restricted to datasets that have nulls
fig, ax = plt.subplots(figsize=(16, 4))
ax.hist(nan_datasets["percent_nans"], bins=20)
ax.set(
    title="% of null values in each dataframe",
    ylabel="Frequency",
    xlabel="% of null values",
)
plt.show()

**Checking the dataset with the highest number of null values**

In [None]:
# Check the dataset with max null values
idx = datasets["total_nans"].argmax()
sample = df_list[idx]
print("Total NaNs in sample:", sample.isna().sum().sum())
sample.isna().sum()[sample.isna().any()].sort_values(ascending=False)

**Note:** Although it has the highest number of null values, it is not missing data in any of the columns we care about, such as Image URL, Latitude, Longitude, campaign name or site.