In [None]:
import os

import cartopy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.basemap import Basemap
from scipy import stats
from sklearn.datasets import fetch_species_distributions, make_blobs
from sklearn.neighbors import KernelDensity

%matplotlib inline

## Load data

In [None]:
DATA_DIR = "../query-outputs/"
# os.listdir returns every directory entry in arbitrary order. Keep only CSV
# files (so stray files or checkpoint folders are not handed to read_csv) and
# sort them so the load order is the same on every run.
files = sorted(f for f in os.listdir(DATA_DIR) if f.lower().endswith(".csv"))
if not files:
    # Fail here with a clear message instead of later inside pd.concat.
    raise FileNotFoundError(f"No CSV files found in {DATA_DIR!r}")
# One DataFrame per query-output file, in the same order as `files`.
df_list = [pd.read_csv(os.path.join(DATA_DIR, f)) for f in files]

Check if all files have latitude and longitude columns

In [None]:
def has_lat_lon(frame):
    """
    Tell whether a frame carries both coordinate columns.

    Returns True only if ``frame.columns`` contains a column named exactly
    "Latitude" and one named exactly "Longitude" (case-sensitive);
    otherwise returns False.
    """
    column_names = set(frame.columns)
    return {"Latitude", "Longitude"} <= column_names


# Summarise the per-file column check as a single boolean.
every_file_ok = all(map(has_lat_lon, df_list))
print("All files have lat/lon columns:", every_file_ok)

In [None]:
# Stack the per-file frames into one table. ignore_index=True gives the result
# a fresh 0..n-1 index; without it every file contributes its own 0..k-1
# labels, leaving duplicate index labels that make label-based alignment
# ambiguous for downstream consumers.
all_dfs = pd.concat(df_list, ignore_index=True)
all_dfs.describe()

## Plot data

In [None]:
def make_basemap(df: pd.DataFrame, projection="mill", full_map=True) -> Basemap:
    """
    Build a coarse-resolution Basemap for plotting sample locations.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Frame holding the sample coordinates in its "Latitude" and
        "Longitude" columns. The columns are only read when
        ``full_map`` is False.

    projection : str
        Projection name passed straight through to ``Basemap``
        (Default = "mill").

    full_map : bool
        When True (the default) the map spans latitudes -90..90 and
        longitudes -180..180. When False the map is cropped to the
        minimum/maximum latitude and longitude found in ``df``.

    Returns
    -------
    Basemap
        A map object created with resolution "c" and the corner
        coordinates described above.
    """
    if full_map:
        # Whole-world corners.
        south, north = -90, 90
        west, east = -180, 180
    else:
        # Tight bounding box around the samples.
        south, north = df.Latitude.min(), df.Latitude.max()
        west, east = df.Longitude.min(), df.Longitude.max()

    return Basemap(
        projection=projection,
        resolution="c",
        llcrnrlat=south,
        urcrnrlat=north,
        llcrnrlon=west,
        urcrnrlon=east,
    )

### 1. Scatter plot

In [None]:
fig = plt.figure(figsize=(12, 9))

# Sample coordinates as plain Python lists (x = longitude, y = latitude).
lon_x = all_dfs["Longitude"].tolist()
lat_y = all_dfs["Latitude"].tolist()

# World map with one translucent red marker per sample.
m = make_basemap(all_dfs, full_map=True)
m.scatter(lon_x, lat_y, latlon=True, alpha=0.1, s=20, c="red", marker="o")
m.drawlsmask(land_color="grey", ocean_color="lightblue", lakes=True)

# Graticule: parallels every 10 degrees (labelled on the left),
# meridians every 30 degrees (labelled at the bottom).
parallels = np.arange(-90, 90, 10)
meridians = np.arange(-180, 180, 30)
m.drawparallels(parallels, labels=[1, 0, 0, 0])
m.drawmeridians(meridians, labels=[0, 0, 0, 1])

# labelpad moves the axis labels clear of the graticule labels.
map_ax = plt.gca()
map_ax.set_title("Spatial distribution of Pangaea datasets", fontsize=20)
map_ax.set_xlabel("x axis (Longitude)", fontsize=15, labelpad=25)
map_ax.set_ylabel("y axis (Latitude)", fontsize=15, labelpad=40)
plt.show()

### 2. Kernel density estimate

#### 2.1 Simple example of 2D density plots in python - How to visualize joint distributions 
Src: https://towardsdatascience.com/simple-example-of-2d-density-plots-in-python-83b83b934f67

In [None]:
# Synthetic 2D sample: three Gaussian blobs with different spreads.
# random_state pins the draw so the figure is reproducible.
n_components = 3
X, truth = make_blobs(
    n_samples=300,
    centers=n_components,
    cluster_std=[2, 1.5, 1],
    random_state=42,
)

# Scatter the points, coloured by the blob each one was drawn from.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(X[:, 0], X[:, 1], s=50, c=truth)
ax.set_title(f"Example of a mixture of {n_components} distributions")
ax.set_xlabel("x")
ax.set_ylabel("y");

In [None]:
# Split the samples into their two coordinate vectors
x, y = X[:, 0], X[:, 1]
# Pad the data range by 10% on every side so the density is not cut off
deltaX = (x.max() - x.min()) / 10
deltaY = (y.max() - y.min()) / 10
xmin, xmax = x.min() - deltaX, x.max() + deltaX
ymin, ymax = y.min() - deltaY, y.max() + deltaY
print(xmin, xmax, ymin, ymax)
# 100 x 100 evaluation grid over the padded range (100j = number of points)
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]

In [None]:
# gaussian_kde takes arrays of shape (n_dims, n_points):
# `values` are the samples, `positions` the grid nodes to evaluate at.
values = np.vstack((x, y))
positions = np.vstack((xx.ravel(), yy.ravel()))
kernel = stats.gaussian_kde(values)
# Fold the flat vector of densities back into the grid layout.
f = kernel(positions).reshape(xx.shape)

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
# Filled contours, then the density as an image, then labelled contour lines.
cfset = ax.contourf(xx, yy, f, cmap="Blues")
# rot90 turns the (x, y)-indexed grid into image rows running from ymax
# down to ymin, which is what imshow draws by default.
ax.imshow(np.rot90(f), cmap="Blues", extent=(xmin, xmax, ymin, ymax))
cset = ax.contour(xx, yy, f, colors="k")
ax.clabel(cset, inline=True, fontsize=10)
ax.set(xlabel="X", ylabel="Y", title="2D Gaussian Kernel density estimation")
plt.show()

#### 2.2 Seaborn `kdeplot`

In [None]:
# Bivariate KDE of the sample locations, drawn as filled contours.
# `fill=True` replaces the deprecated `shade=True` spelling.
ax = sns.kdeplot(data=all_dfs, x="Longitude", y="Latitude", fill=True, cmap="PuBu")

#### 2.3 `scipy.stats.gaussian_kde` example implementation
src: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html

##### 1. Prepare data

In [None]:
# Drop incomplete rows from both columns at once. Calling dropna() on each
# column separately could leave the two series with different lengths (or
# pair a longitude with the latitude of another row) whenever only one of the
# two coordinates is missing, which breaks the vstack in the KDE step.
coords = all_dfs[["Longitude", "Latitude"]].dropna()
m1 = coords["Longitude"]
m2 = coords["Latitude"]
# Bounding box of the samples; used as the extent of the evaluation grid.
xmin = m1.min()
xmax = m1.max()
ymin = m2.min()
ymax = m2.max()

##### 2. Perform a kernel density estimate on the data:

In [None]:
# 100 x 100 grid over the samples' bounding box; X (longitude) varies along
# the first axis and Y (latitude) along the second.
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
# gaussian_kde takes arrays of shape (n_dims, n_points).
positions = np.vstack((X.ravel(), Y.ravel()))
values = np.vstack((m1, m2))
kernel = stats.gaussian_kde(values)
# Densities at the grid nodes, folded back into the grid layout.
Z = kernel(positions).reshape(X.shape)

##### 3. Plot the results:

In [None]:
fig, ax = plt.subplots(figsize=(12, 9))
# rot90 turns the (lon, lat)-indexed grid into image rows running from ymax
# down to ymin; `extent` pins the image to the samples' bounding box.
density_image = np.rot90(Z)
ax.imshow(density_image, cmap=plt.cm.Reds, extent=(xmin, xmax, ymin, ymax))
ax.scatter(m1, m2, color="k", marker=".")
# Show the whole lon/lat range, not just the area covered by the samples.
ax.set_xlim(-180, 180)
ax.set_ylim(-90, 90)
plt.show()

In [None]:
# Prepare map
fig = plt.figure(figsize=(12, 9))
ax = fig.gca()
m = make_basemap(all_dfs, full_map=True)
m.drawparallels(np.arange(-90, 90, step=10), labels=[1, 0, 0, 0])
m.drawmeridians(np.arange(-180, 180, step=30), labels=[0, 0, 0, 1])

# Plot density
# The KDE grid (X, Y, Z) is in degrees and only covers the samples' bounding
# box. Basemap.imshow always stretches the array over the entire map region
# and defaults to origin="lower", so imshow(np.rot90(Z)) drew the density
# upside-down and in the wrong place. Instead, convert the grid nodes to map
# coordinates and draw filled contours at their true positions. The transposes
# put longitude along the last axis, the usual meshgrid layout.
x_map, y_map = m(X.T, Y.T)
m.contourf(x_map, y_map, Z.T, levels=np.linspace(0, Z.max(), 25), cmap=plt.cm.Reds)

# Plot data (after the density, so the sample markers stay visible on top)
lat_y = all_dfs["Latitude"].to_list()
lon_x = all_dfs["Longitude"].to_list()
m.scatter(lon_x, lat_y, latlon=True, alpha=0.1, s=20, c="k", marker="o")

plt.title("Spatial distribution of Pangaea datasets", fontsize=20)
plt.xlabel("x axis (Longitude)", fontsize=15, labelpad=25)
plt.ylabel("y axis (Latitude)", fontsize=15, labelpad=40)
plt.show()

##### 4. Try it on a map

In [None]:
# Prepare map
fig = plt.figure(figsize=(12, 9))
ax = fig.gca()
m = make_basemap(all_dfs, full_map=True)
m.drawparallels(np.arange(-90, 90, step=10), labels=[1, 0, 0, 0])
m.drawmeridians(np.arange(-180, 180, step=30), labels=[0, 0, 0, 1])

# Plot density
# The KDE grid (X, Y, Z) is in degrees and only covers the samples' bounding
# box. Basemap.imshow always stretches the array over the entire map region
# and defaults to origin="lower", so imshow(np.rot90(Z)) drew the density
# upside-down and in the wrong place. Instead, convert the grid nodes to map
# coordinates and draw filled contours at their true positions. The transposes
# put longitude along the last axis, the usual meshgrid layout.
x_map, y_map = m(X.T, Y.T)
m.contourf(x_map, y_map, Z.T, levels=np.linspace(0, Z.max(), 25), cmap=plt.cm.Reds)

# Plot data (after the density, so the sample markers stay visible on top)
lat_y = all_dfs["Latitude"].to_list()
lon_x = all_dfs["Longitude"].to_list()
m.scatter(lon_x, lat_y, latlon=True, alpha=0.1, s=20, c="k", marker="o")

m.drawcoastlines()
plt.title("Spatial distribution of Pangaea datasets", fontsize=20)
plt.xlabel("x axis (Longitude)", fontsize=15, labelpad=25)
plt.ylabel("y axis (Latitude)", fontsize=15, labelpad=40)
plt.show()

In [None]:
fig = plt.figure(figsize=(12, 9))
# Basemap plotting methods expect projected map coordinates unless told
# otherwise, but the KDE grid is in degrees of longitude/latitude. Passing
# X, Y unconverted puts the contours in the wrong place, so project the grid
# first. The transposes put longitude along the last axis, the usual meshgrid
# layout.
x_map, y_map = m(X.T, Y.T)
m.contourf(x_map, y_map, Z.T, levels=np.linspace(0, Z.max(), 25), cmap="Reds")
# Coastlines give the density a geographic reference.
m.drawcoastlines()
plt.show()

In [None]:
# Same density on plain lon/lat axes (no map projection): 25 evenly spaced
# contour levels from zero up to the peak density.
density_levels = np.linspace(0, Z.max(), 25)
fig, ax = plt.subplots(figsize=(12, 9))
ax.contourf(X, Y, Z, levels=density_levels, cmap="Reds")
plt.show()

- [Kernel Density Estimate of Species Distributions](https://scikit-learn.org/stable/auto_examples/neighbors/plot_species_kde.html)
- [In Depth: Kernel Density Estimation](https://jakevdp.github.io/PythonDataScienceHandbook/05.13-kernel-density-estimation.html)

In [None]:
# all_dfs[['Longitude', 'Latitude']].to_csv("../all_dfs.csv", index=False)