Skip to content

Commit

Permalink
added dataset struct for simpler access to metadata and restructured …
Browse files Browse the repository at this point in the history
…project
  • Loading branch information
0x6e66 committed Feb 14, 2024
1 parent a41821c commit 9ec1e4f
Show file tree
Hide file tree
Showing 14 changed files with 417 additions and 835 deletions.
11 changes: 6 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
[package]
name = "pangaea"
version = "0.1.2"
version = "0.2.0"
edition = "2021"
authors = ["Niklas Frondorf <niklas.frondorf@web.de>"]
readme = "README.md"
license-file = "LICENSE"
license = "MIT OR Apache-2.0"
description = "A rust module to access data from pangaea.de"
repository = "https://github.com/0x6e66/pangaea"
homepage = "https://github.com/0x6e66/pangaea"
include = ["src", "Cargo.toml", "README.md", "LICENSE"]
keywords = ["data_publisher", "pangaea", "geo_data"]
include = ["src", "Cargo.toml", "README.md"]
keywords = ["data-publisher", "pangaea", "geographic"]
documentation = "https://docs.rs/pangaea/"


[dependencies]
chrono = { version = "0.4.33", features = ["serde"] }
elasticsearch = "8.5.0-alpha.1"
futures-util = "0.3.30"
reqwest = { version = "0.11.23", features = ["stream"] }
Expand All @@ -30,4 +31,4 @@ tokio = { version = "1.35.1", features = ["full"] }


[patch.crates-io]
elasticsearch = { git = "https://github.com/0x6e66/elasticsearch-rs.git", branch = "version-5.6-compatibility", version = "8.6.0-alpha.1"}
elasticsearch = { git = "https://github.com/0x6e66/elasticsearch-rs.git", branch = "version-5.6-compatibility", version = "8.6.0-alpha.1"}
674 changes: 0 additions & 674 deletions LICENSE

This file was deleted.

23 changes: 15 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,32 +1,39 @@
# A rust module to access [PANGAEA](https://www.pangaea.de/) (meta)data

## Getting started
Run `cargo add pangaea` or add `pangaea = "0.2.0"` to your dependencies

## Get metadata for a specific PANGAEA dataset
```rust
use std::{fs::File, io::Write};
use pangaea::elastic::elastic_doc;

use pangaea::dataset::datasettype::Dataset;
use elasticsearch::{Elasticsearch, http::transport::Transport};

#[tokio::main]
pub async fn main() {
let dataset_id = 820322;
let metadata = elastic_doc(dataset_id).await.unwrap();
let transport = Transport::single_node("https://ws.pangaea.de/es/pangaea").unwrap();
let client = Elasticsearch::new(transport);

let dataset = Dataset::new(dataset_id, &client).await.unwrap();

let mut file = File::create(format!("pangaea-dataset-{}.json", dataset_id)).unwrap();
let json = serde_json::to_string(&metadata).unwrap();
let json = serde_json::to_string(&dataset).unwrap();
write!(file, "{}", json).unwrap();
}
```

## Search for multiple datasets
```rust
use std::{fs::File, io::Write};
use pangaea::elastic::elastic_search;

use pangaea::dataset::datasettype::Dataset;
use elasticsearch::{Elasticsearch, http::transport::Transport};

#[tokio::main]
pub async fn main() {
let res = elastic_search(0, 10, None, &["sp-lastModified:desc"])
let transport = Transport::single_node("https://ws.pangaea.de/es/pangaea").unwrap();
let client = Elasticsearch::new(transport);

let res = Dataset::search(0, 10, None, &["sp-lastModified:desc"], &client)
.await
.unwrap();

Expand Down
13 changes: 0 additions & 13 deletions examples/elastic_doc.rs

This file was deleted.

19 changes: 0 additions & 19 deletions examples/elastic_search.rs

This file was deleted.

197 changes: 197 additions & 0 deletions src/dataset/datasettype.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
use crate::metadata::metadatatype;
use chrono::{DateTime, Utc};
use serde_derive::{Deserialize, Serialize};

/// Flattened, serde-serializable summary of a PANGAEA dataset, built from
/// the raw [`metadatatype::MetaDataType`] tree via the `From` impl in this
/// file.
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Dataset {
    /// Identifier taken from the citation's id attribute, when present.
    pub pangaea_id: Option<String>,
    /// Citation title.
    pub title: String,
    /// Abstract text, if the metadata provides one.
    pub text_abstract: Option<String>,
    /// Citation authors, each with their affiliations.
    pub authors: Vec<Author>,
    /// Citation date-time parsed as RFC 3339; `None` if absent or unparseable.
    pub publication_date: Option<DateTime<Utc>>,
    /// Citation URI, if any.
    pub uri: Option<String>,
    /// Geographic/temporal/elevation extent, if any.
    pub extent: Option<Extent>,
    /// Keyword texts; empty when the metadata has no keyword block.
    pub keywords: Vec<String>,
}

impl From<metadatatype::MetaDataType> for Dataset {
fn from(md: metadatatype::MetaDataType) -> Self {
let pangaea_id = md
.citation
.citation_type
.id_attributes
.id
.map(|id| id.to_owned());

let title = md.citation.citation_type.title;
let text_abstract = md.text_abstract;

let authors: Vec<Author> = md
.citation
.citation_type
.authors
.into_iter()
.map(|a| a.into())
.collect();

let publication_date: Option<DateTime<Utc>> = match md
.citation
.date_time
.map(|dt| DateTime::parse_from_rfc3339(&dt).ok().map(|dt| dt.to_utc()))
{
None => None,
Some(v) => v,
};

let uri = md.citation.citation_type.uri;
let extent = md.extent.map(|ext| ext.into());

let keywords = md
.keywords
.map(|k| k.keywords)
.unwrap_or_default()
.into_iter()
.map(|kt| kt.text)
.collect();

Dataset {
pangaea_id,
title,
text_abstract,
authors,
publication_date,
uri,
extent,
keywords,
}
}
}

/// An author of a dataset, derived from a responsible-party record in the
/// raw metadata.
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Author {
    pub last_name: String,
    pub first_name: Option<String>,
    pub e_mail: Option<String>,
    pub uri: Option<String>,
    /// ORCID identifier, if known.
    pub orcid: Option<String>,
    /// Institutions the author is affiliated with.
    pub affiliations: Vec<Institution>,
}

impl From<metadatatype::ResponsiblePartyType> for Author {
fn from(party: metadatatype::ResponsiblePartyType) -> Self {
Self {
last_name: party.last_name,
first_name: party.first_name,
e_mail: party.e_mail,
uri: party.uri,
orcid: party.orcid,
affiliations: party
.affiliations
.into_iter()
.map(|aff| aff.into())
.collect(),
}
}
}

/// An institution an author is affiliated with.
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Institution {
    pub name: String,
    pub uri: Option<String>,
    /// ROR (Research Organization Registry) identifier, if known.
    pub ror: Option<String>,
    /// Crossref Funder ID, if known.
    pub crossref_funder_id: Option<String>,
}

impl From<metadatatype::InstitutionType> for Institution {
fn from(inst: metadatatype::InstitutionType) -> Self {
Self {
name: inst.linked_name_type.name,
uri: inst.linked_name_type.uri,
ror: inst.ror,
crossref_funder_id: inst.crossref_funder_id,
}
}
}

/// Extent of a dataset; each dimension is optional.
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Extent {
    pub geographic: Option<Geographic>,
    pub temporal: Option<Temporal>,
    pub elevation: Option<Elevation>,
}

impl From<metadatatype::ExtentType> for Extent {
fn from(extent: metadatatype::ExtentType) -> Self {
Self {
geographic: extent.geographic.map(|g| g.into()),
temporal: extent.temporal.map(|t| t.into()),
elevation: extent.elevation.map(|e| e.into()),
}
}
}

/// Geographic bounding box plus mean coordinates.
/// Values are presumably degrees of longitude/latitude — TODO confirm
/// against the PANGAEA metadata schema.
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Geographic {
    pub west_bound_longitude: f64,
    pub east_bound_longitude: f64,
    pub south_bound_latitude: f64,
    pub north_bound_latitude: f64,
    pub mean_longitude: f64,
    pub mean_latitude: f64,
}

impl From<metadatatype::Geographic> for Geographic {
fn from(geo: metadatatype::Geographic) -> Self {
Self {
west_bound_longitude: geo.west_bound_longitude,
east_bound_longitude: geo.east_bound_longitude,
south_bound_latitude: geo.south_bound_latitude,
north_bound_latitude: geo.north_bound_latitude,
mean_longitude: geo.mean_longitude,
mean_latitude: geo.mean_latitude,
}
}
}

/// Temporal extent; a bound is `None` when it is absent or its raw
/// RFC 3339 string failed to parse.
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Temporal {
    pub min_date_time: Option<DateTime<Utc>>,
    pub max_date_time: Option<DateTime<Utc>>,
}

impl From<metadatatype::Temporal> for Temporal {
    /// Parse the raw temporal extent's RFC 3339 timestamps into UTC.
    ///
    /// A timestamp that fails to parse becomes `None` rather than an error.
    fn from(temp: metadatatype::Temporal) -> Self {
        // Shared parse helper: RFC 3339 string -> Option<DateTime<Utc>>.
        let parse = |s: &str| {
            DateTime::parse_from_rfc3339(s)
                .ok()
                .map(|dt| dt.to_utc())
        };

        Self {
            min_date_time: parse(&temp.min_date_time),
            // BUG FIX: this previously parsed `temp.min_date_time` again
            // (copy-paste error), so `max_date_time` always mirrored the
            // minimum instead of holding the actual upper bound.
            max_date_time: parse(&temp.max_date_time),
        }
    }
}

/// Elevation extent of a dataset.
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Elevation {
    /// Name of the elevation parameter as given in the raw metadata.
    pub name: String,
    /// Measurement unit, if known.
    pub unit: Option<String>,
    pub min: f64,
    pub max: f64,
}

impl From<metadatatype::Elevation> for Elevation {
fn from(el: metadatatype::Elevation) -> Self {
Self {
name: el.name,
unit: el.unit,
min: el.min,
max: el.max,
}
}
}
31 changes: 31 additions & 0 deletions src/dataset/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
pub mod datasettype;

use elasticsearch::Elasticsearch;
use serde_json::Value;

use self::datasettype::Dataset;
use crate::{metadata::metadatatype::MetaDataType, prelude::*};

impl Dataset {
/// Get metadata over a single dataset with the id `dataset_id`
pub async fn new(dataset_id: u32, client: &Elasticsearch) -> Result<Dataset> {
MetaDataType::new(dataset_id, client)
.await
.map(|md| md.into())
}

/// Search for datasets in the elasticsearch index.
///
/// Note: `from` + `size` must be less than or equal to 10000
pub async fn search(
from: i64,
size: i64,
body: Option<Value>,
sort: &[&str],
client: &Elasticsearch,
) -> Result<Vec<Result<Dataset>>> {
MetaDataType::search(from, size, body, sort, client)
.await
.map(|vec| vec.into_iter().map(|res| res.map(|md| md.into())).collect())
}
}

0 comments on commit 9ec1e4f

Please sign in to comment.