Skip to content

Commit

Permalink
fix: Drop single events if corrupted instead of failing whole import
Browse files Browse the repository at this point in the history
  • Loading branch information
johan-bjareholt committed Oct 29, 2020
1 parent 7d55fca commit 96cbc9d
Show file tree
Hide file tree
Showing 11 changed files with 354 additions and 122 deletions.
174 changes: 89 additions & 85 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion aw-datastore/src/datastore.rs
Expand Up @@ -373,7 +373,7 @@ impl DatastoreInstance {
self.buckets_cache.insert(bucket.id.clone(), bucket.clone());
// Insert events
if let Some(events) = events {
self.insert_events(conn, &bucket.id, events)?;
self.insert_events(conn, &bucket.id, events.take_inner())?;
bucket.events = None;
}
Ok(())
Expand Down
55 changes: 30 additions & 25 deletions aw-datastore/src/legacy_import.rs
Expand Up @@ -103,30 +103,7 @@ mod import {
let timestamp_str: String = row.get(0)?;
let duration_float: f64 = row.get(1)?;
let data_str: String = row.get(2)?;

let timestamp_str = timestamp_str.replace(" ", "T");
let timestamp = match DateTime::parse_from_rfc3339(&timestamp_str) {
Ok(timestamp) => timestamp.with_timezone(&Utc),
Err(err) => panic!("Timestamp string {}: {:?}", timestamp_str, err),
};

let duration_ns = (duration_float * 1_000_000_000.0) as i64;

let data: serde_json::map::Map<String, serde_json::Value> =
match serde_json::from_str(&data_str) {
Ok(data) => data,
Err(err) => panic!(
"Unable to parse JSON data in event from bucket {}\n{}\n{}",
bucket_id, err, data_str
),
};

Ok(Event {
id: None,
timestamp,
duration: Duration::nanoseconds(duration_ns),
data,
})
Ok((timestamp_str, duration_float, data_str))
}) {
Ok(rows) => rows,
Err(err) => {
Expand All @@ -139,7 +116,35 @@ mod import {
let mut list = Vec::new();
for row in rows {
match row {
Ok(event) => list.push(event),
Ok((timestamp_str, duration_float, data_str)) => {
let timestamp_str = timestamp_str.replace(" ", "T");
let timestamp = match DateTime::parse_from_rfc3339(&timestamp_str) {
Ok(timestamp) => timestamp.with_timezone(&Utc),
Err(err) => panic!("Timestamp string {}: {:?}", timestamp_str, err),
};

let duration_ns = (duration_float * 1_000_000_000.0) as i64;

let data: serde_json::map::Map<String, serde_json::Value> =
match serde_json::from_str(&data_str) {
Ok(data) => data,
Err(err) => {
warn!(
"Unable to parse JSON data in event from bucket {}\n{}\n{}",
bucket_id, err, data_str
);
continue;
}
};

let event = Event {
id: None,
timestamp,
duration: Duration::nanoseconds(duration_ns),
data,
};
list.push(event)
}
Err(err) => panic!("Corrupt event in bucket {}: {}", bucket_id, err),
};
}
Expand Down
8 changes: 8 additions & 0 deletions aw-models/examples/schema.rs
@@ -0,0 +1,8 @@
extern crate aw_models;

use schemars::schema_for;

fn main() {
let schema = schema_for!(aw_models::Bucket);
println!("{}", serde_json::to_string_pretty(&schema).unwrap());
}
6 changes: 5 additions & 1 deletion aw-models/src/bucket.rs
Expand Up @@ -7,6 +7,7 @@ use serde_json::value::Value;
use std::collections::HashMap;

use crate::Event;
use crate::TryVec;

#[derive(Serialize, Deserialize, JsonSchema, Clone, Debug)]
pub struct Bucket {
Expand All @@ -22,7 +23,10 @@ pub struct Bucket {
pub data: Map<String, Value>,
#[serde(default, skip_deserializing)]
pub metadata: BucketMetadata,
pub events: Option<Vec<Event>>, /* Should only be set during import/export */
// Events should only be "Some" during import/export
// It's using a TryVec to discard only the events which were failed to be serialized so only a
// few events are being dropped during import instead of failing the whole import
pub events: Option<TryVec<Event>>,
pub last_updated: Option<DateTime<Utc>>, // TODO: Should probably be moved into metadata field
}

Expand Down
4 changes: 4 additions & 0 deletions aw-models/src/export.rs
@@ -0,0 +1,4 @@
#[derive(Serialize, Deserialize, JsonSchema, Clone)]
pub struct BucketsExport {
pub buckets: HashMap<String, Bucket>,
}
2 changes: 2 additions & 0 deletions aw-models/src/lib.rs
Expand Up @@ -24,6 +24,7 @@ mod info;
mod key_value;
mod query;
mod timeinterval;
mod tryvec;

pub use self::bucket::Bucket;
pub use self::bucket::BucketMetadata;
Expand All @@ -34,3 +35,4 @@ pub use self::key_value::Key;
pub use self::key_value::KeyValue;
pub use self::query::Query;
pub use self::timeinterval::TimeInterval;
pub use self::tryvec::TryVec;
202 changes: 202 additions & 0 deletions aw-models/src/tryvec.rs
@@ -0,0 +1,202 @@
use schemars::JsonSchema;
use serde::de::{DeserializeOwned, SeqAccess, Visitor};
use serde::export::PhantomData;
use serde::ser::SerializeSeq;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_json::Value;
use std::fmt;
use std::fmt::Debug;

#[derive(Debug, Clone, JsonSchema)]
#[serde(untagged)]
// TODO: JsonSchema is invalid, we should only allow "Parsed" value as the
// others will be dropped
pub enum TryParse<T> {
Parsed(T),
Unparsed(Value),
NotPresent,
}

impl<'de, T: DeserializeOwned> Deserialize<'de> for TryParse<T> {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
match Option::<Value>::deserialize(deserializer)? {
None => Ok(TryParse::NotPresent),
Some(value) => match T::deserialize(&value) {
Ok(t) => Ok(TryParse::Parsed(t)),
Err(_) => Ok(TryParse::Unparsed(value)),
},
}
}
}

#[derive(Debug, Clone, JsonSchema)]
#[serde(transparent)]
pub struct TryVec<T> {
inner: Vec<TryParse<T>>,
}

impl<T> TryVec<T> {
pub fn new(mut vec: Vec<T>) -> Self {
let mut vec_marked: Vec<TryParse<T>> = Vec::new();
for item in vec.drain(..) {
vec_marked.push(TryParse::Parsed(item));
}
TryVec { inner: vec_marked }
}

pub fn new_empty() -> Self {
TryVec { inner: Vec::new() }
}

pub fn take_inner(self) -> Vec<T> {
let mut vec: Vec<T> = Vec::new();
for item in self.inner {
match item {
TryParse::Parsed(i) => vec.push(i),
_ => continue,
};
}
return vec;
}
}

impl<T> Serialize for TryVec<T>
where
T: Serialize,
{
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let mut seq = serializer.serialize_seq(Some(self.inner.len()))?;
for element in &self.inner {
match element {
TryParse::Parsed(t) => seq.serialize_element(t)?,
_ => continue,
};
}
seq.end()
}
}

struct TryVecVisitor<T> {
marker: PhantomData<fn() -> TryVec<T>>,
}

impl<T> TryVecVisitor<T> {
fn new() -> Self {
TryVecVisitor {
marker: PhantomData,
}
}
}

impl<'de, T> Visitor<'de> for TryVecVisitor<T>
where
T: DeserializeOwned,
{
type Value = TryVec<T>;

// Format a message stating what data this Visitor expects to receive.
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a seqence")
}

fn visit_seq<M>(self, mut access: M) -> Result<Self::Value, M::Error>
where
M: SeqAccess<'de>,
{
let mut vec = Vec::new();

loop {
let res = match access.next_element() {
Ok(val) => val,
Err(err) => {
println!(
"Failed to parse event because '{}', the event will be discarded",
err
);
continue;
}
};
match res {
Some(item) => vec.push(item),
None => break,
};
}

Ok(TryVec { inner: vec })
}
}

impl<'de, T> Deserialize<'de> for TryVec<T>
where
T: DeserializeOwned,
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
deserializer.deserialize_seq(TryVecVisitor::new())
}
}

#[cfg(test)]
mod test {
use serde::{Deserialize, Serialize};

use super::TryVec;

#[derive(Deserialize, Serialize, Debug)]
struct TestEvent {
data: String,
}

fn assert_serialized_deserialized_eq(data: &str, eq: &str) {
let deserialized = serde_json::from_str::<TryVec<TestEvent>>(data).unwrap();
let serialized = serde_json::to_string(&deserialized).unwrap();
assert_eq!(serialized, eq);
}

#[test]
fn test_serialize_deserialize() {
println!("test empty array");
assert_serialized_deserialized_eq(r#"[]"#, r#"[]"#);

println!("test one valid event");
assert_serialized_deserialized_eq(r#"[{"data":"test"}]"#, r#"[{"data":"test"}]"#);

println!("test invalid type int, skip event");
assert_serialized_deserialized_eq(r#"[{ "data": 1 }]"#, r#"[]"#);

println!("test invalid type dict, skip event");
assert_serialized_deserialized_eq(r#"[{"data":{}}]"#, r#"[]"#);

println!("test invalid type arr, skip event");
assert_serialized_deserialized_eq(r#"[{"data":[]}]"#, r#"[]"#);

println!("test multiple valid events");
assert_serialized_deserialized_eq(
r#"[{"data":"test"},{"data":"test2"},{"data":"test3"}]"#,
r#"[{"data":"test"},{"data":"test2"},{"data":"test3"}]"#,
);

println!("test invalid event in middle of sequence, skip one event");
assert_serialized_deserialized_eq(
r#"[{"data":"test"},{"data":2},{"data":"test3"}]"#,
r#"[{"data":"test"},{"data":"test3"}]"#,
);

println!("test utf-16 character");
assert_serialized_deserialized_eq(r#"[{"data":"\ud835\udc47"}]"#, r#"[{"data":"𝑇"}]"#);

println!("test invalid utf-8/16, skip event");
assert_serialized_deserialized_eq(r#"[{"data":"\ud835"}]"#, r#"[]"#);
}

#[test]
fn test_methods() {
let tryvec = TryVec::<TestEvent>::new_empty();
assert_eq!(tryvec.take_inner().len(), Vec::<TestEvent>::new().len());
}
}
11 changes: 6 additions & 5 deletions aw-server/src/endpoints/bucket.rs
Expand Up @@ -9,6 +9,7 @@ use chrono::Utc;
use aw_models::Bucket;
use aw_models::BucketsExport;
use aw_models::Event;
use aw_models::TryVec;

use rocket::http::Header;
use rocket::http::Status;
Expand Down Expand Up @@ -174,11 +175,11 @@ pub fn bucket_export(
Ok(bucket) => bucket,
Err(err) => return Err(err.into()),
};
bucket.events = Some(
datastore
.get_events(&bucket_id, None, None, None)
.expect("Failed to get events for bucket"),
);
/* TODO: Replace expect with http error */
let events = datastore
.get_events(&bucket_id, None, None, None)
.expect("Failed to get events for bucket");
bucket.events = Some(TryVec::new(events));
export.buckets.insert(bucket_id.clone(), bucket);
let filename = format!("aw-bucket-export_{}.json", bucket_id);

Expand Down
6 changes: 4 additions & 2 deletions aw-server/src/endpoints/export.rs
Expand Up @@ -7,6 +7,7 @@ use rocket::response::Response;
use rocket::State;

use aw_models::BucketsExport;
use aw_models::TryVec;

use crate::endpoints::{HttpErrorJson, ServerState};

Expand All @@ -21,10 +22,11 @@ pub fn buckets_export(state: State<ServerState>) -> Result<Response, HttpErrorJs
Err(err) => return Err(err.into()),
};
for (bid, mut bucket) in buckets.drain() {
bucket.events = Some(match datastore.get_events(&bid, None, None, None) {
let events = match datastore.get_events(&bid, None, None, None) {
Ok(events) => events,
Err(err) => return Err(err.into()),
});
};
bucket.events = Some(TryVec::new(events));
export.buckets.insert(bid, bucket);
}

Expand Down

0 comments on commit 96cbc9d

Please sign in to comment.