In [2]:
:dep combee
:dep serde

In [10]:
use combee::{read_parquet, dataframe::DataFrame};
use serde::{Serialize, Deserialize}

In [4]:
use combee::functions::{all, mean, count};

In [5]:
/// One row of the anime dataset; used as the row type when reading
/// `datasets/anime.parquet`.
///
/// NOTE(review): the serde derives are presumably what combee uses to map
/// Parquet columns onto these fields — confirm against the combee docs before
/// renaming or reordering anything here.
#[derive(Clone, Serialize, Deserialize, Debug)]
struct Anime {
    /// Numeric identifier of the title.
    anime_id: u64,
    /// Title of the anime.
    name: String,
    /// Genre labels attached to the title (e.g. "Drama", "Romance").
    genre: Vec<String>,
    /// Release format such as "TV", "Movie", "OVA", "ONA", "Special" or "Music".
    /// Written as a raw identifier because `type` is a Rust keyword.
    /// Some rows in the dataset carry an empty string here.
    r#type: String,
    /// Number of episodes.
    episodes: usize,
    /// Rating score; presumably an average of user ratings — TODO confirm scale.
    rating: f64,
    /// Presumably the number of community members tracking the title — TODO confirm.
    members: usize
}

// Load `datasets/anime.parquet` with `Anime` as the row type.
// Nothing below can run without the data, so a failure aborts the cell; `expect`
// names the file so the panic message says what could not be loaded.
let df = read_parquet::<Anime>("datasets/anime.parquet".to_string())
    .expect("failed to read datasets/anime.parquet");

println!("Count: {}", df.len());
// Trailing expression (no semicolon) so the notebook displays the first five rows.
df.head(5)

Count: 12294


Anime { anime_id: 32281, name: "Kimi no Na wa.", genre: ["Drama", "Romance", "School", "Supernatural"], type: "Movie", episodes: 1, rating: 9.37, members: 200630 }
Anime { anime_id: 5114, name: "Fullmetal Alchemist: Brotherhood", genre: ["Action", "Adventure", "Drama", "Fantasy", "Magic", "Military", "Shounen"], type: "TV", episodes: 64, rating: 9.26, members: 793665 }
Anime { anime_id: 28977, name: "Gintama°", genre: ["Action", "Comedy", "Historical", "Parody", "Samurai", "Sci-Fi", "Shounen"], type: "TV", episodes: 51, rating: 9.25, members: 114262 }
Anime { anime_id: 9253, name: "Steins;Gate", genre: ["Sci-Fi", "Thriller"], type: "TV", episodes: 24, rating: 9.17, members: 673572 }
Anime { anime_id: 9969, name: "Gintama&#039;", genre: ["Action", "Comedy", "Historical", "Parody", "Samurai", "Sci-Fi", "Shounen"], type: "TV", episodes: 51, rating: 9.16, members: 151266 }


In [6]:
// Whole-table summary: total number of rows and the mean rating.
// Grouping by `all` yields a single result row, so `head(1)` shows everything.
df.groupby(all)
    .agg(|_key, rows| {
        let total = count(rows);
        let avg_rating = mean(rows, |anime| anime.rating);
        (total, avg_rating)
    })
    .head(1)

(12294, 6.35278591182692)


In [7]:
// Per-type summary: (type, number of titles, mean rating) for every distinct type.
df.groupby(|anime| anime.r#type.clone())
    .agg(|kind, rows| {
        let label = kind.clone();
        let total = count(rows);
        let avg_rating = mean(rows, |anime| anime.rating);
        (label, total, avg_rating)
    })
    .head(50)

("ONA", 659, 5.583353566009107)
("Music", 488, 5.588995901639343)
("Movie", 2348, 6.180826235093689)
("Special", 1676, 6.504039379474938)
("OVA", 3311, 6.3251585623678555)
("", 25, 0.0)
("TV", 3787, 6.690874042777925)


In [11]:
// Removing anime without a type (rows whose `type` is an empty string),
// then recomputing the per-type count and mean rating on what is left.
let df_filtered: DataFrame<Anime> = df.filter(|x| !x.r#type.is_empty());
println!("Count: {}", df_filtered.len());

df_filtered
    .groupby(|x| x.r#type.clone())
    .agg(|r#type, g| (r#type.clone(), count(g), mean(g, |x| x.rating)))
    .head(50)

Count: 12269


("TV", 3787, 6.690874042777925)
("Music", 488, 5.588995901639343)
("OVA", 3311, 6.3251585623678555)
("ONA", 659, 5.583353566009107)
("Movie", 2348, 6.180826235093689)
("Special", 1676, 6.504039379474938)


In [13]:
// Saving result to parquet
df_filtered.to_parquet("datasets/anime_filtered.parquet".to_string())

Ok(())