In [0]:
# activate R magic
%load_ext rpy2.ipython

# **Split-Apply-Combine** - plyr

In [0]:
%%R
# Packages used throughout this notebook, declared in one place.
required_pkgs <- c("plyr", "dplyr", "gapminder", "ggplot2")

# Install only the packages that are not available yet, so re-running the
# notebook does not download and reinstall everything each time.
is_installed <- vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# Attach plyr BEFORE dplyr: both export functions with the same names
# (summarize, mutate, arrange, ...), and the package attached last wins.
library(plyr)
library(dplyr)
library(gapminder)
library(ggplot2)  # provides the mpg dataset used further down
# gapminder is a dataset about countries: per-capita income, population,
# life expectancy.
# https://www.gapminder.org

# **ddply()** - {plyr}
## Split data frame, apply function and return results in a data frame.

In [0]:
%%R
# Split gapminder by continent, apply a summary to each piece and combine the
# pieces into a single data frame (documentation: help("ddply", package = "plyr")).
# plyr::summarize is spelled out because dplyr exports a function with the
# same name and masks plyr's version when it is attached after plyr.
df <- ddply(gapminder,              # input dataset
            ~ continent,            # variable to group by
            plyr::summarize,        # build one summary row per group
            max_le = max(lifeExp))  # maximum life expectancy in the group

In [5]:
%%R
# Compact structure of the summary: column names, types and first values.
str(df)

'data.frame':	5 obs. of  2 variables:
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 1 2 3 4 5
 $ max_le   : num  76.4 80.7 82.6 81.8 81.2


In [6]:
%%R
# Same overview via glimpse() (made available by dplyr): one line per column.
glimpse(df)

Rows: 5
Columns: 2
$ continent <fct> Africa, Americas, Asia, Europe, Oceania
$ max_le    <dbl> 76.442, 80.653, 82.603, 81.757, 81.235


In [7]:
%%R
# First rows of the summary (head() prints at most six rows by default).
head(df)

  continent max_le
1    Africa 76.442
2  Americas 80.653
3      Asia 82.603
4    Europe 81.757
5   Oceania 81.235


In [8]:
%%R
levels(df$continent) # there are five continents

[1] "Africa"   "Americas" "Asia"     "Europe"   "Oceania" 


# **length(unique(country))**


In [9]:
%%R
# Count the distinct countries in each continent.
# plyr::summarize is spelled out because dplyr exports a function with the
# same name and masks plyr's version when it is attached after plyr.
ddply(gapminder,                                   # gapminder dataset
      ~ continent,                                 # group by continent
      plyr::summarize,                             # build one summary row per group
      n_uniq_countries = length(unique(country)))  # number of distinct countries in the group

  continent n_uniq_countries
1    Africa               52
2  Americas               25
3      Asia               33
4    Europe               30
5   Oceania                2


In [10]:
%%R
# Same count as with summarize, but using a hand-written function: ddply()
# calls it once per continent with that continent's rows, and the named
# vector it returns becomes one row of the result.
ddply(
  .data = gapminder,
  .variables = ~ continent,
  .fun = function(piece) {
    c(n_uniq_countries = length(unique(piece$country)))
  }
)

  continent n_uniq_countries
1    Africa               52
2  Americas               25
3      Asia               33
4    Europe               30
5   Oceania                2


In [11]:
%%R
# Several summaries per continent in a single call.
# Note that min/max describe lifeExp while median describes gdpPercap.
# plyr::summarize is spelled out because dplyr exports a function with the
# same name and masks plyr's version when it is attached after plyr.
ddply(gapminder,                   # dataset
      ~ continent,                 # group by continent
      plyr::summarize,             # build one summary row per group
      min = min(lifeExp),          # minimum life expectancy per continent
      max = max(lifeExp),          # maximum life expectancy per continent
      median = median(gdpPercap))  # median GDP per capita per continent

  continent    min    max    median
1    Africa 23.599 76.442  1192.138
2  Americas 37.579 80.653  5465.510
3      Asia 28.801 82.603  2646.787
4    Europe 43.585 81.757 12081.749
5   Oceania 69.120 81.235 17983.304


In [21]:
%%R
# Using a dataset that ships with ggplot2
library(ggplot2)
str(mpg) # the mpg dataset comes bundled with ggplot2

tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
 $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
 $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
 $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
 $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
 $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
 $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
 $ drv         : chr [1:234] "f" "f" "f" "f" ...
 $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
 $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
 $ fl          : chr [1:234] "p" "p" "p" "p" ...
 $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...


In [18]:
%%R
# Column-wise overview of mpg: one line per column with its type and first values.
glimpse(mpg)

Rows: 234
Columns: 11
$ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", …
$ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", …
$ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2…
$ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 20…
$ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8,…
$ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "aut…
$ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "…
$ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, …
$ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, …
$ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "…
$ class        <chr> "compact", "compact", "compact", "compact", "compact", "…


In [14]:
%%R
# Work with a subset of mpg: keep every row, but only the four columns used
# in the summaries below. Columns are selected by name instead of by position
# (previously 1 and 7:9), so the selection is self-describing and does not
# silently change if the column order of mpg ever differs.
data <- mpg[, c("manufacturer", "drv", "cty", "hwy")]
str(data)


tibble [234 × 4] (S3: tbl_df/tbl/data.frame)
 $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
 $ drv         : chr [1:234] "f" "f" "f" "f" ...
 $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
 $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...


In [19]:
%%R
  # Column-wise overview of the four-column subset built from mpg.
  glimpse(data)

Rows: 234
Columns: 4
$ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", …
$ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "…
$ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, …
$ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, …


In [15]:
%%R
# Summarizing and aggregating data.
# .(manufacturer) uses plyr's .() helper, which quotes the variable names so
# they can be written without quotation marks; it is an alternative to the
# formula notation (~ manufacturer).
# plyr::summarize is spelled out because dplyr exports a function with the
# same name and masks plyr's version when it is attached after plyr.
ddply(data,
      .(manufacturer),     # group by manufacturer
      plyr::summarize,     # build one summary row per group
      avgcty = mean(cty))  # mean of cty within the group

   manufacturer   avgcty
1          audi 17.61111
2     chevrolet 15.00000
3         dodge 13.13514
4          ford 14.00000
5         honda 24.44444
6       hyundai 18.64286
7          jeep 13.50000
8    land rover 11.50000
9       lincoln 11.33333
10      mercury 13.25000
11       nissan 18.07692
12      pontiac 17.00000
13       subaru 19.28571
14       toyota 18.52941
15   volkswagen 20.92593


In [16]:
%%R
# Several summary functions in a single call.
# plyr::summarize is spelled out because dplyr exports a function with the
# same name and masks plyr's version when it is attached after plyr.
ddply(data,                # the mpg subset
      .(manufacturer),     # group by manufacturer
      plyr::summarize,     # build one summary row per group
      avgcty = mean(cty),  # mean of cty
      sdcty = sd(cty),     # standard deviation of cty
      maxhwy = max(hwy))   # maximum of hwy

   manufacturer   avgcty     sdcty maxhwy
1          audi 17.61111 1.9745108     31
2     chevrolet 15.00000 2.9249881     30
3         dodge 13.13514 2.4850907     24
4          ford 14.00000 1.9148542     26
5         honda 24.44444 1.9436506     36
6       hyundai 18.64286 1.4990840     31
7          jeep 13.50000 2.5071327     22
8    land rover 11.50000 0.5773503     18
9       lincoln 11.33333 0.5773503     18
10      mercury 13.25000 0.5000000     19
11       nissan 18.07692 3.4268921     32
12      pontiac 17.00000 1.0000000     28
13       subaru 19.28571 0.9138735     27
14       toyota 18.52941 4.0469614     37
15   volkswagen 20.92593 4.5567020     44


In [17]:
%%R
# Summarizing by a combination of variables: one output row for every
# (manufacturer, drv) pair that occurs in the data.
# plyr::summarize is spelled out because dplyr exports a function with the
# same name and masks plyr's version when it is attached after plyr.
ddply(data,
      .(manufacturer, drv),  # two grouping variables
      plyr::summarize,       # build one summary row per group
      avgcty = mean(cty),    # mean of cty
      sdcty = sd(cty),       # standard deviation of cty
      maxhwy = max(hwy))     # maximum of hwy

   manufacturer drv   avgcty     sdcty maxhwy
1          audi   4 16.81818 1.6624188     28
2          audi   f 18.85714 1.8644545     31
3     chevrolet   4 12.50000 1.7320508     19
4     chevrolet   f 18.80000 1.9235384     30
5     chevrolet   r 14.10000 1.6633300     26
6         dodge   4 12.00000 1.7435596     19
7         dodge   f 15.81818 1.8340219     24
8          ford   4 13.30769 0.9473309     19
9          ford   r 14.75000 2.4167973     26
10        honda   f 24.44444 1.9436506     36
11      hyundai   f 18.64286 1.4990840     31
12         jeep   4 13.50000 2.5071327     22
13   land rover   4 11.50000 0.5773503     18
14      lincoln   r 11.33333 0.5773503     18
15      mercury   4 13.25000 0.5000000     19
16       nissan   4 13.75000 1.2583057     20
17       nissan   f 20.00000 1.8708287     32
18      pontiac   f 17.00000 1.0000000     28
19       subaru   4 19.28571 0.9138735     27
20       toyota   4 14.93333 1.4375906     22
21       toyota   f 21.36842 3.022

In [25]:
%%R
# Build a small example data frame from three parallel vectors
# (one element per person).
nome     <- c("Didi", "Dedé", "Mussum", "Zacarias")  # names
ano.nasc <- c(1936, 1936, 1941, 1934)                # birth years
vive     <- c("V", "V", "F", "F")                    # presumably an alive flag (V/F) - TODO confirm

# Column names are given explicitly; they match the vector names.
trapalhoes <- data.frame(nome = nome, ano.nasc = ano.nasc, vive = vive)

# Display the resulting table.
trapalhoes


      nome ano.nasc vive
1     Didi     1936    V
2     Dedé     1936    V
3   Mussum     1941    F
4 Zacarias     1934    F
