# `DataFrames.jl`

In [1]:
# julia version
versioninfo()

Julia Version 1.10.1
Commit 7790d6f0641 (2024-02-13 20:41 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: macOS (arm64-apple-darwin22.4.0)
  CPU: 8 × Apple M1
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, apple-m1)
Threads: 1 default, 0 interactive, 1 GC (on 4 virtual cores)
Environment:
  JULIA_NUM_THREADS = 


In [2]:
# loading packages
using Pkg
using CSV
using DataFrames
using Downloads
using Missings
using Random
using Statistics

In [3]:
# package versions
Pkg.status()

[36m[1mProject[22m[39m JuliaBasics v0.1.0
[32m[1mStatus[22m[39m `~/Documents/Projects/Github/JuliaBasics/Project.toml`
  [90m[336ed68f] [39mCSV v0.10.12
  [90m[a93c6f00] [39mDataFrames v1.6.1
  [90m[e1d29d7a] [39mMissings v1.1.0


## download data

In [4]:
# set directory
data_dir = "data";

In [5]:
# create directory
# mkpath is a no-op when the directory already exists and also creates any missing
# parent directories, so no separate isdir check is needed
mkpath(data_dir);

In [6]:
# set file name and path
file_name = "iris.csv"
file_path = joinpath(pwd(), data_dir, file_name);

In [7]:
# set file url
file_url = string("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/", file_name);

In [8]:
# download data
if !isfile(file_path)
    Downloads.download(file_url, file_path);
end

"/Users/rene/Documents/Projects/Github/JuliaBasics/data/iris.csv"

## read data

In [9]:
# read data
# CSV.read parses the file straight into the DataFrame sink without the extra column
# copy made by DataFrame(CSV.File(...))
iris = CSV.read(file_path, DataFrame; delim = ",");

## explore data

### `nrow`, `ncol` and `size`

In [10]:
# number of rows
nrow(iris)

150

In [11]:
# number of columns
ncol(iris)

5

In [12]:
# size
size(iris)

(150, 5)

### `names`

In [13]:
# column names
names(iris)

5-element Vector{String}:
 "sepal_length"
 "sepal_width"
 "petal_length"
 "petal_width"
 "species"

### `propertynames`

In [14]:
# column names
propertynames(iris)

5-element Vector{Symbol}:
 :sepal_length
 :sepal_width
 :petal_length
 :petal_width
 :species

### `eltype`

In [15]:
# element type
eltype(iris.species)

String15

In [16]:
# element types
eltype.(eachcol(iris))

5-element Vector{DataType}:
 Float64
 Float64
 Float64
 Float64
 String15

In [17]:
# dict of element types
Dict(names(iris) .=> eltype.(eachcol(iris)))

Dict{String, DataType} with 5 entries:
  "sepal_length" => Float64
  "petal_width"  => Float64
  "petal_length" => Float64
  "sepal_width"  => Float64
  "species"      => String15

### `unique`

In [18]:
# unique values
unique(iris.species)

3-element Vector{String15}:
 "setosa"
 "versicolor"
 "virginica"

### value counts

In [19]:
# value counts
combine(groupby(iris, :species), nrow)

Row,species,nrow
Unnamed: 0_level_1,String15,Int64
1,setosa,50
2,versicolor,50
3,virginica,50


### `first`, `last`

In [20]:
# first rows
first(iris, 5)

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [21]:
# last rows
last(iris, 3)

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,6.5,3.0,5.2,2.0,virginica
2,6.2,3.4,5.4,2.3,virginica
3,5.9,3.0,5.1,1.8,virginica


### `describe`

#### all columns

In [22]:
# all columns
describe(iris)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,sepal_length,5.84333,4.3,5.8,7.9,0,Float64
2,sepal_width,3.05733,2.0,3.0,4.4,0,Float64
3,petal_length,3.758,1.0,4.35,6.9,0,Float64
4,petal_width,1.19933,0.1,1.3,2.5,0,Float64
5,species,,setosa,,virginica,0,String15


#### numeric columns

In [23]:
# number
describe(iris, cols=names(iris, Number))

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Float64,Float64,Float64,Int64,DataType
1,sepal_length,5.84333,4.3,5.8,7.9,0,Float64
2,sepal_width,3.05733,2.0,3.0,4.4,0,Float64
3,petal_length,3.758,1.0,4.35,6.9,0,Float64
4,petal_width,1.19933,0.1,1.3,2.5,0,Float64


In [24]:
# statistics
describe(iris, cols=names(iris, Number), :min, :q25, :median, :q75, :max, :mean, :std)

Row,variable,min,q25,median,q75,max,mean,std
Unnamed: 0_level_1,Symbol,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,sepal_length,4.3,5.1,5.8,6.4,7.9,5.84333,0.828066
2,sepal_width,2.0,2.8,3.0,3.3,4.4,3.05733,0.435866
3,petal_length,1.0,1.6,4.35,5.1,6.9,3.758,1.7653
4,petal_width,0.1,0.3,1.3,1.8,2.5,1.19933,0.762238


In [25]:
# statistics, adding an anonymous function
describe(iris, cols=names(iris, Number), :min, :max, (x -> maximum(x) - minimum(x)) => :range)

Row,variable,min,max,range
Unnamed: 0_level_1,Symbol,Float64,Float64,Float64
1,sepal_length,4.3,7.9,3.6
2,sepal_width,2.0,4.4,2.4
3,petal_length,1.0,6.9,5.9
4,petal_width,0.1,2.5,2.4


In [26]:
# statistics, adding a function

"""
    my_function(x)

Return the spread of `x`: the difference between its largest and its smallest element.
"""
function my_function(x)
    # extrema finds both bounds in a single pass over x
    lo, hi = extrema(x)
    return hi - lo
end

describe(iris, cols=names(iris, Number), :min, :max, my_function => :range)

Row,variable,min,max,range
Unnamed: 0_level_1,Symbol,Float64,Float64,Float64
1,sepal_length,4.3,7.9,3.6
2,sepal_width,2.0,4.4,2.4
3,petal_length,1.0,6.9,5.9
4,petal_width,0.1,2.5,2.4


#### alphanumeric columns

In [27]:
# abstractstring
describe(iris, cols=names(iris, AbstractString))

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Nothing,String15,Nothing,String15,Int64,DataType
1,species,,setosa,,virginica,0,String15


In [28]:
# statistics
describe(iris, cols=names(iris, AbstractString), :first, :last, :nunique, :nmissing, :nnonmissing)

Row,variable,first,last,nunique,nmissing,nnonmissing
Unnamed: 0_level_1,Symbol,String15,String15,Int64,Int64,Int64
1,species,setosa,virginica,3,0,150


In [29]:
# statistics, adding anonymous functions
describe(
    iris, 
    cols=names(iris, AbstractString), 
    (x -> minimum(length.(x))) => :min_length, 
    (x -> maximum(length.(x))) => :max_length
)

Row,variable,min_length,max_length
Unnamed: 0_level_1,Symbol,Int64,Int64
1,species,6,10


#### selected columns

In [30]:
# range
describe(iris, cols=1:3)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Float64,Float64,Float64,Int64,DataType
1,sepal_length,5.84333,4.3,5.8,7.9,0,Float64
2,sepal_width,3.05733,2.0,3.0,4.4,0,Float64
3,petal_length,3.758,1.0,4.35,6.9,0,Float64


In [31]:
# index
describe(iris, cols=[1, 2, 5])

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,sepal_length,5.84333,4.3,5.8,7.9,0,Float64
2,sepal_width,3.05733,2.0,3.0,4.4,0,Float64
3,species,,setosa,,virginica,0,String15


In [32]:
# propertynames
describe(iris, cols=[:petal_length, :petal_width])

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Float64,Float64,Float64,Int64,DataType
1,petal_length,3.758,1.0,4.35,6.9,0,Float64
2,petal_width,1.19933,0.1,1.3,2.5,0,Float64


In [33]:
# regex
describe(iris, cols=r"width")

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Float64,Float64,Float64,Int64,DataType
1,sepal_width,3.05733,2.0,3.0,4.4,0,Float64
2,petal_width,1.19933,0.1,1.3,2.5,0,Float64


#### selected values

In [34]:
# setosa
describe(iris[iris.species .== "setosa", :])

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,sepal_length,5.006,4.3,5.0,5.8,0,Float64
2,sepal_width,3.428,2.3,3.4,4.4,0,Float64
3,petal_length,1.462,1.0,1.5,1.9,0,Float64
4,petal_width,0.246,0.1,0.2,0.6,0,Float64
5,species,,setosa,,setosa,0,String15


## Subsetting data

### rows

In [35]:
# describe
describe(iris)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,sepal_length,5.84333,4.3,5.8,7.9,0,Float64
2,sepal_width,3.05733,2.0,3.0,4.4,0,Float64
3,petal_length,3.758,1.0,4.35,6.9,0,Float64
4,petal_width,1.19933,0.1,1.3,2.5,0,Float64
5,species,,setosa,,virginica,0,String15


In [36]:
# select rows
iris[5:10, :]

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,5.0,3.6,1.4,0.2,setosa
2,5.4,3.9,1.7,0.4,setosa
3,4.6,3.4,1.4,0.3,setosa
4,5.0,3.4,1.5,0.2,setosa
5,4.4,2.9,1.4,0.2,setosa
6,4.9,3.1,1.5,0.1,setosa


In [37]:
# select rows
iris[end-52:end-48, :]

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,6.2,2.9,4.3,1.3,versicolor
2,5.1,2.5,3.0,1.1,versicolor
3,5.7,2.8,4.1,1.3,versicolor
4,6.3,3.3,6.0,2.5,virginica
5,5.8,2.7,5.1,1.9,virginica


In [38]:
# bitvector
iris.sepal_length .< 4.8

150-element BitVector:
 0
 0
 1
 1
 0
 0
 1
 0
 1
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [39]:
# subset
iris[iris.sepal_length .< 4.8, :]

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,4.7,3.2,1.3,0.2,setosa
2,4.6,3.1,1.5,0.2,setosa
3,4.6,3.4,1.4,0.3,setosa
4,4.4,2.9,1.4,0.2,setosa
5,4.3,3.0,1.1,0.1,setosa
6,4.6,3.6,1.0,0.2,setosa
7,4.7,3.2,1.6,0.2,setosa
8,4.4,3.0,1.3,0.2,setosa
9,4.5,2.3,1.3,0.3,setosa
10,4.4,3.2,1.3,0.2,setosa


In [40]:
# subset
iris[(iris.sepal_length .> 4.5) .&& (iris.sepal_length .< 4.8), :]

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,4.7,3.2,1.3,0.2,setosa
2,4.6,3.1,1.5,0.2,setosa
3,4.6,3.4,1.4,0.3,setosa
4,4.6,3.6,1.0,0.2,setosa
5,4.7,3.2,1.6,0.2,setosa
6,4.6,3.2,1.4,0.2,setosa


In [41]:
# bitwise and
iris[.&(iris.sepal_length .> 4.5, iris.sepal_length .< 4.8), :]

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,4.7,3.2,1.3,0.2,setosa
2,4.6,3.1,1.5,0.2,setosa
3,4.6,3.4,1.4,0.3,setosa
4,4.6,3.6,1.0,0.2,setosa
5,4.7,3.2,1.6,0.2,setosa
6,4.6,3.2,1.4,0.2,setosa


In [42]:
# subset
iris[(iris.sepal_length .< 4.5 ) .|| (iris.sepal_length .> 7.5), :]

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,4.4,2.9,1.4,0.2,setosa
2,4.3,3.0,1.1,0.1,setosa
3,4.4,3.0,1.3,0.2,setosa
4,4.4,3.2,1.3,0.2,setosa
5,7.6,3.0,6.6,2.1,virginica
6,7.7,3.8,6.7,2.2,virginica
7,7.7,2.6,6.9,2.3,virginica
8,7.7,2.8,6.7,2.0,virginica
9,7.9,3.8,6.4,2.0,virginica
10,7.7,3.0,6.1,2.3,virginica


In [43]:
# bitwise or
iris[.|(iris.sepal_length .< 4.5, iris.sepal_length .> 7.5), :]

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,4.4,2.9,1.4,0.2,setosa
2,4.3,3.0,1.1,0.1,setosa
3,4.4,3.0,1.3,0.2,setosa
4,4.4,3.2,1.3,0.2,setosa
5,7.6,3.0,6.6,2.1,virginica
6,7.7,3.8,6.7,2.2,virginica
7,7.7,2.6,6.9,2.3,virginica
8,7.7,2.8,6.7,2.0,virginica
9,7.9,3.8,6.4,2.0,virginica
10,7.7,3.0,6.1,2.3,virginica


In [44]:
# filter
filter(row -> row.sepal_length > 7.5, iris)

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,7.6,3.0,6.6,2.1,virginica
2,7.7,3.8,6.7,2.2,virginica
3,7.7,2.6,6.9,2.3,virginica
4,7.7,2.8,6.7,2.0,virginica
5,7.9,3.8,6.4,2.0,virginica
6,7.7,3.0,6.1,2.3,virginica


In [45]:
# filter (view=true returns a SubDataFrame view instead of a copy)
filter(row -> row.sepal_length > 7.5, iris, view=true)

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,7.6,3.0,6.6,2.1,virginica
2,7.7,3.8,6.7,2.2,virginica
3,7.7,2.6,6.9,2.3,virginica
4,7.7,2.8,6.7,2.0,virginica
5,7.9,3.8,6.4,2.0,virginica
6,7.7,3.0,6.1,2.3,virginica


In [46]:
# filter on two conditions (view=true returns a SubDataFrame view instead of a copy)
filter(row -> row.sepal_length > 6 && row.sepal_width < 3, iris, view=true)

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,6.5,2.8,4.6,1.5,versicolor
2,6.6,2.9,4.6,1.3,versicolor
3,6.1,2.9,4.7,1.4,versicolor
4,6.2,2.2,4.5,1.5,versicolor
5,6.1,2.8,4.0,1.3,versicolor
6,6.3,2.5,4.9,1.5,versicolor
7,6.1,2.8,4.7,1.2,versicolor
8,6.4,2.9,4.3,1.3,versicolor
9,6.8,2.8,4.8,1.4,versicolor
10,6.3,2.3,4.4,1.3,versicolor


In [47]:
# filter on three conditions (view=true returns a SubDataFrame view instead of a copy)
filter(row -> row.sepal_length > 6 && row.sepal_width < 3 && row.species == "virginica", iris, view=true)

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,6.3,2.9,5.6,1.8,virginica
2,7.3,2.9,6.3,1.8,virginica
3,6.7,2.5,5.8,1.8,virginica
4,6.4,2.7,5.3,1.9,virginica
5,7.7,2.6,6.9,2.3,virginica
6,7.7,2.8,6.7,2.0,virginica
7,6.3,2.7,4.9,1.8,virginica
8,6.2,2.8,4.8,1.8,virginica
9,6.4,2.8,5.6,2.1,virginica
10,7.4,2.8,6.1,1.9,virginica


In [48]:
# in
in.(iris.species, Ref(["virginica", "setosa"]))

150-element BitVector:
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 1
 1
 1
 1
 1
 1
 1
 1
 1

In [49]:
# in
iris[in.(iris.species, Ref(["virginica", "setosa"])), :]

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa
6,5.4,3.9,1.7,0.4,setosa
7,4.6,3.4,1.4,0.3,setosa
8,5.0,3.4,1.5,0.2,setosa
9,4.4,2.9,1.4,0.2,setosa
10,4.9,3.1,1.5,0.1,setosa


### columns

In [50]:
# index (! indicates that underlying columns are not copied)
iris[!, 3:4]

Row,petal_length,petal_width
Unnamed: 0_level_1,Float64,Float64
1,1.4,0.2
2,1.4,0.2
3,1.3,0.2
4,1.5,0.2
5,1.4,0.2
6,1.7,0.4
7,1.4,0.3
8,1.5,0.2
9,1.4,0.2
10,1.5,0.1


In [51]:
# column names
iris[1:5, Cols(:sepal_length, :petal_length, :petal_width, :species)]

Row,sepal_length,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,String15
1,5.1,1.4,0.2,setosa
2,4.9,1.4,0.2,setosa
3,4.7,1.3,0.2,setosa
4,4.6,1.5,0.2,setosa
5,5.0,1.4,0.2,setosa


In [52]:
# column ranges
iris[1:5, Between(:petal_length, :species)]


Row,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,String15
1,1.4,0.2,setosa
2,1.4,0.2,setosa
3,1.3,0.2,setosa
4,1.5,0.2,setosa
5,1.4,0.2,setosa


In [53]:
# column name and column range
iris[1:5, Cols(:sepal_length, Between(:petal_length, :species))]

Row,sepal_length,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,String15
1,5.1,1.4,0.2,setosa
2,4.9,1.4,0.2,setosa
3,4.7,1.3,0.2,setosa
4,4.6,1.5,0.2,setosa
5,5.0,1.4,0.2,setosa


In [54]:
# exclude columns
iris[!, Not([:sepal_length, :sepal_width])]

Row,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,String15
1,1.4,0.2,setosa
2,1.4,0.2,setosa
3,1.3,0.2,setosa
4,1.5,0.2,setosa
5,1.4,0.2,setosa
6,1.7,0.4,setosa
7,1.4,0.3,setosa
8,1.5,0.2,setosa
9,1.4,0.2,setosa
10,1.5,0.1,setosa


In [55]:
# regex
iris[1:5, r"width"]

Row,sepal_width,petal_width
Unnamed: 0_level_1,Float64,Float64
1,3.5,0.2
2,3.0,0.2
3,3.2,0.2
4,3.1,0.2
5,3.6,0.2


In [56]:
# regex
iris[1:5, r"petal"]

Row,petal_length,petal_width
Unnamed: 0_level_1,Float64,Float64
1,1.4,0.2
2,1.4,0.2
3,1.3,0.2
4,1.5,0.2
5,1.4,0.2


In [57]:
# regex
iris[1:5, r"s"]

Row,sepal_length,sepal_width,species
Unnamed: 0_level_1,Float64,Float64,String15
1,5.1,3.5,setosa
2,4.9,3.0,setosa
3,4.7,3.2,setosa
4,4.6,3.1,setosa
5,5.0,3.6,setosa


## Manipulate dataframe

### rename columns

In [58]:
# copy iris
iris_copy = deepcopy(iris);

In [59]:
# property names
propertynames(iris)

5-element Vector{Symbol}:
 :sepal_length
 :sepal_width
 :petal_length
 :petal_width
 :species

In [60]:
# rename
rename!(iris, :species => :Species)
rename!(iris, :sepal_length => :SepalLength, :sepal_width => :SepalWidth);

In [61]:
# property names
propertynames(iris)

5-element Vector{Symbol}:
 :SepalLength
 :SepalWidth
 :petal_length
 :petal_width
 :Species

In [62]:
# restore iris
iris = deepcopy(iris_copy);

### reorder columns

In [63]:
# copy iris
iris_copy = deepcopy(iris);

In [64]:
# reorder
select!(iris, :species, :petal_length, :sepal_length, :petal_width, :sepal_width);

In [65]:
# first rows
first(iris, 3)

Row,species,petal_length,sepal_length,petal_width,sepal_width
Unnamed: 0_level_1,String15,Float64,Float64,Float64,Float64
1,setosa,1.4,5.1,0.2,3.5
2,setosa,1.4,4.9,0.2,3.0
3,setosa,1.3,4.7,0.2,3.2


In [66]:
# restore iris
iris = deepcopy(iris_copy);

## Missing values

### `allowmissing`

In [67]:
# allowmissing
@which allowmissing

Missings

In [68]:
# allowmissing
allowmissing!(iris, [:sepal_length, :sepal_width, :petal_length, :petal_width]);

In [69]:
# element types
eltype.(eachcol(iris))

5-element Vector{Type}:
 Union{Missing, Float64}
 Union{Missing, Float64}
 Union{Missing, Float64}
 Union{Missing, Float64}
 String15

### Create missing values

In [70]:
# set seed
Random.seed!(20_000);

In [71]:
# create missings
# Each iteration picks a random row and one of the first four (numeric) columns and
# sets that cell to missing.
# NOTE: cells are drawn with replacement, so the same cell can be picked more than once;
# the number of distinct missing values is therefore at most n, not necessarily n.
n = 100
for i in 1:n
    iris[rand(1:nrow(iris)), rand(1:4)] = missing
end

In [72]:
# count missings
describe(iris, :nmissing, :eltype)

Row,variable,nmissing,eltype
Unnamed: 0_level_1,Symbol,Int64,Type
1,sepal_length,24,"Union{Missing, Float64}"
2,sepal_width,30,"Union{Missing, Float64}"
3,petal_length,29,"Union{Missing, Float64}"
4,petal_width,17,"Union{Missing, Float64}"
5,species,0,String15


In [73]:
# check total number of missings
sum(describe(iris, :nmissing).nmissing) == n

true

In [74]:
# first rows
first(iris, 5)

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64?,Float64?,Float64?,Float64?,String15
1,5.1,3.5,missing,missing,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,missing,1.3,0.2,setosa
4,4.6,missing,1.5,0.2,setosa
5,5.0,missing,1.4,0.2,setosa


### `completecases`

In [75]:
completecases(iris)

150-element BitVector:
 0
 1
 0
 0
 0
 1
 1
 1
 0
 1
 ⋮
 1
 1
 1
 1
 0
 0
 0
 0
 0

In [76]:
iris[completecases(iris), :]

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64?,Float64?,Float64?,Float64?,String15
1,4.9,3.0,1.4,0.2,setosa
2,5.4,3.9,1.7,0.4,setosa
3,4.6,3.4,1.4,0.3,setosa
4,5.0,3.4,1.5,0.2,setosa
5,4.9,3.1,1.5,0.1,setosa
6,4.3,3.0,1.1,0.1,setosa
7,5.4,3.4,1.7,0.2,setosa
8,5.1,3.7,1.5,0.4,setosa
9,4.6,3.6,1.0,0.2,setosa
10,5.1,3.3,1.7,0.5,setosa


### `eltype`, `nonmissingtype`

In [77]:
eltype(iris.sepal_length)

Union{Missing, Float64}

In [78]:
nonmissingtype(eltype(iris.sepal_length))

Float64

In [79]:
nonmissingtype.(eltype.(eachcol(iris)))

5-element Vector{DataType}:
 Float64
 Float64
 Float64
 Float64
 String15

### count missings

In [80]:
# vector
# count with a predicate tallies the missings without building a temporary BitVector
[count(ismissing, col) for col in eachcol(iris)]

5-element Vector{Int64}:
 24
 30
 29
 17
  0

In [81]:
# dictionary
# maps each column name to its number of missing values
Dict(name => count(ismissing, col) for (name, col) in zip(names(iris), eachcol(iris)))

Dict{String, Int64} with 5 entries:
  "sepal_length" => 24
  "petal_width"  => 17
  "petal_length" => 29
  "sepal_width"  => 30
  "species"      => 0

In [82]:
# dataframe
describe(iris, :nmissing)

Row,variable,nmissing
Unnamed: 0_level_1,Symbol,Int64
1,sepal_length,24
2,sepal_width,30
3,petal_length,29
4,petal_width,17
5,species,0


### `dropmissing`

In [83]:
# copy iris
iris_copy = deepcopy(iris);

In [84]:
# dropmissing
dropmissing(iris)

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String15
1,4.9,3.0,1.4,0.2,setosa
2,5.4,3.9,1.7,0.4,setosa
3,4.6,3.4,1.4,0.3,setosa
4,5.0,3.4,1.5,0.2,setosa
5,4.9,3.1,1.5,0.1,setosa
6,4.3,3.0,1.1,0.1,setosa
7,5.4,3.4,1.7,0.2,setosa
8,5.1,3.7,1.5,0.4,setosa
9,4.6,3.6,1.0,0.2,setosa
10,5.1,3.3,1.7,0.5,setosa


In [85]:
# dropmissing
dropmissing!(iris, [:sepal_length, :sepal_width]);

In [86]:
# first rows
first(iris, 7)

Row,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64?,Float64?,String15
1,5.1,3.5,missing,missing,setosa
2,4.9,3.0,1.4,0.2,setosa
3,5.4,3.9,1.7,0.4,setosa
4,4.6,3.4,1.4,0.3,setosa
5,5.0,3.4,1.5,0.2,setosa
6,4.4,2.9,missing,0.2,setosa
7,4.9,3.1,1.5,0.1,setosa


### `replace`

In [87]:
# restore copy
iris = deepcopy(iris_copy);

In [88]:
# replace values (without changing element type)
replace!(iris.sepal_width, missing => 1_000);

In [89]:
# describe
describe(iris, :nmissing, :eltype)

Row,variable,nmissing,eltype
Unnamed: 0_level_1,Symbol,Int64,Type
1,sepal_length,24,"Union{Missing, Float64}"
2,sepal_width,0,"Union{Missing, Float64}"
3,petal_length,29,"Union{Missing, Float64}"
4,petal_width,17,"Union{Missing, Float64}"
5,species,0,String15


### `coalesce`

In [90]:
# restore copy
iris = deepcopy(iris_copy);

In [91]:
# replace missing values (with changing the element type)
# the fallback is written as a Float64 literal so that the resulting column has the
# concrete eltype Float64; an Int fallback (1_000) would widen it to the abstract Real
iris.sepal_width = coalesce.(iris.sepal_width, 1_000.0);

In [92]:
# describe
describe(iris, :nmissing, :eltype)

Row,variable,nmissing,eltype
Unnamed: 0_level_1,Symbol,Int64,Type
1,sepal_length,24,"Union{Missing, Float64}"
2,sepal_width,0,Float64
3,petal_length,29,"Union{Missing, Float64}"
4,petal_width,17,"Union{Missing, Float64}"
5,species,0,String15


### Replace values by `mean` per `species`

In [93]:
# restore copy
iris = deepcopy(iris_copy);

In [94]:
# describe
describe(iris)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,sepal_length,5.81032,4.3,5.7,7.9,24,"Union{Missing, Float64}"
2,sepal_width,3.01833,2.0,3.0,4.4,30,"Union{Missing, Float64}"
3,petal_length,3.71901,1.0,4.3,6.7,29,"Union{Missing, Float64}"
4,petal_width,1.20977,0.1,1.3,2.5,17,"Union{Missing, Float64}"
5,species,,setosa,,virginica,0,String15


In [95]:
# mean from statistics.jl package
@which mean

Statistics

In [96]:
# mean for petal_length (independent of species)
mean(skipmissing(iris.petal_length))

3.7190082644628117

In [97]:
# mean petal_length for species versicolor
mean(skipmissing(iris[isequal.(iris.species, "versicolor"), :petal_length]))

4.269047619047619

In [98]:
# mean petal_length per species (missing values are skipped before averaging)
means_before = combine(
    groupby(iris, :species),
    :petal_length => mean ∘ skipmissing => :mean,
)

Row,species,mean
Unnamed: 0_level_1,String15,Float64
1,setosa,1.46829
2,versicolor,4.26905
3,virginica,5.53947


In [99]:
# replace missing petal_length values by the mean per species
# Each group yielded by groupby is a view into iris, so the broadcast assignment below
# writes through to iris itself (nmissing for petal_length is 0 in the describe output
# that follows).
for group in groupby(iris, :species)
    # the right-hand side is the mean of the group's non-missing values
    group[ismissing.(group.petal_length), :petal_length] .= mean(skipmissing(group.petal_length))
end

In [100]:
# describe
describe(iris)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,sepal_length,5.81032,4.3,5.7,7.9,24,"Union{Missing, Float64}"
2,sepal_width,3.01833,2.0,3.0,4.4,30,"Union{Missing, Float64}"
3,petal_length,3.75894,1.0,4.26905,6.7,0,"Union{Missing, Float64}"
4,petal_width,1.20977,0.1,1.3,2.5,17,"Union{Missing, Float64}"
5,species,,setosa,,virginica,0,String15


In [101]:
# mean petal_length per species
means_after = combine(groupby(iris, :species), :petal_length .=> mean .=> :mean)

Row,species,mean
Unnamed: 0_level_1,String15,Float64
1,setosa,1.46829
2,versicolor,4.26905
3,virginica,5.53947


In [102]:
# compare means before and after replacing values
isapprox(means_before.mean, means_after.mean)

true

In [103]:
# same comparison, different notation
means_before.mean ≈ means_after.mean

true

In [104]:
# disallow missing values
disallowmissing!(iris, :petal_length);

In [105]:
# describe
describe(iris)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,sepal_length,5.81032,4.3,5.7,7.9,24,"Union{Missing, Float64}"
2,sepal_width,3.01833,2.0,3.0,4.4,30,"Union{Missing, Float64}"
3,petal_length,3.75894,1.0,4.26905,6.7,0,Float64
4,petal_width,1.20977,0.1,1.3,2.5,17,"Union{Missing, Float64}"
5,species,,setosa,,virginica,0,String15


In [106]:
# replace missing values by the mean per species for all numeric columns
for grp in groupby(iris, :species)
    for col in names(iris, Union{Missing, Float64})
        grp[ismissing.(grp[!, col]), col] .= mean(skipmissing(grp[!, col]))
    end
end

In [107]:
# disallow missing values
disallowmissing!(iris);

In [108]:
# describe
describe(iris)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,sepal_length,5.82851,4.3,5.8,7.9,0,Float64
2,sepal_width,3.03569,2.0,3.0,4.4,0,Float64
3,petal_length,3.75894,1.0,4.26905,6.7,0,Float64
4,petal_width,1.20098,0.1,1.3,2.5,0,Float64
5,species,,setosa,,virginica,0,String15


## Create dataframe

In [109]:
# Create dataframe with column data from vector, range, or constant
DataFrame(x = [10, 20, 30], y = 4:6, z = 9)

Row,x,y,z
Unnamed: 0_level_1,Int64,Int64,Int64
1,10,4,9
2,20,5,9
3,30,6,9


In [110]:
# Create dataframe from a vector of named tuples
DataFrame([(x=1, y=2), (x=3, y=4)])

Row,x,y
Unnamed: 0_level_1,Int64,Int64
1,1,2
2,3,4


In [111]:
# Create dataframe from pairs of column name and data
DataFrame("x" => [10, 20], "y" => [30 ,40])

Row,x,y
Unnamed: 0_level_1,Int64,Int64
1,10,30
2,20,40


In [112]:
# Create data frame from a matrix
DataFrame(rand(5, 3), :auto)

Row,x1,x2,x3
Unnamed: 0_level_1,Float64,Float64,Float64
1,0.179999,0.551681,0.465886
2,0.802308,0.291485,0.140368
3,0.674031,0.758827,0.762979
4,0.21323,0.381562,0.269365
5,0.846688,0.961208,0.405862


In [113]:
# Create dataframe from a matrix
DataFrame(rand(5, 3), [:x, :y, :z])

Row,x,y,z
Unnamed: 0_level_1,Float64,Float64,Float64
1,0.288159,0.989546,0.165529
2,0.958112,0.0759519,0.487494
3,0.953123,0.808883,0.618207
4,0.536712,0.839786,0.446132
5,0.876099,0.228024,0.310324


In [114]:
# Create an empty dataframe without columns
DataFrame()

In [115]:
# Create an empty data frame with typed columns
df = DataFrame(x = Int[], y = Float64[])

Row,x,y
Unnamed: 0_level_1,Int64,Float64


In [116]:
# Append data
append!(df, DataFrame(x = Int[10, 20], y = Float64[.1, .2]))
append!(df, DataFrame(x = Int[30, 40], y = Float64[.3, .4]))

Row,x,y
Unnamed: 0_level_1,Int64,Float64
1,10,0.1
2,20,0.2
3,30,0.3
4,40,0.4


In [117]:
# Insert column
insertcols!(df, 3, "newcol" => 5:5:20)

Row,x,y,newcol
Unnamed: 0_level_1,Int64,Float64,Int64
1,10,0.1,5
2,20,0.2,10
3,30,0.3,15
4,40,0.4,20


In [118]:
# Push single row
push!(df, [50, .5, 25])

Row,x,y,newcol
Unnamed: 0_level_1,Int64,Float64,Int64
1,10,0.1,5
2,20,0.2,10
3,30,0.3,15
4,40,0.4,20
5,50,0.5,25


In [119]:
# Allow missing
allowmissing!(df);

In [120]:
# Push single row
push!(df, [missing, missing, missing])

Row,x,y,newcol
Unnamed: 0_level_1,Int64?,Float64?,Int64?
1,10,0.1,5
2,20,0.2,10
3,30,0.3,15
4,40,0.4,20
5,50,0.5,25
6,missing,missing,missing


In [121]:
# Replace missing Values (datatype column unchanged)
replace!(df.x, missing=>99);
df

Row,x,y,newcol
Unnamed: 0_level_1,Int64?,Float64?,Int64?
1,10,0.1,5
2,20,0.2,10
3,30,0.3,15
4,40,0.4,20
5,50,0.5,25
6,99,missing,missing


In [122]:
# Replace missing value (with datatype change)
df.y = coalesce.(df.y, .99)
df

Row,x,y,newcol
Unnamed: 0_level_1,Int64?,Float64,Int64?
1,10,0.1,5
2,20,0.2,10
3,30,0.3,15
4,40,0.4,20
5,50,0.5,25
6,99,0.99,missing


In [123]:
# Rename columns
rename!(df, string.('a':'c'))

Row,a,b,c
Unnamed: 0_level_1,Int64?,Float64,Int64?
1,10,0.1,5
2,20,0.2,10
3,30,0.3,15
4,40,0.4,20
5,50,0.5,25
6,99,0.99,missing


In [124]:
# Horizontal concat
df1 = DataFrame(a=[10, 20])
df2 = DataFrame(b=[30, 40], c=[50, 60])
hcat(df1, df2)

Row,a,b,c
Unnamed: 0_level_1,Int64,Int64,Int64
1,10,30,50
2,20,40,60


In [125]:
# create dataframe

Random.seed!(1234); # set seed for reproducibility

n = 10 # number of rows

df = DataFrame(
    A = 1:n, # range
    B = [x * 5 for x in 1:n], # array comprehension
    C = 100, # broadcasting
    D = rand([rand(), rand(), missing], n),
    E = rand(["red", "green", "blue"], n),
    G = rand(['A', 'B', nothing], n),
    F = rand([true, false], n),
    H = rand([rand(Int64), rand(), missing], n),
)

Row,A,B,C,D,E,G,F,H
Unnamed: 0_level_1,Int64,Int64,Int64,Float64?,String,Union…,Bool,Float64?
1,1,5,100,0.325977,red,B,True,0.26906
2,2,10,100,missing,red,A,False,-4.27396e18
3,3,15,100,0.549051,green,B,False,-4.27396e18
4,4,20,100,0.549051,blue,A,False,missing
5,5,25,100,missing,blue,A,False,missing
6,6,30,100,missing,green,B,True,missing
7,7,35,100,0.549051,blue,,False,0.26906
8,8,40,100,missing,blue,,True,missing
9,9,45,100,0.549051,red,B,False,0.26906
10,10,50,100,missing,green,B,True,missing


In [126]:
# complete cases (nothing ≠ missing !!, see row 3, col G)
df[completecases(df), :]

Row,A,B,C,D,E,G,F,H
Unnamed: 0_level_1,Int64,Int64,Int64,Float64?,String,Union…,Bool,Float64?
1,1,5,100,0.325977,red,B,True,0.26906
2,3,15,100,0.549051,green,B,False,-4.27396e+18
3,7,35,100,0.549051,blue,,False,0.26906
4,9,45,100,0.549051,red,B,False,0.26906


In [127]:
# findall missing (row indices where column D is missing)
findall(ismissing, df.D)

5-element Vector{Int64}:
  2
  5
  6
  8
 10

In [128]:
# rows where column D is missing (a Boolean mask selects the same rows as findall)
df[ismissing.(df.D), :]

Row,A,B,C,D,E,G,F,H
Unnamed: 0_level_1,Int64,Int64,Int64,Float64?,String,Union…,Bool,Float64?
1,2,10,100,missing,red,A,False,-4.27396e18
2,5,25,100,missing,blue,A,False,missing
3,6,30,100,missing,green,B,True,missing
4,8,40,100,missing,blue,,True,missing
5,10,50,100,missing,green,B,True,missing


In [129]:
# Boolean mask of the entries that are not missing
.!ismissing.(df.D)

10-element BitVector:
 1
 0
 1
 1
 0
 0
 1
 0
 1
 0

In [130]:
# findall not missing
df[findall(.!ismissing.(df.D)), :]

Row,A,B,C,D,E,G,F,H
Unnamed: 0_level_1,Int64,Int64,Int64,Float64?,String,Union…,Bool,Float64?
1,1,5,100,0.325977,red,B,True,0.26906
2,3,15,100,0.549051,green,B,False,-4.27396e18
3,4,20,100,0.549051,blue,A,False,missing
4,7,35,100,0.549051,blue,,False,0.26906
5,9,45,100,0.549051,red,B,False,0.26906


In [131]:
# findall nothing
findall(isnothing.(df.G))

2-element Vector{Int64}:
 7
 8

In [132]:
# findall nothing
df[findall(isnothing.(df.G)), :]

Row,A,B,C,D,E,G,F,H
Unnamed: 0_level_1,Int64,Int64,Int64,Float64?,String,Union…,Bool,Float64?
1,7,35,100,0.549051,blue,,False,0.26906
2,8,40,100,missing,blue,,True,missing


In [133]:
# convert datatype
# first fill the missings in place, then narrow the column's eltype to plain Float64
replace!(df.D, missing => 1_000);
disallowmissing!(df, :D);
eltype(df.D)

Float64