diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index dc46067..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6466682..1e1bef1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,5 +3,5 @@ variables: include: - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/common.yml' - - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/test_v1.1.yml' + - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/test_v1.3.yml' - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/test_dev.yml' diff --git a/Project.toml b/Project.toml index d33d086..bbb7de9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "FixedEffectModels" uuid = "9d5cd8c9-2029-5cab-9928-427838db53e3" -version = "0.10.9" +version = "0.11.0" [deps] Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" @@ -18,15 +18,15 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -Combinatorics = "0, 1.0" -DataFrames = "0.19, 0.20, 0.21" +Combinatorics = "0, 1" +DataFrames = "0.21" Distributions = "0" FillArrays = "0" FixedEffects = "0.7.4" Reexport = "0" StatsBase = "0.32, 0.33" StatsModels = "0.6" -Tables = "0, 1.0" +Tables = "1" julia = "1.3" [extras] diff --git a/README.md b/README.md index 4ea9034..1c441d1 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ [![Build Status](https://travis-ci.com/FixedEffects/FixedEffectModels.jl.svg?branch=master)](https://travis-ci.com/FixedEffects/FixedEffectModels.jl) -[![pipeline status](https://gitlab.com/JuliaGPU/FixedEffectModels.jl/badges/master/pipeline.svg)](https://gitlab.com/JuliaGPU/FixedEffectModels.jl/commits/master) -This package estimates linear models with high dimensional categorical variables and/or instrumental variables. 
+This package estimates linear models with high dimensional categorical variables and/or instrumental variables. Its objective is similar to the Stata command [`reghdfe`](https://github.com/sergiocorreia/reghdfe) and the R function [`felm`](https://cran.r-project.org/web/packages/lfe/lfe.pdf). The package tends to be much faster than these two options. @@ -31,13 +30,13 @@ reg(df, @formula(Sales ~ NDI + fe(State) + fe(Year)), Vcov.cluster(:State), weig - A typical formula is composed of one dependent variable, exogenous variables, endogenous variables, instrumental variables, and a set of high-dimensional fixed effects. - + ```julia dependent variable ~ exogenous variables + (endogenous variables ~ instrumental variables) + fe(fixedeffect variable) ``` - High-dimensional fixed effect variables are indicated with the function `fe`. You can add an arbitrary number of high dimensional fixed effects, separated with `+`. You can also interact fixed effects using `&` or `*`. - + High-dimensional fixed effect variables are indicated with the function `fe`. You can add an arbitrary number of high dimensional fixed effects, separated with `+`. You can also interact fixed effects using `&` or `*`. + For instance, to add state fixed effects use `fe(State)`. To add both state and year fixed effects, use `fe(State) + fe(Year)`. To add state-year fixed effects, use `fe(State)&fe(Year)`. To add state specific slopes for year, use `fe(State)&Year`. To add both state fixed-effects and state specific slopes for year use `fe(State)*Year`. 
```julia @@ -62,7 +61,7 @@ reg(df, @formula(Sales ~ NDI + fe(State) + fe(Year)), Vcov.cluster(:State), weig ```julia weights = :Pop ``` -- The option `subset` specifies a subset of the data +- The option `subset` specifies a subset of the data ```julia subset = df.State .>= 30 ``` @@ -70,14 +69,14 @@ reg(df, @formula(Sales ~ NDI + fe(State) + fe(Year)), Vcov.cluster(:State), weig - The option `method` can be set to one of the following: `:cpu`, `:gpu` (see GPU below). -- The option `contrasts` specifies particular contrasts for categorical variables in the formula, e.g. +- The option `contrasts` specifies particular contrasts for categorical variables in the formula, e.g. ```julia df.YearC = categorical(df.Year) reg(df, @formula(Sales ~ YearC); contrasts = Dict(:YearC => DummyCoding(base = 80))) ``` ## Output -`reg` returns a light object. It is composed of - +`reg` returns a light object. It is composed of + - the vector of coefficients & the covariance matrix (use `coef`, `coefnames`, `vcov` on the output of `reg`) - a boolean vector reporting rows used in the estimation - a set of scalars (number of observations, the degrees of freedom, r2, etc) @@ -90,13 +89,10 @@ Methods such as `predict`, `residuals` are still defined but require to specify You may use [RegressionTables.jl](https://github.com/jmboehm/RegressionTables.jl) to get publication-quality regression tables. -## Performances -#### GPU +## GPU The package has support for GPUs (Nvidia) (thanks to Paul Schrimpf). This can make the package an order of magnitude faster for complicated problems. -First make sure to do `using CUDA` before `using FixedEffectModels`. Then, estimate a model with `method = :gpu`. - -When working on the GPU, it is encouraged to set the floating point precision to `Float32` with `double_precision = false`, since it is usually much faster. +To use a GPU, run `using CUDA` before `using FixedEffectModels`. Then, estimate a model with `method = :gpu`. 
For maximum speed, set the floating point precision to `Float32` with `double_precision = false`. ```julia using CUDA, FixedEffectModels @@ -124,10 +120,6 @@ Fong, DC. and Saunders, M. (2011) *LSMR: An Iterative Algorithm for Sparse Least Gaure, S. (2013) *OLS with Multiple High Dimensional Category Variables*. Computational Statistics and Data Analysis -Kleibergen, F, and Paap, R. (2006) *Generalized reduced rank tests using the singular value decomposition.* Journal of econometrics +Kleibergen, F, and Paap, R. (2006) *Generalized reduced rank tests using the singular value decomposition.* Journal of econometrics Kleibergen, F. and Schaffer, M. (2007) *RANKTEST: Stata module to test the rank of a matrix using the Kleibergen-Paap rk statistic*. Statistical Software Components, Boston College Department of Economics. - - - - diff --git a/src/FixedEffectModels.jl b/src/FixedEffectModels.jl index 8bf229d..eac19bb 100644 --- a/src/FixedEffectModels.jl +++ b/src/FixedEffectModels.jl @@ -6,7 +6,6 @@ module FixedEffectModels ## Dependencies ## ############################################################################## -using Base using LinearAlgebra using Statistics using Printf @@ -15,11 +14,10 @@ using DataFrames using Distributions using Reexport using Tables +using FixedEffects @reexport using StatsBase @reexport using StatsModels -using FixedEffects - ############################################################################## ## ## Exported methods and types @@ -29,16 +27,10 @@ using FixedEffects export reg, partial_out, fe, - FixedEffectModel, has_iv, has_fe, - -Vcov, - -#deprecated -@model, -fes +Vcov ############################################################################## ## @@ -58,6 +50,5 @@ include("FixedEffectModel.jl") include("fit.jl") include("partial_out.jl") -include("deprecated.jl") end # module FixedEffectModels diff --git a/src/deprecated.jl b/src/deprecated.jl deleted file mode 100644 index 50640e2..0000000 --- a/src/deprecated.jl +++ 
/dev/null @@ -1,156 +0,0 @@ - - -############################################################################## -## -## Old one -## -## -############################################################################## - -function oldparse_fixedeffect(df::AbstractDataFrame, feformula::FormulaTerm) - fe = FixedEffect[] - id = Symbol[] - for term in eachterm(feformula.rhs) - result = oldparse_fixedeffect(df, term, feformula) - if result != nothing - push!(fe, result[1]) - push!(id, result[2]) - end - end - return fe, id -end - -# Constructors from dataframe + Term -function oldparse_fixedeffect(df::AbstractDataFrame, a::Term, feformula::FormulaTerm) - v = df[!, Symbol(a)] - if isa(v, CategoricalVector) - return FixedEffect(v), Symbol(a) - else - # x from x*id -> x + id + x&id - if !any(isa(term, InteractionTerm) & (a ∈ terms(term)) for term in eachterm(feformula.rhs)) - error("The term $(a) in fe= is a continuous variable. Convert it to a categorical variable using 'categorical'.") - end - end -end - -# Constructors from dataframe + InteractionTerm -function oldparse_fixedeffect(df::AbstractDataFrame, a::InteractionTerm, feformula::FormulaTerm) - factorvars, interactionvars = _split(df, a) - if !isempty(factorvars) - # x1&x2 from (x1&x2)*id - fe = FixedEffect((df[!, v] for v in factorvars)...; interaction = old_multiply(df, interactionvars)) - id = old_name(Symbol.(terms(a))) - return fe, id - end -end - -function _split(df::AbstractDataFrame, a::InteractionTerm) - factorvars, interactionvars = Symbol[], Symbol[] - for s in terms(a) - s = Symbol(s) - isa(df[!, s], CategoricalVector) ? 
push!(factorvars, s) : push!(interactionvars, s) - end - return factorvars, interactionvars -end - -function old_multiply(df, ss::Vector{Symbol}) - if isempty(ss) - out = Ones(size(df, 1)) - else - out = ones(size(df, 1)) - for j in eachindex(ss) - old_multiply!(out, df[!, ss[j]]) - end - end - return out -end - -function old_multiply!(out, v) - for i in eachindex(out) - if v[i] === missing - # may be missing when I remove singletons - out[i] = 0.0 - else - out[i] = out[i] * v[i] - end - end -end - -function old_name(s::Vector{Symbol}) - if isempty(s) - out = nothing - else - out = Symbol(reduce((x1, x2) -> string(x1)*"x"*string(x2), s)) - end - return out -end - - -struct ModelTerm - f::FormulaTerm - dict::Dict{Symbol, Any} -end - -ModelTerm(f::FormulaTerm; kwargs...) = ModelTerm(f, Dict(pairs(kwargs)...)) -function Base.show(io::IO, m::ModelTerm) - println(io, m.f) - for (k, v) in m.dict - println(io, k, ": ", v) - end -end - -import StatsModels: capture_call -macro model(ex, kws...) - @warn "@model is deprecated, please use @formula" - f = StatsModels.terms!(StatsModels.sort_terms!(StatsModels.parse!(ex))) - d = Dict{Symbol, Any}() - for kw in kws - isa(kw, Expr) && kw.head== :(=) || throw("All arguments of @model, except the first one, should be keyboard arguments") - if kw.args[1] == :fe - @warn "The keyword argument fe is deprecated. Instead of @model(y ~ x, fe = state + year), write @formula(y ~ x + fe(state) + fe(year))" - d[:feformula] = kw.args[2] - elseif kw.args[1] == :ife - @warn "The keyword argument ife is deprecated. Instead of @model(y ~ x, ife = (state + year, 2)), write @formula(y ~ x + ife(state, year, 2))" - d[:ifeformula] = kw.args[2] - elseif kw.args[1] == :vcov - d[:vcovformula] = kw.args[2] - @warn "The keyword argument vcov is deprecated. 
Instead of reg(df, @model(y ~ x, vcov = cluster(State))), write reg(df, @formula(y ~ x), Vcov.cluster(:State))" - elseif kw.args[1] == :subset - d[:subsetformula] = kw.args[2] - @warn "The keyword argument subset is deprecated. Instead of reg(df, @model(y ~ x, subset = State .>= 30), write reg(df, @formula(y ~ x), subset = df.State .>= 30))" - elseif kw.args[1] == :weight - d[:weight] = kw.args[2] - @warn "The keyword argument weight is deprecated. Instead of reg(df, @model(y ~ x, weight = Pop), write reg(df, @formula(y ~ x), weight = :Pop)" - else - d[kw.args[1]] = kw.args[2] - end - end - :(ModelTerm($f, $d)) -end - - -function evaluate_subset(df, ex::Expr) - if ex.head == :call - return Expr(ex.head, ex.args[1], (evaluate_subset(df, ex.args[i]) for i in 2:length(ex.args))...) - else - return Expr(ex.head, (evaluate_subset(df, ex.args[i]) for i in 1:length(ex.args))...) - end -end -evaluate_subset(df, ex::Symbol) = df[!, ex] -evaluate_subset(df, ex) = ex - - -function reg(df, m::ModelTerm;kwargs...) - reg(df, m.f; m.dict..., kwargs...) -end - -function partial_out(df, m::ModelTerm; kwargs...) - partial_out(DataFrame(df), m.f; m.dict..., kwargs...) -end - - -function fes(args...) - @warn "fes() is deprecated. Use fe()" - fe(args...) -end - diff --git a/src/fit.jl b/src/fit.jl index 4248a1a..067b315 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -43,24 +43,9 @@ function reg(@nospecialize(df), method::Symbol = :cpu, drop_singletons = true, double_precision::Bool = true, - tol::Real = double_precision ? 1e-8 : 1e-6, - @nospecialize(feformula::Union{Symbol, Expr, Nothing} = nothing), - @nospecialize(vcovformula::Union{Symbol, Expr, Nothing} = nothing), - @nospecialize(subsetformula::Union{Symbol, Expr, Nothing} = nothing)) + tol::Real = double_precision ? 
1e-8 : 1e-6) + df = DataFrame(df; copycols = false) - # to deprecate - if vcovformula != nothing - if (vcovformula == :simple) | (vcovformula == :(simple())) - vcov = Vcov.Simple() - elseif (vcovformula == :robust) | (vcovformula == :(robust())) - vcov = Vcov.Robust() - else - vcov = Vcov.cluster(StatsModels.termvars(@eval(@formula(0 ~ $(vcovformula.args[2]))))...) - end - end - if subsetformula != nothing - subset = eval(evaluate_subset(df, subsetformula)) - end ############################################################################## ## @@ -96,9 +81,6 @@ function reg(@nospecialize(df), # create a dataframe without missing values & negative weights vars = StatsModels.termvars(formula) - if feformula != nothing # to deprecate - vars = vcat(vars, StatsModels.termvars(@eval(@formula(0 ~ $(feformula))))) - end iv_vars = Symbol[] endo_vars = Symbol[] if has_iv @@ -121,11 +103,6 @@ function reg(@nospecialize(df), end fes, ids, formula = parse_fixedeffect(df, formula) has_fes = !isempty(fes) - if feformula != nothing - has_fes = true - feformula = @eval(@formula(0 ~ $(feformula))) - fes, ids = oldparse_fixedeffect(df, feformula) - end if has_fes if drop_singletons for fe in fes