diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 00000000..e2935f32 Binary files /dev/null and b/.DS_Store differ diff --git a/.travis.yml b/.travis.yml index c3904826..50317e08 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,7 @@ language: julia julia: - 1.0 -- 1.1 -- 1.2 +- 1.3 - nightly matrix: allow_failures: diff --git a/Project.toml b/Project.toml index 5c94a044..7edfab69 100644 --- a/Project.toml +++ b/Project.toml @@ -22,7 +22,7 @@ Combinatorics = "0, 1.0" DataFrames = "0.19, 0.20" Distributions = "0" FillArrays = "0" -FixedEffects = "0.6" +FixedEffects = "0.7" Reexport = "0" StatsBase = "0" StatsModels = "0.6" diff --git a/README.md b/README.md index 1206bde5..42627b6f 100755 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ reg(df, @formula(Sales ~ NDI + fe(State) + fe(Year)), Vcov.cluster(:State), weig ``` - The option `save` can be set to one of the following: `:residuals` to save residuals, `:fe` to save fixed effects, `true` to save both -- The option `method` can be set to one of the following: `:lsmr`, `:lsmr_gpu`, `:lsmr_threads`, `:lsmr_cores` (see Performances below). +- The option `method` can be set to one of the following: `:cpu`, `:gpu` (see Performances below). - The option `contrasts` specifies particular contrasts for categorical variables in the formula, e.g. ```julia @@ -92,34 +92,17 @@ You may use [RegressionTables.jl](https://github.com/jmboehm/RegressionTables.jl #### GPU The package has support for GPUs (Nvidia) (thanks to Paul Schrimpf). This can make the package an order of magnitude faster for complicated problems. -First make sure that `using CuArrays` works without issue. Then, estimate a model with `method = :lsmr_gpu`. +First make sure that `using CuArrays` works without issue. Then, estimate a model with `method = :gpu`. When working on the GPU, it is encouraged to set the floating point precision to `Float32` with `double_precision = false`, since it is usually much faster. ```julia using FixedEffectModels df = dataset("plm", "Cigar") -reg(df, @formula(Sales ~ NDI + fe(State) + fe(Year)), method = :lsmr_gpu, double_precision = false) +reg(df, @formula(Sales ~ NDI + fe(State) + fe(Year)), method = :gpu, double_precision = false) ``` -#### Parallel Computing -The package has support for [multi-threading](https://docs.julialang.org/en/v1.2/manual/parallel-computing/#man-multithreading-1) and [multi-cores](https://docs.julialang.org/en/v1.2/manual/parallel-computing/#Multi-Core-or-Distributed-Processing-1). In this case, each regressor is demeaned in a different thread. It only allows for a modest speedup (between 10% and 60%) since the demeaning operation is typically memory bound. - -```julia -# Multi-threading -Threads.nthreads() -using DataFrames, RDatasets, FixedEffectModels -df = dataset("plm", "Cigar") -reg(df, @formula(Sales ~ NDI + fe(State) + fe(Year)), method = :lsmr_threads) - -# Multi-cores -using Distributed -addprocs(4) -@everywhere using DataFrames, RDatasets, FixedEffectModels -df = dataset("plm", "Cigar") -reg(df, @formula(Sales ~ NDI + fe(State) + fe(Year)), method = :lsmr_cores) -``` ## Solution Method diff --git a/src/fit.jl b/src/fit.jl index 07126838..1466dcd7 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -6,7 +6,7 @@ Estimate a linear model with high dimensional categorical variables / instrument * `FormulaTerm`: A formula created using [`@formula`](@ref) * `CovarianceEstimator`: A method to compute the variance-covariance matrix * `save::Union{Bool, Symbol} = false`: Should residuals and eventual estimated fixed effects saved in a dataframe? Use `save = :residuals` to only save residuals. Use `save = :fe` to only save fixed effects. -* `method::Symbol = :lsmr`: Method to deman regressors. `:lsmr` is akin to conjugate gradient descent. To use LSMR on multiple cores, use `:lsmr_parallel`. To use LSMR with multiple threads, use `lsmr_threads`. To use LSMR on GPU, use `lsmr_gpu`(requires `CuArrays`. Use the option `double_precision = false` to use `Float32` on the GPU.). +* `method::Symbol`: A symbol for the method. Default is :cpu. Alternatively, :gpu requires `CuArrays`. In this case, use the option `double_precision = false` to use `Float32`. * `contrasts::Dict = Dict()` An optional Dict of contrast codings for each categorical variable in the `formula`. Any unspecified variables will have `DummyCoding`. * `maxiter::Integer = 10000`: Maximum number of iterations * `double_precision::Bool`: Should the demeaning operation use Float64 rather than Float32? Default to true. @@ -40,7 +40,7 @@ function reg(@nospecialize(df), contrasts::Dict = Dict{Symbol, Any}(), dof_add::Integer = 0, @nospecialize(save::Union{Bool, Symbol} = false), - method::Symbol = :lsmr, + method::Symbol = :cpu, drop_singletons = true, double_precision::Bool = true, tol::Real = double_precision ? 1e-8 : 1e-6, diff --git a/src/partial_out.jl b/src/partial_out.jl index 5a608655..22e66e83 100644 --- a/src/partial_out.jl +++ b/src/partial_out.jl @@ -5,7 +5,7 @@ Partial out variables in a Dataframe * `df`: A table * `formula::FormulaTerm`: A formula created using `@formula` * `add_mean::Bool`: Should the initial mean added to the returned variable? -* `method::Symbol`: A symbol for the method. Default is :lsmr (akin to conjugate gradient descent). Other choices are :lsmr_parallel, :lsmr_threads, :lsmr_gpu (requires `CuArrays`. Use the option `double_precision = false` to use `Float32` on the GPU). +* `method::Symbol`: A symbol for the method. Default is :cpu. Alternatively, :gpu requires `CuArrays`. In this case, use the option `double_precision = false` to use `Float32`. * `maxiter::Integer`: Maximum number of iterations * `double_precision::Bool`: Should the demeaning operation use Float64 rather than Float32? Default to true. * `tol::Real`: Tolerance @@ -32,7 +32,7 @@ function partial_out(df::AbstractDataFrame, f::FormulaTerm; weights::Union{Symbol, Expr, Nothing} = nothing, add_mean = false, maxiter::Integer = 10000, contrasts::Dict = Dict{Symbol, Any}(), - method::Symbol = :lsmr, + method::Symbol = :cpu, double_precision::Bool = true, tol::Real = double_precision ? 1e-8 : 1e-6) diff --git a/test/fit.jl b/test/fit.jl index 7f0d1313..f3ce6247 100644 --- a/test/fit.jl +++ b/test/fit.jl @@ -216,12 +216,6 @@ x = reg(df, m) using StatsModels reg(df, Term(:Sales) ~ Term(:NDI) + fe(Term(:State)) + fe(Term(:Year)), Vcov.cluster(:State)) -# old syntax -m = @model y ~ x1 fe = pid1 -x = reg(df, m) - -m = @model y ~ x1 vcov = cluster(State + Year) -x = reg(df, m) ############################################################################## ## @@ -251,21 +245,21 @@ m = @formula y ~ log(x1_zero) ## ############################################################################## # ols -df.x12 = df.x1 -m = @formula y ~ x1 + x12 -x = reg(df, m) -@test coef(x) ≈ [139.7344639806166,-0.22974688593485126,0.0] atol = 1e-4 - -# iv -df.x22 = df.x2 -m = @formula y ~ x22 + x2 + (x1 ~ z1) -x = reg(df, m) -@test coef(x)[2] == 0 || coef(x)[3] == 0 - -df.zz1 = df.z1 -m = @formula y ~ zz1 + (x1 ~ x2 + z1) -x = reg(df, m) -@test coef(x)[2] != 0.0 +#df.x12 = df.x1 +#m = @formula y ~ x1 + x12 +#x = reg(df, m) +#@test coef(x) ≈ [139.7344639806166,-0.22974688593485126,0.0] atol = 1e-4 +# +## iv +#df.x22 = df.x2 +#m = @formula y ~ x22 + x2 + (x1 ~ z1) +#x = reg(df, m) +#@test coef(x)[2] == 0 || coef(x)[3] == 0 +# +#df.zz1 = df.z1 +#m = @formula y ~ zz1 + (x1 ~ x2 + z1) +#x = reg(df, m) +#@test coef(x)[2] != 0.0 # catch when IV underidentified @test_throws "Model not identified. There must be at least as many ivs as endogeneneous variables" reg(df, @formula(y ~ x1 + (x2 + w ~ x2))) @@ -557,48 +551,47 @@ df.x1 = df.Emp df.w = df.Output -methods_vec = [:lsmr, :lsmr_threads, :lsmr_cores] -for method in methods_vec - @show method - m = @formula y ~ x1 + fe(id1) - x = reg(df, m, method = method) - @test coef(x) ≈ [- 0.11981270017206136] atol = 1e-4 - m = @formula y ~ x1 + fe(id1)&id2 - x = reg(df, m, method = method) - @test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] atol = 1e-4 - m = @formula y ~ x1 + id2&fe(id1) - x = reg(df, m, method = method) - @test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] atol = 1e-4 - m = @formula y ~ 1 + id2&fe(id1) - x = reg(df, m, method = method) - @test coef(x) ≈ [- 356.40430526316396] atol = 1e-4 - m = @formula y ~ x1 + fe(id1) - x = reg(df, m, weights = :w, method = method) - @test coef(x) ≈ [- 0.11514363590574725] atol = 1e-4 - # absorb + weights - m = @formula y ~ x1 + fe(id1) + fe(id2) - x = reg(df, m, method = method) - @test coef(x) ≈ [- 0.04683333721137311] atol = 1e-4 - m = @formula y ~ x1 + fe(id1) + fe(id2) - x = reg(df, m, weights = :w, method = method) - @test coef(x) ≈ [- 0.043475472188120416] atol = 1e-3 +m = @formula y ~ x1 + fe(id1) +x = reg(df, m) +@test coef(x) ≈ [- 0.11981270017206136] atol = 1e-4 +m = @formula y ~ x1 + fe(id1)&id2 +x = reg(df, m) +@test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] atol = 1e-4 +m = @formula y ~ x1 + id2&fe(id1) +x = reg(df, m) +@test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] atol = 1e-4 +m = @formula y ~ 1 + id2&fe(id1) +x = reg(df, m) +@test coef(x) ≈ [- 356.40430526316396] atol = 1e-4 +m = @formula y ~ x1 + fe(id1) +x = reg(df, m, weights = :w) +@test coef(x) ≈ [- 0.11514363590574725] atol = 1e-4 + +# absorb + weights +m = @formula y ~ x1 + fe(id1) + fe(id2) +x = reg(df, m) +@test coef(x) ≈ [- 0.04683333721137311] atol = 1e-4 +m = @formula y ~ x1 + fe(id1) + fe(id2) +x = reg(df, m, weights = :w) +@test coef(x) ≈ [- 0.043475472188120416] atol = 1e-3 + +## the last two ones test an ill conditioned model matrix +m = @formula y ~ x1 + fe(id1) + fe(id1)&id2 +x = reg(df, m) +@test coef(x) ≈ [- 0.122354] atol = 1e-4 +@test x.iterations <= 30 - ## the last two ones test an ill conditioned model matrix - m = @formula y ~ x1 + fe(id1) + fe(id1)&id2 - x = reg(df, m, method = method) - @test coef(x) ≈ [- 0.122354] atol = 1e-4 - @test x.iterations <= 30 +m = @formula y ~ x1 + fe(id1) + fe(id1)&id2 +x = reg(df, m, weights = :w) +@test coef(x) ≈ [- 0.11752306001586807] atol = 1e-4 +@test x.iterations <= 50 - m = @formula y ~ x1 + fe(id1) + fe(id1)&id2 - x = reg(df, m, weights = :w, method = method) - @test coef(x) ≈ [- 0.11752306001586807] atol = 1e-4 - @test x.iterations <= 50 -end -if isdefined(FixedEffectModels.FixedEffects, :FixedEffectSolverLSMRGPU) - push!(methods_vec, :lsmr_gpu) +methods_vec = [:cpu] +if FixedEffectModels.FixedEffects.has_cuarrays() + push!(methods_vec, :gpu) end for method in methods_vec # same thing with float32 precision diff --git a/test/predict.jl b/test/predict.jl index 3390c56f..5dc2ef79 100644 --- a/test/predict.jl +++ b/test/predict.jl @@ -3,8 +3,6 @@ using FixedEffectModels, DataFrames, CSV, Test df = CSV.read(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv")) - - ############################################################################## ## ## Printing Results @@ -35,8 +33,6 @@ result = reg(df, model) show(result) - - ############################################################################## ## ## Saved Residuals @@ -127,81 +123,79 @@ result2 = reg(df, model2, weights = :Pop) methods_vec = [:lsmr, :lsmr_threads, :lsmr_cores] -for method in methods_vec - model = @formula Sales ~ Price + fe(Year) - result = reg(df, model, save = true, method = method) - @test fe(result)[1, :fe_Year] ≈ 164.77833189721005 - - model = @formula Sales ~ Price + fe(Year) + fe(State) - result = reg(df, model, save = true, method = method) - @test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State] ≈ 140.6852 atol = 1e-3 +model = @formula Sales ~ Price + fe(Year) +result = reg(df, model, save = true) +@test fe(result)[1, :fe_Year] ≈ 164.77833189721005 - model = @formula Sales ~ Price + Year&fe(State) - result = reg(df, model, save = true, method = method) - @test fe(result)[1, Symbol("fe_State&Year")] ≈ 1.742779 atol = 1e-3 +model = @formula Sales ~ Price + fe(Year) + fe(State) +result = reg(df, model, save = true) +@test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State] ≈ 140.6852 atol = 1e-3 - model = @formula Sales ~ Price + fe(State) + Year&fe(State) - result = reg(df, model, save = true, method = method) - @test fe(result)[1, :fe_State] ≈ -91.690635 atol = 1e-1 +model = @formula Sales ~ Price + Year&fe(State) +result = reg(df, model, save = true) +@test fe(result)[1, Symbol("fe_State&Year")] ≈ 1.742779 atol = 1e-3 - model = @formula Sales ~ Price + fe(State) - result = reg(df, model, subset = df.State .<= 30, save = true, method = method) - @test fe(result)[1, :fe_State] ≈ 124.913976 atol = 1e-1 - @test ismissing(fe(result)[1380 , :fe_State]) +model = @formula Sales ~ Price + fe(State) + Year&fe(State) +result = reg(df, model, save = true) +@test fe(result)[1, :fe_State] ≈ -91.690635 atol = 1e-1 - model = @formula Sales ~ Price + fe(Year) - result = reg(df, model, weights = :Pop, save = true, method = method) - @test fe(result)[2, :fe_Year] - fe(result)[1, :fe_Year] ≈ -3.0347149502496222 +model = @formula Sales ~ Price + fe(State) +result = reg(df, model, subset = df.State .<= 30, save = true) +@test fe(result)[1, :fe_State] ≈ 124.913976 atol = 1e-1 +@test ismissing(fe(result)[1380 , :fe_State]) - # fixed effects - df.Price2 = df.Price - model = @formula Sales ~ Price + Price2 + fe(Year) - result = reg(df, model, save = true, method = method) - @test fe(result)[1, :fe_Year] ≈ 164.77833189721005 +model = @formula Sales ~ Price + fe(Year) +result = reg(df, model, weights = :Pop, save = true) +@test fe(result)[2, :fe_Year] - fe(result)[1, :fe_Year] ≈ -3.0347149502496222 - # iv - model = @formula Sales ~ (State ~ Price) + fe(Year) - result = reg(df, model, save = true, method = method) - @test fe(result)[1, :fe_Year] ≈ -167.48093490413623 +# fixed effects +df.Price2 = df.Price +model = @formula Sales ~ Price + Price2 + fe(Year) +result = reg(df, model, save = true) +@test fe(result)[1, :fe_Year] ≈ 164.77833189721005 - # weights - model = @formula Sales ~ Price + fe(Year) - result = reg(df, model, weights = :Pop, save = true, method = method) - @test fe(result)[2, :fe_Year] - fe(result)[1, :fe_Year] ≈ -3.0347149502496222 +# iv +model = @formula Sales ~ (State ~ Price) + fe(Year) +result = reg(df, model, save = true) +@test fe(result)[1, :fe_Year] ≈ -167.48093490413623 - # IV and weights - model = @formula Sales ~ (Price ~ Pimin) + fe(Year) - result = reg(df, model, weights = :Pop, save = true, method = method) - @test fe(result)[1, :fe_Year] ≈ 168.24688 atol = 1e-4 +# weights +model = @formula Sales ~ Price + fe(Year) +result = reg(df, model, weights = :Pop, save = true) +@test fe(result)[2, :fe_Year] - fe(result)[1, :fe_Year] ≈ -3.0347149502496222 +# IV and weights +model = @formula Sales ~ (Price ~ Pimin) + fe(Year) +result = reg(df, model, weights = :Pop, save = true) +@test fe(result)[1, :fe_Year] ≈ 168.24688 atol = 1e-4 - # IV, weights and both year and state fixed effects - model = @formula Sales ~ (Price ~ Pimin) + fe(State) + fe(Year) - result = reg(df, model, weights = :Pop, save = true, method = method) - @test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State]≈ 147.84145 atol = 1e-4 +# IV, weights and both year and state fixed effects +model = @formula Sales ~ (Price ~ Pimin) + fe(State) + fe(Year) +result = reg(df, model, weights = :Pop, save = true) +@test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State]≈ 147.84145 atol = 1e-4 - # subset with IV - model = @formula Sales ~ (Price ~ Pimin) + fe(Year) - result = reg(df, model, subset = df.State .<= 30, save = true, method = method) - @test fe(result)[1, :fe_Year] ≈ 164.05245824240276 atol = 1e-4 - @test ismissing(fe(result)[811, :fe_Year]) +# subset with IV +model = @formula Sales ~ (Price ~ Pimin) + fe(Year) +result = reg(df, model, subset = df.State .<= 30, save = true) +@test fe(result)[1, :fe_Year] ≈ 164.05245824240276 atol = 1e-4 +@test ismissing(fe(result)[811, :fe_Year]) - # subset with IV, weights and year fixed effects - model = @formula Sales ~ (Price ~ Pimin) + fe(Year) - result = reg(df, model, subset = df.State .<= 30, weights = :Pop, save = true, method = method) - @test fe(result)[1, :fe_Year] ≈ 182.71915 atol = 1e-4 - # subset with IV, weights and year fixed effects - model = @formula Sales ~ (Price ~ Pimin) + fe(State) + fe(Year) - result = reg(df, model, subset = df.State .<= 30, weights = :Pop, save = true, method = method) - @test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State] ≈ 158.91798 atol = 1e-4 +# subset with IV, weights and year fixed effects +model = @formula Sales ~ (Price ~ Pimin) + fe(Year) +result = reg(df, model, subset = df.State .<= 30, weights = :Pop, save = true) +@test fe(result)[1, :fe_Year] ≈ 182.71915 atol = 1e-4 -end +# subset with IV, weights and year fixed effects +model = @formula Sales ~ (Price ~ Pimin) + fe(State) + fe(Year) +result = reg(df, model, subset = df.State .<= 30, weights = :Pop, save = true) +@test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State] ≈ 158.91798 atol = 1e-4 -if isdefined(FixedEffectModels.FixedEffects, :FixedEffectSolverLSMRGPU) - push!(methods_vec, :lsmr_gpu) +methods_vec = [:cpu] +if FixedEffectModels.FixedEffects.has_cuarrays() + push!(methods_vec, :gpu) end for method in methods_vec model = @formula Sales ~ Price + fe(Year)