# Grupo 2 - PC3/4

### Integrantes
- GARCIA RODRIGUEZ, EMILIO ALONSO (Python)
- PADILLA AQUISE, ALESSANDRO PIERO (R)
- RIEGA NUÑEZ, GABRIEL ANTONIO FERMIN (R)
- SALAMANCA FERNANDEZ, LUCAS PABLO (Julia)
- SILVA ANDUJAR, NICOLAS (Python)

## Part 1

In [1]:
using CSV
using DataFrames
using CategoricalArrays
using Pkg
using Distributions
using Dates
using Plots
using Random
using LinearAlgebra
using LaTeXStrings
using Lasso
using Statistics
using GLMNet
using StatsModels
using HDMjl

# Read the wage subsample (occ2 and ind2 are forced to String) and drop the
# `wage` and `rownames` columns in a single step.
data = select(
    CSV.read(
        "wage2015_subsample_inference.csv",
        DataFrame;
        types = Dict(:occ2 => String, :ind2 => String),
    ),
    Not(["wage", "rownames"]),
);




In [2]:

# Columns that are converted in place to categorical arrays
categorical_vars = ["sex", "clg", "mw", "so", "we", "ne", "occ2", "ind2"]

foreach(categorical_vars) do col
    data[!, col] = categorical(data[!, col])
end


In [3]:
# Regression formula for `lwage` used to build the design matrix.
# The right-hand side is made of:
#   - sex, clg and (mw, so, we, ne) together with their two- and three-way interactions;
#   - (exp1..exp4) interacted with hsg, scl, clg, ad, mw, so, we, ne, occ2 and ind2;
#   - (hsg, scl, clg, ad) interacted with mw, so, we, ne, occ2 and ind2;
#   - (mw, so, we, ne) interacted with occ2 and ind2, plus occ2 * ind2.
# `a * b` in a formula expands to both main effects and their interaction.
design = @formula(
    lwage ~ 1 + sex + clg * sex + clg * (mw + so + we + ne) + sex * (mw + so + we + ne)+ clg * sex * (mw + so + we + ne) + (exp1 + exp2 + exp3 + exp4) * (hsg + scl + clg + ad + mw + so + we + ne + occ2 + ind2)
    + (hsg + scl + clg + ad) * (mw + so + we + ne + occ2 + ind2) + (mw + so + we + ne) * (occ2 + ind2) + occ2 * ind2
);

In [None]:
# Bind the formula to the data so names and matrix come from the same object
mf = ModelFrame(design, data)

# Names of all design columns except the leading intercept
X_names = coefnames(mf)[2:end]

# Print the names so they can be inspected
println("Coefficient Names:")
println(X_names)

# Numeric design matrix implied by the ModelFrame
mm = modelmatrix(mf)

# Predictors: every column after the intercept column
X = mm[:, 2:size(mm, 2)]

# Outcome: log wage
y = data[!, :lwage]




In [None]:
# Split the design columns into treatment and control sets

# Regexes for the coefficient names of clg and of its interactions
treatment_patterns = [
    r"^clg: 1\.0$",                      # clg
    r"^clg: 1\.0 & sex: 1\.0$",          # clg:sex
    r"^clg: 1\.0 & (mw|so|we|ne): 1\.0$",# clg:mw, clg:so, clg:we, clg:ne
    r"^clg: 1\.0 & sex: 1\.0 & (mw|so|we|ne): 1\.0$" # clg:sex:mw, clg:sex:so, etc.
]

# Positions of the names that match at least one treatment pattern
treatment_indices = findall(X_names) do name
    any(pat -> occursin(pat, name), treatment_patterns)
end

# Everything that is not a treatment column is a control column
control_indices = setdiff(eachindex(X_names), treatment_indices)

# Print both name lists for a visual check
println("\nTreatment Variables Identified:")
println(X_names[treatment_indices])
println(X_names[control_indices])

In [None]:

# Column-slice the predictor matrix into treatments (D) and controls (Z)
D = X[:, treatment_indices]
Z = X[:, control_indices]

# Report the sizes of the three pieces that go into the estimation
for (label, value) in (
    "\nDimensions of D (Treatment): " => size(D),
    "Dimensions of Z (Control): " => size(Z),
    "Length of y (Target): " => length(y),
)
    println(label, value)
end




In [None]:
# Double-selection lasso, run once per treatment column.
# One row (estimate, standard error, t-statistic, p-value) is collected per treatment.
results_df = DataFrame(
    Variable = String[],
    Coefficient = Float64[],
    StdError = Float64[],
    TStatistic = Float64[],
    PValue = Float64[],
)

for (col, name_idx) in enumerate(treatment_indices)
    treatment_var = X_names[name_idx]
    d_i = D[:, col]  # materialised copy of the current treatment column

    println("\nPerforming Double Lasso for Treatment Variable: $treatment_var")

    # A failure for one treatment is reported and that treatment is skipped
    result_i = try
        rlassoEffect(Z, y, d_i; method = "double selection")
    catch e
        println("Error during rlassoEffect for $treatment_var: ", e)
        continue
    end

    # Show which entries the estimator returned
    println("Keys in the result: ", keys(result_i))

    # All three entries are needed to fill a result row
    has_all_keys = all(k -> k in keys(result_i), ("coefficients", "se", "t"))
    if !has_all_keys
        println("Missing necessary keys in the result for $treatment_var. Skipping...")
        continue
    end

    # Two-sided p-value from the standard normal distribution
    t_stats = result_i["t"]
    p_values = 2 .* (1 .- cdf.(Normal(), abs.(t_stats)))

    push!(results_df, (
        Variable = treatment_var,
        Coefficient = result_i["coefficients"],
        StdError = result_i["se"],
        TStatistic = t_stats,
        PValue = p_values,
    ))
end


In [None]:
# Summarize and interpret the results

# Full table of estimates
println("\nSummary of Estimated Parameters:")
println(results_df)

# Rows whose p-value is below the 5% level
significant_results = results_df[results_df.PValue .< 0.05, :]

println("\nSignificant Parameters (p-value < 0.05):")
println(significant_results)

# Print a per-variable breakdown, or a note when nothing is significant
if nrow(significant_results) == 0
    println("No significant parameters found at the 5% significance level.")
else
    println("\nInterpretation:")
    for row in eachrow(significant_results)
        println("Variable: ", row.Variable)
        for (label, col) in (
            "  Coefficient: " => :Coefficient,
            "  StdError: " => :StdError,
            "  T-Statistic: " => :TStatistic,
            "  P-Value: " => :PValue,
        )
            println(label, row[col])
        end
    end
end

### Discussion
First, the coefficient on having a college degree by itself is associated with an increase in wages of approximately 49%. Next, when this term is interacted with sex, the coefficient is 0.088 and significant, highlighting that a gender wage premium exists even among college graduates. The same applies when the college graduate variable is interacted with both sex and South and with both sex and West, showing that this wage premium is even stronger in these regions, which makes sense, as these are more conservative regions. There are also some interesting wage premiums associated with geographical location alone, in the case of the Midwest, South and Northeast, due to these being regions with considerably stronger and larger industries.

## Part 2

In [None]:
using RDatasets, DataFrames, MLJ, MLJDecisionTreeInterface, DecisionTree, StatsPlots, MLJModels

# Load the Hitters data from the ISLR collection and preview its first five rows
hitters = dataset("ISLR", "Hitters")
first(hitters, 5)

In [None]:
# Report the number of missing values in each column of `hitters`
describe(hitters, :nmissing)

# Note from the output above: 59 salaries are missing

In [61]:
# Keep only the rows whose `Salary` is not missing
hitters = dropmissing(hitters, :Salary);

In [None]:

# Split the table into the target (`Salary`) and the remaining feature columns
y, X = unpack(hitters, ==(:Salary), !=(:Salary))

# One-hot encode League, Division and NewLeague.
# `MLJ.fit!` returns the fitted machine, so it can be piped straight into the binding.
onehotencoder = @load OneHotEncoder pkg=MLJModels verbosity=0
ohe = onehotencoder(features = [:League, :Division, :NewLeague])
ohe_machine = machine(ohe, X) |> MLJ.fit!;
X = MLJ.transform(ohe_machine, X);


In [None]:
# Show the column names and types of the encoded feature table
MLJ.schema(X)

In [64]:
# Re-type every Count column of X as Continuous (in place)
MLJ.coerce!(X, Count => MLJ.Continuous)

# Append the target to the features; the bracket form is horizontal concatenation
df = [X y];


In [94]:
# 90/10 split of the row indices, then materialise both subsets
train_indices, test_indices = partition(eachindex(y), 0.9, rng=1)
train_df = df[train_indices, :];
test_df = df[test_indices, :];

# Apply the same preparation to each subset: name the appended target column,
# add its logarithm as `LogSalary`, and drop the raw `Salary` column.
for part in (train_df, test_df)
    rename!(part, :x1 => :Salary)
    part.LogSalary = log.(part.Salary)
    select!(part, Not(:Salary))
end

In [None]:
# List the column names of the training frame (these are the names the OLS formula refers to)
println(names(train_df))


In [None]:
# Fit OLS for LogSalary on the training set.
# League, Division and NewLeague each enter as a pair of one-hot dummies
# (e.g. League__A and League__N). Putting both dummies of a pair next to the
# intercept makes the design matrix perfectly collinear, so their coefficients
# are not identified and the bootstrap distribution for them is meaningless.
# One dummy per pair (League__N, Division__W, NewLeague__N) is therefore left
# out as the reference level. The column space, and hence the fitted values and
# predictions, is unchanged.
fmla = @formula(LogSalary ~ 1 + AtBat + Hits + HmRun + Runs + RBI + Walks + Years + CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + League__A + Division__E + PutOuts + Assists + Errors + NewLeague__A)
ols_model = lm(fmla, train_df)

In [None]:
# OLS point estimates from the model fitted on the training set
b_hat = GLM.coef(ols_model)

# Number of estimated coefficients (the formula includes an intercept);
# used to size the bootstrap arrays
n_features = length(b_hat)


In [None]:
# Nonparametric bootstrap of the OLS coefficients:
# 10,000 refits, one row of `b_boots` per bootstrap sample.
n_boots = 10_000
n_train = nrow(train_df)
b_boots = zeros(n_boots, n_features)
Random.seed!(123)  # fixed seed so the draws are reproducible

for b in 1:n_boots
    # Draw n_train row numbers with replacement and refit on those rows
    resample = rand(1:n_train, n_train)
    b_boots[b, :] .= GLM.coef(lm(fmla, train_df[resample, :]))
end

n_b_boots = size(b_boots)


In [None]:
# 2.5% and 97.5% percentiles of the bootstrap draws, one value per coefficient
# (each column of `b_boots` holds the draws for one coefficient)
b_lower = [quantile(draws, 0.025) for draws in eachcol(b_boots)]
b_upper = [quantile(draws, 0.975) for draws in eachcol(b_boots)]
length(b_lower)

In [None]:
# Basic (pivotal) bootstrap confidence intervals.
# `b_lower` / `b_upper` are percentiles of the raw bootstrap coefficients, so the
# interval is obtained by reflecting them around the point estimate:
#     [2 * b_hat - q_0.975, 2 * b_hat - q_0.025]
# Subtracting the percentiles from `b_hat` only once would give an interval
# centred near zero instead of near the estimate.
b_conf_lower = 2 .* b_hat .- b_upper
b_conf_upper = 2 .* b_hat .- b_lower

# Display the point estimate and interval for every coefficient
coef_names = coefnames(ols_model)
for (name, estimate, lo, hi) in zip(coef_names, b_hat, b_conf_lower, b_conf_upper)
    println("Coefficient: $(name)")
    println("Point Estimate: $(estimate)")
    println("95% Confidence Interval: [$(lo), $(hi)]")
    println()
end

In [None]:
using Statistics

# Predict LogSalary on the test set.
# `ols_model` was fitted with GLM's `lm`, so the prediction must go through
# GLM's `predict`, qualified in the same way as `GLM.coef` elsewhere in this file.
# NOTE(review): MLJ's `predict` appears to be a separate function meant for MLJ
# machines; confirm it has no method for a GLM fit.
y_pred = GLM.predict(ols_model, test_df)

# Out-of-sample mean squared error on the log-salary scale
mse = mean((test_df.LogSalary .- y_pred) .^ 2)

println("Out-of-sample MSE: $mse")

In [None]:
# 90/10 split of the row indices for the tree models
train, test = partition(eachindex(y), 0.9, rng = 1);

# Load the regression tree model type and fit a default tree on the training rows.
# `MLJ.fit!` returns the fitted machine, which is what gets bound here.
DecisionTreeRegressor = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0
tree_model = DecisionTreeRegressor()
tree_machine = MLJ.fit!(machine(tree_model, X[train, :], y[train]))


In [None]:
# Inspect the first entry of the fitted parameters of the already-fitted machine
# (presumably the learned tree — confirm against the displayed output)
fitted_params(tree_machine)[1]

In [None]:
# Test-set RMSE of the unpruned tree
predictions = MLJ.predict(tree_machine, X[test, :])
mean((predictions - y[test]) .^ 2) |> sqrt

In [None]:
# Cross-validate a grid of pruning thresholds and report the one with the lowest RMSE
tree_model_prune = DecisionTreeRegressor(post_prune = true, merge_purity_threshold = 0.6)
tree_machine_prune = machine(tree_model_prune, X[train, :], y[train])
MLJ.fit!(tree_machine_prune);

# Log-spaced grid of thresholds between exp(-10) and 1
thresholds = exp.(-10:0.01:0)

# Typed accumulator filled in place: avoids rebuilding the vector on every
# iteration and does not rebind a global from inside the loop.
rmses = Float64[]
sizehint!(rmses, length(thresholds))

for threshold in thresholds
    tree_model_prune.merge_purity_threshold = threshold
    # The resampling strategy is rebuilt on each pass on purpose: constructing it
    # with the same integer seed keeps the folds the same for every threshold.
    evaluation = evaluate!(
        tree_machine_prune,
        resampling = CV(nfolds = 3, shuffle = true, rng = 123),
        measure = rmse
    )
    # `measurement` holds one value per measure; only `rmse` was requested
    append!(rmses, evaluation.measurement)
end

thresholds[argmin(rmses)]



In [None]:
# Refit the pruned tree with the threshold that had the lowest cross-validated
# RMSE, then score it on the test rows
tree_model_prune = DecisionTreeRegressor(
    post_prune = true,
    merge_purity_threshold = thresholds[argmin(rmses)],
)
tree_machine_prune = machine(tree_model_prune, X[train, :], y[train]) |> MLJ.fit!;
predictions_prune = MLJ.predict(tree_machine_prune, X[test, :])
mean((predictions_prune - y[test]) .^ 2) |> sqrt

In [None]:
# Compare the three models on a common scale.
# The OLS model is fitted to LogSalary, while both trees are fitted to the raw
# Salary vector `y`, so their errors are in different units. To make the RMSEs
# comparable, the OLS targets and predictions are mapped back to the salary
# scale with `exp` before the error is computed.
# NOTE(review): `exp` of a log-scale prediction is a plain back-transform with no
# retransformation correction; confirm this is acceptable for the comparison.
results_df = DataFrame(
    Model = String[],
    RMSE = Float64[]
)

# 1. RMSE for OLS, on the salary scale
ols_rmse = sqrt(mean((exp.(test_df.LogSalary) .- exp.(y_pred)) .^ 2))
push!(results_df, ("OLS with Bootstrapping", ols_rmse))

# 2. RMSE for the unpruned tree
unpruned_tree_rmse = sqrt(mean((y[test] .- predictions) .^ 2))
push!(results_df, ("Unpruned Tree", unpruned_tree_rmse))

# 3. RMSE for the pruned tree
pruned_tree_rmse = sqrt(mean((y[test] .- predictions_prune) .^ 2))
push!(results_df, ("Pruned Tree", pruned_tree_rmse))

# Summary table of the RMSEs
println("Model Comparison Summary:")
println(results_df)
