In [393]:
using CSV, DataFrames, Statistics, Dates, Gadfly, LinearAlgebra, GLMNet

In [394]:
function standardize(data)
    return (data .- mean(data)) ./ std(data)
end

function standardize_data(data)
    stddata = deepcopy(data)
   for col in names(stddata)
        if eltype(stddata[!, col]) <: Number
            stddata[!, col] = standardize(stddata[!, col])
        end
    end
    return stddata
end

standardize_data (generic function with 1 method)

In [395]:
function encode(data, column)
    for c in unique(data[!, column])
        data[!, Symbol(c)] = ifelse.(data[!, column] .== c, 1, 0)
    end
    return data
end

function encode_data(data)
    encoded_data = deepcopy(data)
    encoded_data = encode(encoded_data, :type)
    encoded_data = encode(encoded_data, :transmission)
    encoded_data = encode(encoded_data, :boite)
    return encoded_data
end

encode_data (generic function with 1 method)

In [396]:
# faire une méthode pour les outliers

# code pas tester:
# function remove_outliers(data)
#     return data[(abs.(zscore(data)) .< 3) .| isnan.(zscore(data)), :]
# end

In [397]:
function removeRows(data)
    return select!(data, Not([:type, :transmission, :boite]))
    # return select!(data, Not([:annee]))
end

removeRows (generic function with 1 method)

In [398]:
function concatenate(data)
    data[!, :typecar] = data[!, :type] .* data[!, :transmission] .* data[!, :boite]
    return removeRows(data)
end

concatenate (generic function with 1 method)

In [399]:
function getStandardEncodedData(data)
    data_copy = deepcopy(data)
    standardised_data = standardize_data(data_copy)
    # standardised_data = encode_data(standardised_data)
    # standardised_data = removeRows(standardised_data)
    standardised_data = concatenate(standardised_data)
    standardised_data = encode(standardised_data, :typecar)
    select!(standardised_data, Not([:typecar]))
    return standardised_data
end




getStandardEncodedData (generic function with 1 method)

In [400]:
function rmse(y, ychap)
    return sqrt(mean((ychap .- y).^2))
end


rmse (generic function with 1 method)

In [401]:
function rsquared(y, ychap)
    ss_total = sum((y .- mean(y)).^2)
    ss_res = sum((y .- ychap).^2)
    return ss_res / ss_total
end


rsquared (generic function with 1 method)

In [402]:
function split_data(data,k)
    n = size(data, 1)
    validation_size = floor(Int, n * k)

    validation_indices = rand(1:n, validation_size)  
    
    validation_data = data[validation_indices, :]
    training_data = data[setdiff(1:n, validation_indices), :]
    
    return training_data, validation_data
end

split_data (generic function with 1 method)

In [403]:
function regression(standardised_data, trainTestSplitPercentage)
    training_data, test_data = split_data(standardised_data, trainTestSplitPercentage)
    X_train =  Matrix(training_data[:, Not(:consommation)])
    y_train = training_data[:, :consommation]

    beta = X_train \ y_train

    rmseval = 0.0
    if trainTestSplitPercentage != 0.0
        X_test = Matrix(test_data[:, Not(:consommation)])
        y_test = test_data[:, :consommation]
        ychap =  X_test * beta
        ychap = (ychap .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        y_test = (y_test .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        # print("regression\n")
        # print("r2 = ", rsquared(y_test, ychap), "\n")
        rmseval = rmse(y_test, ychap)
        # print("rmse = ", rmseval , "\n\n")
    end
    
    return rmseval, beta
end


regression (generic function with 1 method)

In [404]:
function ridge_regression(standardised_data, lambda, trainTestSplitPercentage)
    training_data, test_data = split_data(standardised_data, trainTestSplitPercentage)
    X_train = Matrix(training_data[:, Not([:consommation])])
    y_train = training_data[:, :consommation]
    beta = (X_train'X_train + lambda*I)\X_train'y_train

    rmseval = 0.0
    if trainTestSplitPercentage != 0.0
        X_test = Matrix(test_data[:, Not(:consommation)])
        y_test = test_data[:, :consommation]
        ychap =  X_test * beta
        ychap = (ychap .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        y_test = (y_test .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        # print("ridge: \n")
        # print("r2 = ", rsquared(y_test, ychap), "\n")
        rmseval = rmse(y_test, ychap)
        # print("rmse = ", rmseval , "\n\n")
    end
    return rmseval, beta
end

ridge_regression (generic function with 1 method)

In [405]:
function lasso_regression(standardised_data, lambda, trainTestSplitPercentage)
    training_data, test_data = split_data(standardised_data, trainTestSplitPercentage)
    X_train = Matrix(training_data[:, Not([:consommation])])
    y_train = training_data[:, :consommation]

    fit = glmnet(X_train, y_train, alpha = 1.0, lambda = [lambda]) # alpha = 1.0 pour lasso

    beta = fit.betas[:, 1]

    rmseval = 0.0
    if trainTestSplitPercentage != 0.0
        X_test = Matrix(test_data[:, Not(:consommation)])
        y_test = test_data[:, :consommation]
        
        ychap = X_test * beta  
        
        ychap = (ychap .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        y_test = (y_test .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        
        rmseval = rmse(y_test, ychap) 
    end

    return rmseval, beta
end

lasso_regression (generic function with 1 method)

In [406]:
# for polynomial regression
function construct_structure(x::Matrix{<:Real}, order::Int)
    n, m = size(x)
    poly_terms = [x[:, j].^p for j in 1:m, p in 0:order]
    X = hcat(poly_terms...)
    return X
end

construct_structure (generic function with 2 methods)

In [407]:
function polynomial_regression(data::DataFrame, degree::Int, trainTestSplitPercentage::Float64)
    training_data, test_data = split_data(data, trainTestSplitPercentage)
    X_train = construct_structure(Matrix(training_data[:, Not([:consommation])]), degree)
    y_train = training_data[:, :consommation]

    beta = X_train \ y_train

    rmseval = 0.0

    if trainTestSplitPercentage != 0.0
        X_test = construct_structure(Matrix(test_data[:, Not([:consommation])]), degree)
        y_test = test_data[:, :consommation]

        ychap = X_test * beta
        ychap = (ychap .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        y_test = (y_test .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN

        rmseval = rmse(y_test, ychap)
    end

    return rmseval, beta
end


polynomial_regression (generic function with 2 methods)

In [408]:
trainData = CSV.read("./data/train.csv", DataFrame)
testData = CSV.read("./data/test.csv", DataFrame)
first(testData, 1)

Row,annee,type,nombre_cylindres,cylindree,transmission,boite
Unnamed: 0_level_1,Int64,String31,Int64,String3,String15,String15
1,2014,voiture_moyenne,4,25,traction,manuelle


In [409]:
trainData.consommation = parse.(Float64,replace.(trainData.consommation, "," => "."))
trainData.cylindree = parse.(Float64,replace.(trainData.cylindree, "," => "."))
testData.cylindree = parse.(Float64,replace.(testData.cylindree, "," => "."))


150-element Vector{Float64}:
 2.5
 2.5
 2.5
 2.0
 5.8
 5.0
 5.0
 2.4
 3.5
 5.2
 5.2
 5.9
 2.0
 ⋮
 1.6
 3.3
 5.0
 2.0
 2.0
 1.6
 2.0
 2.0
 3.0
 3.0
 1.5
 2.0

In [410]:
# trainData[!,:volume_gaz] = trainData[!,:nombre_cylindres] .* trainData[!,:cylindree]
# testData[!,:volume_gaz] = testData[!,:nombre_cylindres] .* testData[!,:cylindree]
first(trainData, 1)
COMSOMMATION_MEAN = mean(trainData.consommation)
COMSOMMATION_STD = std(trainData.consommation)


2.139763088813657

In [411]:
data = getStandardEncodedData(trainData)
# print(first(data, 1))
nrange = 1000
n = 0
for i in range(0, 1, length=nrange)
    n += regression(data, 0.2)[1]
end
print("average rmse for regression: ", n/nrange, "\n")

n = 0
for i in range(0, 1, length=nrange)
    r = ridge_regression(data, 10, 0.2)[1]
    # if r > 1.7
    #     print("ridge rmse: ", r, "\n")
    # end
    n += r
    
end
print("average rmse for ridge: ", n/nrange, "\n")

# n = 0
# for i in range(0, 1, length=nrange)
#     n += svd_regression(data, 0.05)[1]
# end
# print("average rmse for svd: ", n/nrange, "\n")

#lasso
n = 0
for i in range(0, 1, length=nrange)
    n += lasso_regression(data, 0.1, 0.2)[1]
end
print("average rmse for lasso: ", n/nrange, "\n")

n=0
for i in range(0, 1, length=nrange)
    n += polynomial_regression(data, 3, 0.2)[1]
end
print("average rmse for polynomial regression: ", n/nrange, "\n")


average rmse for regression: 0.9201962486537227
average rmse for ridge: 0.9690545155821302
average rmse for lasso: 1.0631923504552345
average rmse for polynomial regression: 0.9129991974804436
