# MTH3302 - Méthodes probabilistes et statistiques pour I.A.
#### Polytechnique Montréal
### Projet A2024
----
Équipe T - TODO
### Objectif
Prédiction de **la consommation en carburant de voitures récentes**.

### Données
Le jeu de données contient pour presque 400 véhicule, la consommation moyenne en L/100km, l'année de frabrication, le type de véhicule, le nombre de cylindre, cylindree, la transmission et la boite.

- `train.csv` est l'ensemble d'entraînement
- `test.csv` est l'ensemble de test


## Chargement des données

Importation des librairies utilisées dans le calepin.

// TODO: à enlever à la fin:

Pour importer librairies:
using Pkg
Pkg.add(["CSV", "DataFrames", "Combinatorics", "Gadfly", "Distributions"], ...)

In [2840]:
# import Pkg; Pkg.add("Plots")

In [2841]:
# using CSV, DataFrames, Statistics, Dates, Gadfly, LinearAlgebra, Plots

In [2842]:
function standardize(data)
    return (data .- mean(data)) ./ std(data)
end

function standardize_data(data)
    stddata = deepcopy(data)
   for col in names(stddata)
        if eltype(stddata[!, col]) <: Number && col != "id"
            stddata[!, col] = standardize(stddata[!, col])
        end
    end
    return stddata
end

standardize_data (generic function with 1 method)

In [2843]:
function encode(data, column)
    for c in unique(data[!, column])
        data[!, Symbol(c)] = ifelse.(data[!, column] .== c, 1, 0)
    end
    return data
end

function encode_data(data)
    encoded_data = deepcopy(data)
    encoded_data = encode(encoded_data, :general_type)
    # encoded_data = encode(encoded_data, :annee)
    # encoded_data = encode(encoded_data, :type)
    encoded_data = encode(encoded_data, :transmission)
    encoded_data = encode(encoded_data, :boite)
    return encoded_data
end

encode_data (generic function with 1 method)

In [2844]:
# faire une méthode pour les outliers

# code pas tester:
# function remove_outliers(data)
#     return data[(abs.(zscore(data)) .< 3) .| isnan.(zscore(data)), :]
# end

In [2845]:
function removeRows(data)
    return select!(data, Not([:type, :transmission, :boite, :general_type,]))
    # return select!(data, Not([:annee]))
end

removeRows (generic function with 1 method)

In [2846]:
function add_rows(data)
    data[!,:volume_gaz] = data[!,:nombre_cylindres] .* data[!,:cylindree]

    # https://www.insurancenavy.com/average-car-weight/
    # https://www.auto-tests.com/fr/lightest-weight/Wagon/all/
    weight_dict = Dict("voiture_moyenne" => 3300, "VUS_petit" => 3500, "voiture_compacte" => 2800, "voiture_deux_places" => 2800, "voiture_minicompacte" => 1500, "VUS_standard" => 5000, "monospace" => 4500, "voiture_sous_compacte" => 2600, "camionnette_petit" => 4200, "break_petit" => 2640, "voiture_grande" => 4400, "camionnette_standard" => 4700, "break_moyen" => 3300)
    data[!, :weight] = [weight_dict[t] for t in data[!, :type]]

    general_type_dict = Dict("voiture_moyenne" => "voiture", "VUS_petit" => "VUS", "voiture_compacte" => "voiture", "voiture_deux_places" => "voiture", "voiture_minicompacte" => "voiture", "VUS_standard" => "VUS", "monospace" => "camionnette", "voiture_sous_compacte" => "voiture", "camionnette_petit" => "camionnette", "break_petit" => "break", "voiture_grande" => "voiture", "camionnette_standard" => "camionnette", "break_moyen" => "break")
    data[!, :general_type] = [general_type_dict[t] for t in data[!, :type]]
    
    # display(plot(data, x=:general_type, y=:consommation, color=:type))
    # println(combine(groupby(data, :type), nrow => :count))
    return data
end

add_rows (generic function with 1 method)

In [2847]:
function getStandardEncodedData(data)
    data_copy = deepcopy(data)
    standardised_data = add_rows(data_copy)
    standardised_data = standardize_data(data_copy)
    standardised_data = encode_data(standardised_data)
    standardised_data = removeRows(standardised_data)
    
    # print(standardised_data)
    return standardised_data
end




getStandardEncodedData (generic function with 1 method)

In [2848]:
function rmse(y, ychap)
    return sqrt(mean((ychap .- y).^2))
end


rmse (generic function with 1 method)

In [2849]:
function rsquared(y, ychap)
    ss_total = sum((y .- mean(y)).^2)
    ss_res = sum((y .- ychap).^2)
    return ss_res / ss_total
end


rsquared (generic function with 1 method)

In [2850]:
function split_data(data,k)
    n = size(data, 1)
    validation_size = floor(Int, n * k)

    validation_indices = rand(1:n, validation_size)  
    
    validation_data = data[validation_indices, :]
    training_data = data[setdiff(1:n, validation_indices), :]
    
    return training_data, validation_data
end

split_data (generic function with 1 method)

In [2851]:
function regression(standardised_data, trainTestSplitPercentage)
    training_data, test_data = split_data(standardised_data, trainTestSplitPercentage)
    X_train =  Matrix(training_data[:, Not(:consommation, :id)])
    y_train = training_data[:, :consommation]

    beta = X_train \ y_train

    rmseval = 0.0
    if trainTestSplitPercentage != 0.0
        X_test = Matrix(test_data[:, Not(:consommation, :id)])
        y_test = test_data[:, :consommation]
        ychap =  X_test * beta
        ychap = (ychap .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        y_test = (y_test .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        # print("regression\n")
        # print("r2 = ", rsquared(y_test, ychap), "\n")
        rmseval = rmse(y_test, ychap)
        # print("rmse = ", rmseval , "\n\n")
    end
    
    return rmseval, beta, training_data, test_data
end


regression (generic function with 1 method)

In [2852]:
function ridge_regression(standardised_data, lambda, trainTestSplitPercentage)
    training_data, test_data = split_data(standardised_data, trainTestSplitPercentage)
    X_train = Matrix(training_data[:, Not([:consommation, :id])])
    y_train = training_data[:, :consommation]
    beta = (X_train'X_train + lambda*I)\X_train'y_train

    rmseval = 0.0
    if trainTestSplitPercentage != 0.0
        X_test = Matrix(test_data[:, Not(:consommation, :id)])
        y_test = test_data[:, :consommation]
        ychap =  X_test * beta
        ychap = (ychap .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        y_test = (y_test .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        # print("ridge: \n")
        # print("r2 = ", rsquared(y_test, ychap), "\n")
        rmseval = rmse(y_test, ychap)
        # print("rmse = ", rmseval , "\n\n")
    end
    return rmseval, beta, training_data, test_data
end

ridge_regression (generic function with 1 method)

In [2853]:
# pas tester: (aucune idee si ca marche)

function svd_regression(standardised_data, trainTestSplitPercentage)
    training_data, test_data = split_data(standardised_data, trainTestSplitPercentage)
    X_train = Matrix(training_data[:, Not([:consommation, :id])])
    y_train = training_data[:, :consommation]
    
    U, S, V = svd(X_train)

    # beta = V * (Diagonal(1 ./ S) * U' * y_train)

    beta = V' * Diagonal([s > 1e-10 ? 1/s : 0 for s in S]) * U' * y_train

    rmseval = 0.0
    if trainTestSplitPercentage != 0.0
        X_test = Matrix(test_data[:, Not(:consommation,:id)])
        y_test = test_data[:, :consommation]
        ychap =  X_test * beta
        ychap = (ychap .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        y_test = (y_test .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        # print("regression\n")
        # print("r2 = ", rsquared(y_test, ychap), "\n")
        rmseval = rmse(y_test, ychap)
        # print("rmse = ", rmseval , "\n\n")
    end
    return rmseval,beta, training_data, test_data
end

svd_regression (generic function with 1 method)

In [2854]:
# for polynomial regression
function construct_structure(x::Matrix{<:Real}, order::Int)
    n, m = size(x)
    poly_terms = [x[:, j].^p for j in 1:m, p in 0:order]
    X = hcat(poly_terms...)
    return X
end

function polynomial_regression(data::DataFrame, degree::Int, trainTestSplitPercentage::Float64)
    training_data, test_data = split_data(data, trainTestSplitPercentage)
    X_train = construct_structure(Matrix(training_data[:, Not([:consommation])]), degree)
    y_train = training_data[:, :consommation]

    beta = X_train \ y_train

    rmseval = 0.0

    if trainTestSplitPercentage != 0.0
        X_test = construct_structure(Matrix(test_data[:, Not([:consommation])]), degree)
        y_test = test_data[:, :consommation]

        ychap = X_test * beta
        ychap = (ychap .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
        y_test = (y_test .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN

        rmseval = rmse(y_test, ychap)
    end

    return rmseval, beta
end

polynomial_regression (generic function with 1 method)

In [2855]:
trainData = CSV.read("./data/train.csv", DataFrame)
testData = CSV.read("./data/test.csv", DataFrame)
trainData.consommation = parse.(Float64,replace.(trainData.consommation, "," => "."))
trainData.cylindree = parse.(Float64,replace.(trainData.cylindree, "," => "."))
testData.cylindree = parse.(Float64,replace.(testData.cylindree, "," => "."))
COMSOMMATION_MEAN = mean(trainData.consommation)
COMSOMMATION_STD = std(trainData.consommation)
trainData[!, :id] = 1:nrow(trainData)
testData[!, :id] = 1:nrow(testData)


1:150

In [2856]:
# trainData[!,:volume_gaz] = trainData[!,:nombre_cylindres] .* trainData[!,:cylindree]
# testData[!,:volume_gaz] = testData[!,:nombre_cylindres] .* testData[!,:cylindree]




In [2857]:
function evaluate_rmse()
    data = getStandardEncodedData(trainData)
    nrange = 1000
    n = 0
    for i in range(0, 1, length=nrange)
        n += regression(data, 0.05)[1]
    end
    print("average rmse for regression: ", n/nrange, "\n")

    n = 0
    for i in range(0, 1, length=nrange)
        n += ridge_regression(data, 0.1, 0.05)[1]
    end
    print("average rmse for ridge: ", n/nrange, "\n")

    n = 0
    for i in range(0, 1, length=nrange)
        n += svd_regression(data, 0.05)[1]
    end
    print("average rmse for svd: ", n/nrange, "\n")

    nrange = 1000
    n = 0
    for i in range(0, 1, length=nrange)
        n += polynomial_regression(data,2, 0.05)[1]
    end
    print("average rmse for polynomial regression: ", n/nrange, "\n")

end


evaluate_rmse (generic function with 1 method)

In [2858]:


function predict(data, beta)
    return (Matrix(data[:, Not(:consommation, :id)]) * beta)
end

function destandardize(data)
    return (data .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
end

function predict_and_destandardize(data, beta)
    return destandardize(predict(data, beta))
end



function find_mistakes(data_to_predict, beta, col)
    prediction = (Matrix(data_to_predict[:, Not(:consommation, :id)]) * beta)
    prediction = (prediction .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
    data_to_predict[!,:consommation] = data_to_predict[!,:consommation] .* COMSOMMATION_STD .+ COMSOMMATION_MEAN

    idtrain = 1:size(prediction, 1)

    base_data_of_predicted = innerjoin(data_to_predict, trainData, on=:id, makeunique=true)

    scatter(idtrain, prediction)
    scatter!(idtrain, base_data_of_predicted[!,:consommation])


    difference = prediction - base_data_of_predicted[!,:consommation]
    base_data_of_predicted[!, :difference] = difference
    diff_cutoff = 1
    high_diff_rows = base_data_of_predicted[abs.(difference) .> diff_cutoff, :]

    grouped_normal = combine(groupby(base_data_of_predicted, col), nrow => :count)

    grouped_high_diff = combine(groupby(high_diff_rows, col), nrow => :count)

    grouped_high_diff[!, :total_diff] = combine(groupby(high_diff_rows, col), :difference => (x -> sum(abs.(x))) => :total_diff)[:, :total_diff]

    max_diff = combine(groupby(high_diff_rows, col), :difference => (x -> maximum(abs.(x))) => :max_diff)

    grouped_high_diff = leftjoin(grouped_high_diff, max_diff, on=col)
    grouped_high_diff[!, :average_diff] = combine(groupby(high_diff_rows, col), :difference => (x -> mean(abs.(x))) => :average_abs_diff)[:, :average_abs_diff]

    grouped_high_diff[!, :rmse] = combine(groupby(high_diff_rows, col), :difference => (x -> sqrt(mean(x.^2))) => :rmse)[:, :rmse]
    grouped_normal[!, :rmse] = combine(groupby(base_data_of_predicted, col), :difference => (x -> sqrt(mean(x.^2))) => :rmse)[:, :rmse]

    percentage_high_diff = leftjoin(grouped_high_diff, grouped_normal, on=col, makeunique=true)
    percentage_high_diff[!, :percentage] = percentage_high_diff[!, :count] ./ percentage_high_diff[!, :count_1] .* 100
    percentage_high_diff = sort(percentage_high_diff, :percentage, rev=true)

    println("Percentage of data by ", col," with difference higher than 1: ")
    println(percentage_high_diff[:, [col, :percentage, :count, :count_1, :total_diff, :max_diff, :average_diff, :rmse]])

    grouped_normal = sort(grouped_normal, :rmse, rev=true)

    println(grouped_normal[:, [col, :count, :rmse]])
end




find_mistakes (generic function with 2 methods)

In [2859]:
# rmseval, betatrain, traindata, testpredictdata = regression(data, 0.2)
# find_mistakes(testpredictdata, betatrain, :annee)

In [2860]:
# beta = ridge_regression(data, 10, 0.0)[2]
function remise_regression(beta)
    test_data = getStandardEncodedData(testData)
    X_test = Matrix(test_data[!,Not(:id)])
    ychap =  X_test * beta
    ychap = (ychap .* COMSOMMATION_STD) .+ COMSOMMATION_MEAN
    remise(ychap)
end
using Dates
function remise(prediction)
    id = 1:150
    df_pred = DataFrame(id=id, consommation=prediction)
    
   
    current_time = Dates.format(now(), "yyyy-mm-dd_HH-MM-SS")
    file_name = "benchmark_" * current_time * ".csv"
    CSV.write("./soumissions_potentielles/" *file_name, df_pred)
end


remise (generic function with 1 method)

In [2861]:
function main()
    data = getStandardEncodedData(trainData)
    rmseval, betatrain, traindata, testpredictdata = regression(data, 0.2)
    evaluate_rmse()
    find_mistakes(testpredictdata, betatrain, :type)
    remise_regression(betatrain)
end
main()

average rmse for regression: 0.9242509616292423
average rmse for ridge: 0.9395010680177696
average rmse for svd: 1.9989225763669052
average rmse for polynomial regression: 0.9249379131016187
Percentage of data by type with difference higher than 1: 
[1m11×8 DataFrame[0m
[1m Row [0m│[1m type                  [0m[1m percentage [0m[1m count [0m[1m count_1 [0m[1m total_diff [0m[1m max_diff [0m[1m average_diff [0m[1m rmse    [0m
     │[90m String31              [0m[90m Float64    [0m[90m Int64 [0m[90m Int64?  [0m[90m Float64    [0m[90m Float64? [0m[90m Float64      [0m[90m Float64 [0m
─────┼────────────────────────────────────────────────────────────────────────────────────────────────
   1 │ voiture_moyenne          100.0         4        4    14.3086    5.7589        3.57714  4.04285
   2 │ voiture_minicompacte     100.0         1        1     4.86241   4.86241       4.86241  4.86241
   3 │ VUS_standard             100.0         3        3     6.68131  

"./soumissions_potentielles/benchmark_2024-11-27_16-58-33.csv"

In [2862]:
# for type in unique(trainData.type)
#     println(type)
#     data_type = trainData[trainData.type .== type, :]
#     println(combine(groupby(data_type, :transmission), :consommation => mean, :volume_gaz => mean, nrow => :nrow))
#     println()
# end

In [2863]:
# for type in unique(trainData.type)
#     println(type)
#     data_type = trainData[trainData.type .== type, :]
#     display(plot(x=data_type.volume_gaz, y=data_type.consommation))
#     println()
# end

## Exploration des données