# MTH3302 - Méthodes probabilistes et statistiques pour I.A.
#### Polytechnique Montréal
### Projet A2024
----
Équipe T - TODO
### Objectif
Prédiction de **la consommation en carburant de voitures récentes**.

### Données
Le jeu de données contient, pour près de 400 véhicules, la consommation moyenne en L/100 km, l'année de fabrication, le type de véhicule, le nombre de cylindres, la cylindrée, la transmission et la boîte.

- `train.csv` est l'ensemble d'entraînement
- `test.csv` est l'ensemble de test


## Chargement des données

Importation des librairies utilisées dans le calepin.

// TODO: à enlever à la fin:

Pour installer les librairies nécessaires :
using Pkg
Pkg.add(["CSV", "DataFrames", "Combinatorics", "Gadfly", "Distributions"], ...)

In [None]:
using CSV, DataFrames, Statistics, Dates, Gadfly, LinearAlgebra, Random



In [None]:
"""
    standardize(data)

Return `data` shifted by its mean and divided by its standard deviation
(as computed by `Statistics.mean` and `Statistics.std`).
"""
function standardize(data)
    center = mean(data)
    spread = std(data)
    return @. (data - center) / spread
end

"""
    standardize_data(data)

Return a deep copy of `data` in which every column whose element type is a subtype
of `Number` has been replaced by its standardised version (see `standardize`).
All other columns are left as they are; `data` itself is not modified.
"""
function standardize_data(data)
    stddata = deepcopy(data)
    # Pick the numeric columns first, then rewrite them one by one.
    numeric_columns = filter(col -> eltype(stddata[!, col]) <: Number, names(stddata))
    for col in numeric_columns
        stddata[!, col] = standardize(stddata[!, col])
    end
    return stddata
end

standardize_data (generic function with 1 method)

In [None]:
"""
    encode(data, column)

Add one indicator column to `data` for each distinct value found in `column`.
The new column is named after the value (`Symbol(value)`) and holds `1` on the rows
where `column` equals that value and `0` elsewhere.

`data` is modified in place and also returned; `column` itself is kept.
"""
function encode(data, column)
    levels = unique(data[!, column])
    for level in levels
        indicator_name = Symbol(level)
        data[!, indicator_name] = ifelse.(data[!, column] .== level, 1, 0)
    end
    return data
end

"""
    encode_data(data)

Return a deep copy of `data` in which `:type`, `:transmission` and `:boite` have been
expanded into indicator columns (see `encode`). The three source columns are then
dropped, together with `:cylindree` and `:nombre_cylindres`. `data` is not modified.
"""
function encode_data(data)
    categorical = [:type, :transmission, :boite]
    dropped = vcat(categorical, [:cylindree, :nombre_cylindres])

    encoded_data = deepcopy(data)
    for column in categorical
        encoded_data = encode(encoded_data, column)
    end
    return select!(encoded_data, Not(dropped))
end

encode_data (generic function with 2 methods)

In [None]:
"""
    getStandardEncodedData(data)

Return a standardised and indicator-encoded version of `data`: numeric columns are
standardised with `standardize_data`, then the result is passed to `encode_data`.

`data` is left untouched. No extra copy is taken here because both
`standardize_data` and `encode_data` already work on their own deep copy.
"""
function getStandardEncodedData(data)
    standardised_data = standardize_data(data)
    return encode_data(standardised_data)
end




getStandardEncodedData (generic function with 1 method)

In [None]:
"""
    rmse(y, ychap)

Return the root mean squared error between the observed values `y` and the
predictions `ychap`.
"""
function rmse(y, ychap)
    squared_errors = abs2.(ychap .- y)
    return sqrt(mean(squared_errors))
end


rmse (generic function with 1 method)

In [None]:
"""
    rsquared(y, ychap)

Return the coefficient of determination R² of the predictions `ychap` with respect
to the observed values `y`:

    R² = 1 - SS_res / SS_tot

where `SS_res = sum((y - ychap)^2)` and `SS_tot = sum((y - mean(y))^2)`.
A perfect fit gives `1.0`; always predicting `mean(y)` gives `0.0`.
"""
function rsquared(y, ychap)
    ss_total = sum(abs2, y .- mean(y))
    ss_res = sum(abs2, y .- ychap)
    # The ratio alone is the unexplained fraction of variance; R² is its complement.
    return 1 - ss_res / ss_total
end


rsquared (generic function with 1 method)

In [None]:
"""
    split_data(data, k; rng=Random.default_rng())

Randomly split the rows of `data` into a training part and a validation part.

# Arguments
- `data`: table or matrix indexable as `data[rows, :]`.
- `k`: fraction of the rows (between 0 and 1) assigned to the validation part;
  the validation size is `floor(Int, n * k)`.
- `rng`: random number generator used to draw the validation rows.

# Returns
`(training_data, validation_data)`. The two parts are disjoint and together contain
every row of `data` exactly once. Training rows keep their original order.

# Throws
- `ArgumentError` if `k` is outside `[0, 1]`.
"""
function split_data(data, k; rng=Random.default_rng())
    0 <= k <= 1 || throw(ArgumentError("k must be between 0 and 1, got $k"))

    n = size(data, 1)
    validation_size = floor(Int, n * k)

    # Draw WITHOUT replacement: `rand(1:n, m)` may repeat indices, which duplicates
    # rows in the validation set and leaves fewer than `m` distinct rows held out.
    validation_indices = randperm(rng, n)[1:validation_size]

    validation_data = data[validation_indices, :]
    training_data = data[setdiff(1:n, validation_indices), :]

    return training_data, validation_data
end

split_data (generic function with 1 method)

In [None]:
"""
    regression(standardised_data, trainTestSplitPercentage)

Fit a least-squares linear model of `:consommation` on every other column of
`standardised_data`, using a random train/validation split from `split_data`.

# Arguments
- `standardised_data`: table containing a `:consommation` column and the predictors.
- `trainTestSplitPercentage`: fraction of rows held out for validation.

# Returns
`(rmseval, beta)` where `beta` is the coefficient vector and `rmseval` the RMSE on
the held-out rows. When `trainTestSplitPercentage` is `0.0`, no evaluation is done
and `rmseval` is `0.0`.

NOTE(review): predictions and targets are rescaled with the std and mean of the
`:consommation` column of `standardised_data` itself. If that column is already
standardised, this is (nearly) the identity and the RMSE is in standardised units
rather than original units — confirm this is intended.
"""
function regression(standardised_data, trainTestSplitPercentage)
    train_part, held_out = split_data(standardised_data, trainTestSplitPercentage)
    design = Matrix(train_part[:, Not(:consommation)])
    target = train_part[:, :consommation]

    beta = design \ target

    # Nothing is held out: skip the evaluation entirely.
    if trainTestSplitPercentage == 0.0
        return 0.0, beta
    end

    response = standardised_data[:, :consommation]
    scale = std(response)
    shift = mean(response)
    rescale(values) = (values .* scale) .+ shift

    predicted = rescale(Matrix(held_out[:, Not(:consommation)]) * beta)
    observed = rescale(held_out[:, :consommation])
    return rmse(observed, predicted), beta
end


regression (generic function with 2 methods)

In [None]:
"""
    ridge_regression(standardised_data, lambda, trainTestSplitPercentage)

Fit a ridge (L2-penalised) linear model of `:consommation` on every other column of
`standardised_data`, using a random train/validation split from `split_data`.
The coefficients solve `(X'X + lambda*I) * beta = X'y`.

# Arguments
- `standardised_data`: table containing a `:consommation` column and the predictors.
- `lambda`: ridge penalty weight.
- `trainTestSplitPercentage`: fraction of rows held out for validation.

# Returns
`(rmseval, beta)` where `beta` is the coefficient vector and `rmseval` the RMSE on
the held-out rows. When `trainTestSplitPercentage` is `0.0`, no evaluation is done
and `rmseval` is `0.0`.

NOTE(review): predictions and targets are rescaled with the std and mean of the
`:consommation` column of `standardised_data` itself. If that column is already
standardised, this is (nearly) the identity and the RMSE is in standardised units
rather than original units — confirm this is intended.
"""
function ridge_regression(standardised_data, lambda, trainTestSplitPercentage)
    train_part, held_out = split_data(standardised_data, trainTestSplitPercentage)
    design = Matrix(train_part[:, Not(:consommation)])
    target = train_part[:, :consommation]

    gram = design' * design + lambda * I
    beta = gram \ (design' * target)

    # Nothing is held out: skip the evaluation entirely.
    if trainTestSplitPercentage == 0.0
        return 0.0, beta
    end

    response = standardised_data[:, :consommation]
    scale = std(response)
    shift = mean(response)
    rescale(values) = (values .* scale) .+ shift

    predicted = rescale(Matrix(held_out[:, Not(:consommation)]) * beta)
    observed = rescale(held_out[:, :consommation])
    return rmse(observed, predicted), beta
end

ridge_regression (generic function with 2 methods)

In [None]:
# Load the training and test sets from the local `data` directory into DataFrames.
trainData = CSV.read("./data/train.csv", DataFrame)
testData = CSV.read("./data/test.csv", DataFrame)
# Show the first row to check the column names and the types CSV inferred.
first(trainData, 1)

Row,annee,type,nombre_cylindres,cylindree,transmission,boite,consommation
Unnamed: 0_level_1,Int64,String31,Int64,String3,String15,String15,String31
1,2023,voiture_moyenne,8,44,integrale,automatique,138358823529412


In [None]:
# `consommation` and `cylindree` are read as strings (see the column types above):
# replace the decimal comma with a dot and parse the values as Float64.
# NOTE(review): only `trainData` is converted in this cell — confirm `testData`
# gets the same treatment wherever it is used.
trainData.consommation = parse.(Float64,replace.(trainData.consommation, "," => "."))
trainData.cylindree = parse.(Float64,replace.(trainData.cylindree, "," => "."))


396-element Vector{Float64}:
 4.4
 2.0
 3.3
 5.0
 4.4
 4.4
 1.5
 1.5
 3.8
 3.3
 3.3
 5.0
 4.4
 ⋮
 2.0
 3.7
 3.8
 2.4
 3.5
 2.0
 2.0
 4.4
 3.3
 1.5
 3.3
 2.4

In [None]:
# Derived feature: element-wise product of `nombre_cylindres` and `cylindree`.
trainData[!,:volume_gaz] = trainData[!,:nombre_cylindres] .* trainData[!,:cylindree]
# Show the first row to check the new column.
first(trainData, 1)


Row,annee,type,nombre_cylindres,cylindree,transmission,boite,consommation,volume_gaz
Unnamed: 0_level_1,Int64,String31,Int64,Float64,String15,String15,Float64,Float64
1,2023,voiture_moyenne,8,4.4,integrale,automatique,13.8359,35.2


In [None]:
# Compare plain and ridge regression by averaging the hold-out RMSE over repeated
# random splits (5 % of the rows held out each time).
data = getStandardEncodedData(trainData)
nrange = 100  # number of random splits evaluated per model

# Summing over a generator avoids assigning to a global inside a top-level `for`
# loop (which only works under interactive soft scope) and keeps `n` a Float64
# throughout instead of starting it as the Int `0`.
n = sum(regression(data, 0.05)[1] for _ in 1:nrange)
print("average rmse for regression: ", n/nrange, "\n")

n = sum(ridge_regression(data, 10, 0.05)[1] for _ in 1:nrange)
print("average rmse for ridge: ", n/nrange, "\n")


average rmse for regression: 0.4674914936484146
average rmse for ridge: 0.45067120544722783


In [None]:
# For each vehicle type, print a table grouped by transmission with the mean
# consumption, the mean `volume_gaz` and the number of rows.
for vehicle_type in unique(trainData.type)
    println(vehicle_type)
    type_rows = trainData[trainData.type .== vehicle_type, :]
    by_transmission = groupby(type_rows, :transmission)
    stats_table = combine(
        by_transmission,
        :consommation => mean,
        :volume_gaz => mean,
        nrow => :nrow,
    )
    println(stats_table)
    println()
end

In [None]:
# For each vehicle type, display a plot of consumption against `volume_gaz`.
for vehicle_type in unique(trainData.type)
    println(vehicle_type)
    type_rows = trainData[trainData.type .== vehicle_type, :]
    type_plot = plot(x=type_rows.volume_gaz, y=type_rows.consommation)
    display(type_plot)
    println()
end

## Exploration des données