In [332]:
using CSV, DataFrames, Statistics, Dates, Plots, LinearAlgebra, DecisionTree, Random

In [3]:
trainData = CSV.read("./data/train.csv", DataFrame)
testData = CSV.read("./data/test.csv", DataFrame)
first(trainData, 5)

Row,annee,type,nombre_cylindres,cylindree,transmission,boite,consommation
Unnamed: 0_level_1,Int64,String31,Int64,String3,String15,String15,String31
1,2023,voiture_moyenne,8,44,integrale,automatique,138358823529412
2,2020,VUS_petit,4,2,integrale,automatique,980041666666667
3,2021,voiture_compacte,6,33,propulsion,automatique,117605
4,2023,voiture_deux_places,8,5,integrale,automatique,130672222222222
5,2022,voiture_moyenne,8,44,integrale,automatique,138358823529412


In [4]:
trainData.consommation = parse.(Float64,replace.(trainData.consommation, "," => "."))
trainData.cylindree = parse.(Float64,replace.(trainData.cylindree, "," => ".")) 

testData.cylindree = parse.(Float64,replace.(testData.cylindree, "," => "."))

150-element Vector{Float64}:
 2.5
 2.5
 2.5
 2.0
 5.8
 5.0
 5.0
 2.4
 3.5
 5.2
 5.2
 5.9
 2.0
 ⋮
 1.6
 3.3
 5.0
 2.0
 2.0
 1.6
 2.0
 2.0
 3.0
 3.0
 1.5
 2.0

In [5]:
trainData[!,:volume_gaz] = trainData[!,:nombre_cylindres] .* trainData[!,:cylindree]

testData[!,:volume_gaz] = testData[!,:nombre_cylindres] .* testData[!,:cylindree]

150-element Vector{Float64}:
 10.0
 10.0
 10.0
  8.0
 46.4
 40.0
 40.0
  9.6
 21.0
 52.0
 52.0
 70.80000000000001
  8.0
  ⋮
  6.4
 19.799999999999997
 40.0
  8.0
  8.0
  6.4
  8.0
  8.0
 18.0
 18.0
  4.5
  8.0

In [333]:
function split_data(data,k)
    n = size(data, 1)
    validation_size = floor(Int, n * k)

    validation_indices = rand(1:n, validation_size)  
    
    validation_data = data[validation_indices, :]
    training_data = data[setdiff(1:n, validation_indices), :]
    
    return training_data, validation_data
end



function train_test_split(data, test_size=0.2, shuffle=true)
    n = size(data, 1)
    test_size = floor(Int, n * test_size)
    
    if shuffle
        indices = randperm(n)
    else
        indices = 1:n
    end
    
    test_indices = indices[1:test_size]
    train_indices = indices[test_size+1:end]
    
    train_data = data[train_indices, :]
    test_data = data[test_indices, :]
    
    return train_data, test_data
end

train_test_split (generic function with 3 methods)

In [457]:
function encode(data, column)
    for c in unique(data[!, column])
        data[!, Symbol(c)] = ifelse.(data[!, column] .== c, 1, 0)
    end
    return data
end


function encode_data(data)
    encoded_data = deepcopy(data)
    encoded_data = encode(encoded_data, :type)
    encoded_data = encode(encoded_data, :transmission)
    encoded_data = encode(encoded_data, :boite)
    return encoded_data
end

function removeRows(data)
    return select!(data, Not([:type, :transmission, :boite]))
end

function normalize_column(data, column)
    data[!, column] = (data[!, column] .- mean(data[!, column])) ./ std(data[!, column])
    return data
end

function norm_cols(data)
    normalized_data = deepcopy(data)
    normalized_data = normalize_column(normalized_data, :cylindree)
    normalized_data = normalize_column(normalized_data, :volume_gaz)
    normalized_data = normalize_column(normalized_data, :nombre_cylindres)
    normalized_data = normalize_column(normalized_data, :annee)

    return normalized_data
end

norm_cols (generic function with 1 method)

In [458]:
encoded_train = encode_data(trainData)
encoded_train = removeRows(encoded_train)
#encoded_train = normalize(encoded_train)
encoded_train = norm_cols(encoded_train)
train, test= train_test_split(encoded_train)


#= encoded_train =#

X_train =  Matrix(train[:, Not(:consommation)])
y_train = train.consommation
X_test = Matrix(test[:, Not(:consommation)])
y_test = test.consommation

size(X_test),size(X_train),size(y_test),size(y_train)

((79, 23), (317, 23), (79,), (317,))

In [337]:
train

Row,annee,nombre_cylindres,cylindree,consommation,volume_gaz,voiture_moyenne,VUS_petit,voiture_compacte,voiture_deux_places,voiture_minicompacte,VUS_standard,monospace,voiture_sous_compacte,camionnette_petit,break_petit,voiture_grande,camionnette_standard,break_moyen,integrale,propulsion,traction,4x4,automatique,manuelle
Unnamed: 0_level_1,Int64,Int64,Float64,Float64,Float64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,2023,8,4.4,12.3795,35.2,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,2021,4,2.0,7.58742,8.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
3,2017,4,2.0,8.71148,8.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,2015,6,3.3,11.2005,19.8,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
5,2018,6,3.3,11.7605,19.8,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0
6,2014,8,4.2,13.8359,33.6,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
7,2016,8,5.0,13.0672,40.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0
8,2015,8,5.0,13.0672,40.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0
9,2018,4,2.0,8.71148,8.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
10,2021,8,5.0,13.0672,40.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [None]:

res = []
for i in 1:20
    n = 0
    for i in 1:10
        model = RandomForestRegressor(n_trees=i)
    DecisionTree.fit!(model, X_train, y_train)
    y_pred = DecisionTree.predict(model, X_test)
    n += sqrt(mean((y_pred - y_test).^2))
    end
    push!(res,n/10)
end



# display the grid
plot(1:20, res, label="RandomForestRegressor", xlabel="n_trees", ylabel="RMSE", title="RandomForestRegressor RMSE vs n_trees")

In [None]:

res = []
for i in 1:23
    n = 0
    for j in 1:10
        model = RandomForestRegressor(n_subfeatures=i)
    DecisionTree.fit!(model, X_train, y_train)
    y_pred = DecisionTree.predict(model, X_test)
    n += sqrt(mean((y_pred - test.consommation).^2))
    end
    push!(res,n/10)
end



# display the grid
plot(1:23, res, label="RandomForestRegressor", xlabel="n_subfeatures", ylabel="RMSE", title="RandomForestRegressor RMSE vs n_trees")

In [None]:
res = []
for i in 1:23
    n = 0
    for j in 1:10
        model = RandomForestRegressor(min_samples_leaf=i)
    DecisionTree.fit!(model, X_train, y_train)
    y_pred = DecisionTree.predict(model, X_test)
    n += sqrt(mean((y_pred - test.consommation).^2))
    end
    push!(res,n/10)
end



# display the grid
plot(1:23, res, label="RandomForestRegressor", xlabel="min_samples_leaf", ylabel="RMSE", title="RandomForestRegressor RMSE vs n_trees")

In [None]:
res = []
for i in 1:23
    n = 0
    for j in 1:10
        model = RandomForestRegressor(min_purity_increase=i)
    DecisionTree.fit!(model, X_train, y_train)
    y_pred = DecisionTree.predict(model, X_test)
    n += sqrt(mean((y_pred - test.consommation).^2))
    end
    push!(res,n/10)
end



# display the grid
plot(1:23, res, label="RandomForestRegressor", xlabel="min_samples_leaf", ylabel="RMSE", title="RandomForestRegressor RMSE vs n_trees")

In [None]:
res = []
for i in 2:23
    n = 0
    for j in 1:10
        model = RandomForestRegressor(min_samples_split=i)
    DecisionTree.fit!(model, X_train, y_train)
    y_pred = DecisionTree.predict(model, X_test)
    n += sqrt(mean((y_pred - test.consommation).^2))
    end
    push!(res,n/10)
end



# display the grid
plot(2:23, res, label="RandomForestRegressor", xlabel="min_samples_leaf", ylabel="RMSE", title="RandomForestRegressor RMSE vs n_trees")

In [485]:
@doc RandomForestRegressor

```
RandomForestRegressor(; n_subfeatures::Int=-1,
                      n_trees::Int=10,
                      partial_sampling::Float=0.7,
                      max_depth::Int=-1,
                      min_samples_leaf::Int=5,
                      rng=Random.GLOBAL_RNG,
                      impurity_importance::Bool=true)
```

Random forest regression. See [DecisionTree.jl's documentation](https://github.com/bensadeghi/DecisionTree.jl)

Hyperparameters:

  * `n_subfeatures`: number of features to consider at random per split (default: -1, sqrt(# features))
  * `n_trees`: number of trees to train (default: 10)
  * `partial_sampling`: fraction of samples to train each tree on (default: 0.7)
  * `max_depth`: maximum depth of the decision trees (default: no maximum)
  * `min_samples_leaf`: the minimum number of samples each leaf needs to have (default: 5)
  * `min_samples_split`: the minimum number of samples in needed for a split
  * `min_purity_increase`: minimum purity needed for a split
  * `rng`: the random number generator to use. Can be an `Int`, which will be used to seed and create a new random number generator. Multi-threaded forests must be seeded with an `Int`
  * `impurity_importance`: whether to calculate feature importances using `Mean Decrease in Impurity (MDI)`. See [`DecisionTree.impurity_importance`](@ref).

Implements `fit!`, `predict`, `get_classes`


In [502]:
model = RandomForestRegressor(n_subfeatures=12, n_trees=600,min_samples_leaf=1,min_purity_increase=0.0, max_depth=10, min_samples_split=6,)
DecisionTree.fit!(model, X_train, y_train)
y_pred = DecisionTree.predict(model, X_test)
sqrt(mean((y_pred - test.consommation).^2))

0.9131591102805092

In [488]:
@doc DecisionTreeRegressor

```
DecisionTreeRegressor(; pruning_purity_threshold=0.0,
                      max_depth::Int-1,
                      min_samples_leaf::Int=5,
                      min_samples_split::Int=2,
                      min_purity_increase::Float=0.0,
                      n_subfeatures::Int=0,
                      rng=Random.GLOBAL_RNG,
                      impurity_importance::Bool=true)
```

Decision tree regression. See [DecisionTree.jl's documentation](https://github.com/bensadeghi/DecisionTree.jl)

Hyperparameters:

  * `pruning_purity_threshold`: (post-pruning) merge leaves having `>=thresh` combined purity (default: no pruning). This accuracy-based method may not be appropriate for regression tree.
  * `max_depth`: maximum depth of the decision tree (default: no maximum)
  * `min_samples_leaf`: the minimum number of samples each leaf needs to have (default: 5)
  * `min_samples_split`: the minimum number of samples in needed for a split (default: 2)
  * `min_purity_increase`: minimum purity needed for a split (default: 0.0)
  * `n_subfeatures`: number of features to select at random (default: keep all)
  * `rng`: the random number generator to use. Can be an `Int`, which will be used to seed and create a new random number generator.
  * `impurity_importance`: whether to calculate feature importances using `Mean Decrease in Impurity (MDI)`. See [`DecisionTree.impurity_importance`](@ref)

Implements `fit!`, `predict`, `get_classes`


In [515]:
model = DecisionTreeRegressor(n_subfeatures=12, min_samples_leaf=1,min_purity_increase=0.0, max_depth=10, min_samples_split=6,)
DecisionTree.fit!(model, X_train, y_train)
y_pred = DecisionTree.predict(model, X_test)
sqrt(mean((y_pred - test.consommation).^2))

1.1704408817014724

In [456]:
y_pred

79-element Vector{Float64}:
  1.2240951129805515
  0.17625130984704987
  1.479696235370256
 -0.2607463003952433
  1.5211869207713695
 -1.3097410769834612
 -1.376475788529015
 -1.3608160998913268
  0.6165740389681241
 -1.2787160338849795
  1.4209770672951387
  1.7941338351390625
  0.3887079431074021
  ⋮
  0.37544214897858164
 -1.5136770521451326
 -1.079500521944099
  1.174229577971678
 -1.0333093095750074
 -0.01294650848269291
 -1.2879443398905748
 -1.4793632637667964
  1.2431097360103887
 -1.304503701086083
 -0.8312720241469432
 -1.2995868865253335

In [None]:
function decisionNormal(X_train ,y_train,  X_test, max_depth)
    model = DecisionTreeRegressor(max_depth=max_depth)
    fit!(model, X_train, y_train) #peut etre normaliser sur tout les données, pas justr sur les données d'entrainement
    ychap =  predict(model, X_test)  
    return ychap
end

decisionTree_remise (generic function with 1 method)

In [455]:
res = decisionTree_remise(X_train ,y_train,  X_test, 5)
testData.consommation = res

LoadError: UndefVarError: `decisionTree_remise` not defined

In [21]:
id = 1:150

df_pred = DataFrame(id=id, consommation=res)

CSV.write("benchmark1.csv", df_pred)

"benchmark1.csv"