In [1]:
using CSV, DataFrames, Statistics, Dates, Plots, LinearAlgebra, DecisionTree

In [2]:
trainData = CSV.read("../../data/train.csv", DataFrame)
testData = CSV.read("../../data/test.csv", DataFrame)
first(trainData, 5)

Row,annee,type,nombre_cylindres,cylindree,transmission,boite,consommation
Unnamed: 0_level_1,Int64,String31,Int64,String3,String15,String15,String31
1,2023,voiture_moyenne,8,44,integrale,automatique,138358823529412
2,2020,VUS_petit,4,2,integrale,automatique,980041666666667
3,2021,voiture_compacte,6,33,propulsion,automatique,117605
4,2023,voiture_deux_places,8,5,integrale,automatique,130672222222222
5,2022,voiture_moyenne,8,44,integrale,automatique,138358823529412


In [3]:
trainData.consommation = parse.(Float64,replace.(trainData.consommation, "," => "."))
trainData.cylindree = parse.(Float64,replace.(trainData.cylindree, "," => ".")) 

testData.cylindree = parse.(Float64,replace.(testData.cylindree, "," => "."))

150-element Vector{Float64}:
 2.5
 2.5
 2.5
 2.0
 5.8
 5.0
 5.0
 2.4
 3.5
 5.2
 5.2
 5.9
 2.0
 ⋮
 1.6
 3.3
 5.0
 2.0
 2.0
 1.6
 2.0
 2.0
 3.0
 3.0
 1.5
 2.0

In [4]:
trainData[!,:volume_gaz] = trainData[!,:nombre_cylindres] .* trainData[!,:cylindree]

testData[!,:volume_gaz] = testData[!,:nombre_cylindres] .* testData[!,:cylindree]

150-element Vector{Float64}:
 10.0
 10.0
 10.0
  8.0
 46.4
 40.0
 40.0
  9.6
 21.0
 52.0
 52.0
 70.80000000000001
  8.0
  ⋮
  6.4
 19.799999999999997
 40.0
  8.0
  8.0
  6.4
  8.0
  8.0
 18.0
 18.0
  4.5
  8.0

In [5]:
function encode(data, column)
    for c in unique(data[!, column])
        data[!, Symbol(c)] = ifelse.(data[!, column] .== c, 1, 0)
    end
    return data
end


function encode_data(data)
    encoded_data = deepcopy(data)
    encoded_data = encode(encoded_data, :type)
    encoded_data = encode(encoded_data, :transmission)
    encoded_data = encode(encoded_data, :boite)

    encoded_data.cylindree = encoded_data.cylindree .- minimum(encoded_data.cylindree)
    encoded_data.volume_gaz = encoded_data.volume_gaz .- minimum(encoded_data.volume_gaz)
    encoded_data.annee = encoded_data.annee .-minimum(encoded_data.annee)
    #encoded_data = encode(encoded_data, :nombre_cylindres)
    return encoded_data
end

function removeRows(data)
    return select!(data, Not([:type, :transmission, :boite]))
end

function normalize_column(data, column)
    data[!, column] = (data[!, column] .- mean(data[!, column])) ./ std(data[!, column])
    return data
end

function norm_cols(data)
    normalized_data = deepcopy(data)
    normalized_data = normalize_column(normalized_data, :cylindree)
    normalized_data = normalize_column(normalized_data, :volume_gaz)
    #normalized_data = normalize_column(normalized_data, :annee)
    return normalized_data
end

norm_cols (generic function with 1 method)

In [6]:
encoded_train = encode_data(trainData)
filtered_train = removeRows(encoded_train)
#normed_train = norm_cols(filtered_train)

encoded_test = encode_data(testData)
filtered_test = removeRows(encoded_test)
#normed_test = norm_cols(filtered_test)

X_train =  Matrix(filtered_train[:, Not(:consommation)])
y_train = filtered_train.consommation
X_test = Matrix(filtered_test)

150×23 Matrix{Float64}:
  0.0   4.0  1.3   6.4  1.0  0.0  0.0  …  0.0  1.0  0.0  0.0  0.0  1.0  0.0
  0.0   4.0  1.3   6.4  1.0  0.0  0.0     0.0  1.0  0.0  0.0  0.0  0.0  1.0
  0.0   4.0  1.3   6.4  0.0  1.0  0.0     0.0  1.0  0.0  0.0  0.0  0.0  1.0
  0.0   4.0  0.8   4.4  0.0  1.0  0.0     0.0  0.0  1.0  0.0  0.0  0.0  1.0
  0.0   8.0  4.6  42.8  0.0  0.0  1.0     0.0  0.0  0.0  1.0  0.0  1.0  0.0
  0.0   8.0  3.8  36.4  0.0  0.0  1.0  …  0.0  0.0  0.0  1.0  0.0  0.0  1.0
  0.0   8.0  3.8  36.4  0.0  0.0  1.0     0.0  0.0  0.0  1.0  0.0  1.0  0.0
  0.0   4.0  1.2   6.0  0.0  1.0  0.0     0.0  0.0  1.0  0.0  0.0  0.0  1.0
  0.0   6.0  2.3  17.4  0.0  1.0  0.0     0.0  0.0  0.0  0.0  1.0  0.0  1.0
  0.0  10.0  4.0  48.4  0.0  0.0  0.0     0.0  0.0  0.0  0.0  1.0  1.0  0.0
  0.0  10.0  4.0  48.4  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  1.0  1.0  0.0
  0.0  12.0  4.7  67.2  0.0  0.0  1.0     0.0  0.0  0.0  1.0  0.0  0.0  1.0
  0.0   4.0  0.8   4.4  0.0  1.0  0.0     0.0  1.0  0.0  0.0  0.

In [7]:
size(X_train), size(y_train), size(X_test)

((396, 23), (396,), (150, 23))

In [8]:
function decisionTree_remise(X_train ,y_train,  X_test)
    model = model = RandomForestRegressor(n_subfeatures=12, n_trees=600,min_samples_leaf=1,min_purity_increase=0.0, max_depth=10, min_samples_split=6)
    DecisionTree.fit!(model, X_train, y_train)
    ychap = DecisionTree.predict(model, X_test)
    return ychap
end

decisionTree_remise (generic function with 1 method)

In [9]:
res = decisionTree_remise(X_train ,y_train,  X_test)
testData.consommation = res

150-element Vector{Float64}:
  9.533122427867047
  9.924493526723806
  9.991116575619897
  9.605439347047174
 14.091098516407353
 14.311009432009193
 13.374469020966789
  9.955849620169417
 11.575024346969261
 14.66502807950151
 14.66502807950151
 15.075799664796541
 10.05678183982413
  ⋮
  8.185991323575495
 11.587632364132306
 13.386550419418748
  9.837515819799773
 10.131450100557183
  8.161728853577575
  8.391881064055928
  9.910797731805797
 10.782449726117967
 11.72780204164973
  7.963175746768973
  9.754729227014998

In [10]:
id = 1:150

df_pred = DataFrame(id=id, consommation=res)

CSV.write("benchmark5.csv", df_pred)

"benchmark5.csv"

In [11]:
benchmark = CSV.read("../soumission_2/benchmark1.csv", DataFrame)

Row,id,consommation
Unnamed: 0_level_1,Int64,Float64
1,1,8.7731
2,2,8.07005
3,3,9.54555
4,4,9.8891
5,5,14.683
6,6,14.683
7,7,14.683
8,8,9.8891
9,9,10.9249
10,10,15.3451




In [12]:
println(benchmark.consommation)
println(res)

[8.773101155974718, 8.070047540540804, 9.545548529167164, 9.889100280579937, 14.682977190876372, 14.682977190876372, 14.682977190876372, 9.889100280579937, 10.924893508841617, 15.345064163165286, 15.345064163165286, 15.345064163165286, 8.840702060295868, 9.545548529167164, 8.773101155974718, 11.221135830353218, 13.463562602124174, 13.463562602124174, 13.463562602124174, 13.463562602124174, 14.138433915671541, 13.463562602124174, 13.463562602124174, 14.138433915671541, 13.463562602124174, 14.138433915671541, 13.463562602124174, 14.138433915671541, 12.897491767520393, 10.620542993264337, 11.632846725905672, 9.545548529167164, 9.889100280579937, 11.632846725905672, 9.889100280579937, 7.842333308965198, 10.620542993264337, 10.620542993264337, 8.773101155974718, 9.545548529167164, 12.897491767520393, 10.924893508841617, 9.889100280579937, 8.070047540540804, 8.773101155974718, 11.632846725905672, 9.889100280579937, 10.924893508841617, 7.4361978718638, 15.345064163165286, 9.889100280579937, 1