In [3]:
using CSV, DataFrames, Statistics, Dates, Plots, LinearAlgebra, DecisionTree

In [2]:
trainData = CSV.read("../../data/train.csv", DataFrame)
testData = CSV.read("../../data/test.csv", DataFrame)
first(trainData, 5)

Row,annee,type,nombre_cylindres,cylindree,transmission,boite,consommation
Unnamed: 0_level_1,Int64,String31,Int64,String3,String15,String15,String31
1,2023,voiture_moyenne,8,44,integrale,automatique,138358823529412
2,2020,VUS_petit,4,2,integrale,automatique,980041666666667
3,2021,voiture_compacte,6,33,propulsion,automatique,117605
4,2023,voiture_deux_places,8,5,integrale,automatique,130672222222222
5,2022,voiture_moyenne,8,44,integrale,automatique,138358823529412


In [6]:
trainData.consommation = parse.(Float64,replace.(trainData.consommation, "," => "."))
trainData.cylindree = parse.(Float64,replace.(trainData.cylindree, "," => ".")) 

testData.cylindree = parse.(Float64,replace.(testData.cylindree, "," => "."))

150-element Vector{Float64}:
 2.5
 2.5
 2.5
 2.0
 5.8
 5.0
 5.0
 2.4
 3.5
 5.2
 5.2
 5.9
 2.0
 ⋮
 1.6
 3.3
 5.0
 2.0
 2.0
 1.6
 2.0
 2.0
 3.0
 3.0
 1.5
 2.0

In [7]:
trainData[!,:volume_gaz] = trainData[!,:nombre_cylindres] .* trainData[!,:cylindree]

testData[!,:volume_gaz] = testData[!,:nombre_cylindres] .* testData[!,:cylindree]

150-element Vector{Float64}:
 10.0
 10.0
 10.0
  8.0
 46.4
 40.0
 40.0
  9.6
 21.0
 52.0
 52.0
 70.80000000000001
  8.0
  ⋮
  6.4
 19.799999999999997
 40.0
  8.0
  8.0
  6.4
  8.0
  8.0
 18.0
 18.0
  4.5
  8.0

In [8]:
function encode(data, column)
    for c in unique(data[!, column])
        data[!, Symbol(c)] = ifelse.(data[!, column] .== c, 1, 0)
    end
    return data
end


function encode_data(data)
    encoded_data = deepcopy(data)
    encoded_data = encode(encoded_data, :type)
    encoded_data = encode(encoded_data, :transmission)
    encoded_data = encode(encoded_data, :boite)

    encoded_data.cylindree = encoded_data.cylindree .- minimum(encoded_data.cylindree)
    encoded_data.volume_gaz = encoded_data.volume_gaz .- minimum(encoded_data.volume_gaz)
    encoded_data.annee = encoded_data.annee .-minimum(encoded_data.annee)
    #encoded_data = encode(encoded_data, :nombre_cylindres)
    return encoded_data
end

function removeRows(data)
    return select!(data, Not([:type, :transmission, :boite]))
end

function normalize_column(data, column)
    data[!, column] = (data[!, column] .- mean(data[!, column])) ./ std(data[!, column])
    return data
end

function norm_cols(data)
    normalized_data = deepcopy(data)
    normalized_data = normalize_column(normalized_data, :cylindree)
    normalized_data = normalize_column(normalized_data, :volume_gaz)
    #normalized_data = normalize_column(normalized_data, :annee)
    return normalized_data
end

norm_cols (generic function with 1 method)

In [9]:
encoded_train = encode_data(trainData)
filtered_train = removeRows(encoded_train)
#normed_train = norm_cols(filtered_train)

encoded_test = encode_data(testData)
filtered_test = removeRows(encoded_test)
#normed_test = norm_cols(filtered_test)

X_train =  Matrix(filtered_train[:, Not(:consommation)])
y_train = filtered_train.consommation
X_test = Matrix(filtered_test)

150×23 Matrix{Float64}:
  0.0   4.0  1.3   6.4  1.0  0.0  0.0  …  0.0  1.0  0.0  0.0  0.0  1.0  0.0
  0.0   4.0  1.3   6.4  1.0  0.0  0.0     0.0  1.0  0.0  0.0  0.0  0.0  1.0
  0.0   4.0  1.3   6.4  0.0  1.0  0.0     0.0  1.0  0.0  0.0  0.0  0.0  1.0
  0.0   4.0  0.8   4.4  0.0  1.0  0.0     0.0  0.0  1.0  0.0  0.0  0.0  1.0
  0.0   8.0  4.6  42.8  0.0  0.0  1.0     0.0  0.0  0.0  1.0  0.0  1.0  0.0
  0.0   8.0  3.8  36.4  0.0  0.0  1.0  …  0.0  0.0  0.0  1.0  0.0  0.0  1.0
  0.0   8.0  3.8  36.4  0.0  0.0  1.0     0.0  0.0  0.0  1.0  0.0  1.0  0.0
  0.0   4.0  1.2   6.0  0.0  1.0  0.0     0.0  0.0  1.0  0.0  0.0  0.0  1.0
  0.0   6.0  2.3  17.4  0.0  1.0  0.0     0.0  0.0  0.0  0.0  1.0  0.0  1.0
  0.0  10.0  4.0  48.4  0.0  0.0  0.0     0.0  0.0  0.0  0.0  1.0  1.0  0.0
  0.0  10.0  4.0  48.4  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  1.0  1.0  0.0
  0.0  12.0  4.7  67.2  0.0  0.0  1.0     0.0  0.0  0.0  1.0  0.0  0.0  1.0
  0.0   4.0  0.8   4.4  0.0  1.0  0.0     0.0  1.0  0.0  0.0  0.

In [10]:
size(X_train), size(y_train), size(X_test)

((396, 23), (396,), (150, 23))

In [11]:
function decisionTree_remise(X_train ,y_train,  X_test)
    model = model = RandomForestRegressor(n_subfeatures=12, n_trees=600,min_samples_leaf=1,min_purity_increase=0.0, max_depth=10, min_samples_split=6)
    DecisionTree.fit!(model, X_train, y_train)
    ychap = DecisionTree.predict(model, X_test)
    return ychap
end

decisionTree_remise (generic function with 1 method)

In [12]:
res = decisionTree_remise(X_train ,y_train,  X_test)
testData.consommation = res

150-element Vector{Float64}:
  9.586057010923264
  9.991908621249095
  9.964387156230877
  9.587383615313524
 14.198700850265208
 14.478373358114695
 13.528113729538982
  9.920027790156436
 11.58164852191427
 14.712751472276686
 14.712751472276686
 15.107087763706025
 10.051715191924206
  ⋮
  8.049301130597426
 11.488109732726155
 13.37367611307947
  9.764434839433623
 10.15006933842474
  8.03404508208437
  8.395407805040302
  9.888721405425269
 10.77947174410176
 11.709875702135543
  7.833658935438558
  9.793088520066277

In [13]:
id = 1:150

df_pred = DataFrame(id=id, consommation=res)

CSV.write("benchmark5.csv", df_pred)

"benchmark5.csv"