# Imports

In [167]:
using CSV
using DataFrames
using Statistics
using Dates

# Cleaning the data

## Eliminating double commas ",," after the wrong column
Replacing them with a single ","

In [181]:
# Set up the paths: the raw CSV to read and the repaired copy to write
path = "dataSets/damCombustible.csv"
cleaned_path = "dataSets/damCombustible_cleaned.csv"

"""
    fix_csv_line(line::String)

Return `line` with the run of empty comma-separated fields that directly follows
the fourth field removed.

Nothing is removed when the line has four fields or fewer, or when the fourth
field is itself empty. The last field of the line is never removed, even when it
is empty.
"""
function fix_csv_line(line::String)
    horometro_index = 3
    anchor = horometro_index + 1    # field after which stray empties are dropped
    suspect = horometro_index + 2   # position inspected for an empty field

    fields = split(line, ",")

    if length(fields) > anchor && !isempty(fields[anchor])
        # Each deletion shifts the next field into `suspect`, so keep re-checking it.
        while length(fields) > suspect && fields[suspect] == ""
            deleteat!(fields, suspect)
        end
    end

    return join(fields, ",")
end
        

"""
    correct_csv_file(input_path::String, output_path::String)

Read every line of `input_path`, pass each one through `fix_csv_line`, and write
the results to `output_path`, one per line.

The input is read completely before the output file is opened for writing.
"""
function correct_csv_file(input_path::String, output_path::String)
    repaired = map(fix_csv_line, readlines(input_path))

    open(output_path, "w") do io
        foreach(row -> println(io, row), repaired)
    end
end

# Write a repaired copy of the raw CSV to `cleaned_path`.
correct_csv_file(path, cleaned_path)

# Load the repaired file into a DataFrame and show a per-column summary.
df = CSV.read(cleaned_path, DataFrame)
describe(df)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,Nro.,320.709,2,322.5,638,0,Int64
2,Vehículo,105.566,101,105.0,110,0,Int64
3,Odómetro,812749.0,436624,1003400.0,1142707,0,Int64
4,Horómetro,40.0,4,33.0,86,618,"Union{Missing, Int64}"
5,Fecha,,01/abr./2024 12:39:00,,31/may./2024 1:54:00,0,String31
6,Tanqueo Full,,N,,S,0,String1
7,Costo por Volumen,,2324,,2697,0,String7
8,Cant.,,100,,9913,0,String7
9,Unidad,,Litros,,Litros,0,String7
10,Costo Total,,100045,,9999,0,String7


## Converting columns to the correct data types
Fecha = DateTime

Tanqueo Full = Bool

Costo por Volumen = Float32

Cantidad = Float32

Costo Total = Float32

In [182]:
# List the column names of `df` before renaming.
propertynames(df)

12-element Vector{Symbol}:
 Symbol("Nro.")
 :Vehículo
 :Odómetro
 :Horómetro
 :Fecha
 Symbol("Tanqueo Full")
 Symbol("Costo por Volumen")
 Symbol("Cant.")
 :Unidad
 Symbol("Costo Total")
 :Tipo
 :Column12

In [183]:
# Rename the columns to accent-free names without spaces or dots
rename!(df, [
    Symbol("Nro.") => :Numero,
    :Vehículo => :Vehiculo,
    :Odómetro => :Odometro,
    :Horómetro => :Horometro,
    Symbol("Tanqueo Full") => :Tanque_Lleno,
    Symbol("Costo por Volumen") => :Costo_Por_Volumen,
    Symbol("Cant.") => :Cantidad,
    Symbol("Costo Total") => :Costo_Total,
])

# Keep every column except the ones that are not needed
select!(df, Not([:Column12, :Horometro, :Unidad, :Tipo]))

# Show the remaining column names
propertynames(df)

8-element Vector{Symbol}:
 :Numero
 :Vehiculo
 :Odometro
 :Fecha
 :Tanque_Lleno
 :Costo_Por_Volumen
 :Cantidad
 :Costo_Total

In [184]:
# Turn the "S"/"N" flag into a boolean: true where the value is "S", false otherwise
df.Tanque_Lleno = df.Tanque_Lleno .== "S"

#---------------------------------------------------------------

# Date conversion
# Replace the Spanish month abbreviation with its two-digit month number
"""
    replaceMonths(date)

Return `date` with a Spanish month abbreviation (`"ene."` … `"dic."`) replaced by
its two-digit month number.

The abbreviations are checked in calendar order; every occurrence of the first
one found is replaced and the result is returned immediately. When `date`
contains none of them it is returned unchanged.
"""
function replaceMonths(date)
    # A literal tuple of pairs: no Dict is allocated on each call, and the lookup
    # order is fixed (calendar order) instead of depending on hash iteration order.
    months = (
        "ene." => "01", "feb." => "02", "mar." => "03", "abr." => "04",
        "may." => "05", "jun." => "06", "jul." => "07", "ago." => "08",
        "sep." => "09", "oct." => "10", "nov." => "11", "dic." => "12",
    )

    for (mes, month) in months
        if occursin(mes, date)
            return replace(date, mes => month)
        end
    end

    return date
end

# Replace the month abbreviations so every date is purely numeric.
df.Fecha = replaceMonths.(df.Fecha)

# Parse the text into DateTime values. The DateFormat is built once (the undotted
# call is evaluated a single time) and reused for every row, instead of handing
# the pattern string to each individual DateTime call.
df.Fecha = Dates.DateTime.(df.Fecha, Dates.DateFormat("dd/mm/yyyy HH:MM:SS"))

#---------------------------------------------------------------

# Text columns whose values contain commas and must become numbers
columns_with_commas_to_convert = [:Costo_Por_Volumen, :Cantidad, :Costo_Total]

# For each of them, swap commas for dots and parse the result as Float32
for col in columns_with_commas_to_convert
    df[!, col] = parse.(Float32, replace.(df[!, col], "," => "."))
end

In [185]:
# Summarise every column again to check the converted types.
describe(df)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Any,Any,Int64,DataType
1,Numero,320.709,2,322.5,638,0,Int64
2,Vehiculo,105.566,101,105.0,110,0,Int64
3,Odometro,812749.0,436624,1.0034e6,1142707,0,Int64
4,Fecha,,2024-04-01T12:39:00,2024-05-31T16:37:16,2024-08-06T17:33:18,0,DateTime
5,Tanque_Lleno,0.800633,false,1.0,true,0,Bool
6,Costo_Por_Volumen,25.2959,23.24,25.29,26.97,0,Float32
7,Cantidad,160.645,13.0,120.025,665.0,0,Float32
8,Costo_Total,4058.61,329.0,3055.5,16353.0,0,Float32


In [186]:
# Dimensions of `df` as (rows, columns).
size(df)

(632, 8)

# Separating training and test data

In [187]:
# 80% for training
train_ratio = 0.8
# Number of training rows: that share of the row count, rounded to an integer
n_training = round(Int, train_ratio * nrow(df))

506

In [188]:
# Positional split: the first `n_training` rows are the training set and the
# remaining rows are the test set. No shuffling is applied here.
# NOTE(review): the test rows displayed below this cell are in descending date
# order, so the split appears to follow the file's date ordering — confirm that
# this is the intended way to separate the two sets.
trainData = df[1:n_training, :]
testData = df[n_training+1:end, :]

Row,Numero,Vehiculo,Odometro,Fecha,Tanque_Lleno,Costo_Por_Volumen,Cantidad,Costo_Total
Unnamed: 0_level_1,Int64,Int64,Int64,DateTime,Bool,Float32,Float32,Float32
1,128,103,755893,2024-04-27T05:44:42,true,23.29,158.22,3685.04
2,125,110,566806,2024-04-26T22:57:42,true,24.99,78.29,1956.47
3,126,104,1011058,2024-04-26T22:56:34,true,24.99,160.0,3998.4
4,124,110,566806,2024-04-26T22:46:09,true,24.99,400.0,9996.0
5,123,106,440125,2024-04-26T19:01:03,true,25.39,119.01,3021.66
6,122,104,1010506,2024-04-26T10:28:05,true,25.99,160.0,4158.4
7,127,103,755473,2024-04-26T02:35:00,true,25.39,118.16,3000.0
8,121,107,1090039,2024-04-26T02:17:16,true,25.69,147.76,3795.95
9,120,104,1010282,2024-04-25T23:50:15,true,25.49,90.23,2300.0
10,119,106,439718,2024-04-25T22:26:07,true,25.49,105.92,2700.0


In [189]:
# Dimensions of the training set as (rows, columns).
size(trainData)

(506, 8)

In [190]:
# Dimensions of the test set as (rows, columns).
size(testData)

(126, 8)

### Find out whether shuffling the rows helps or not
# `shuffle` comes from the Random standard library, which must be loaded explicitly
using Random: shuffle

# Shuffle the row indices randomly
indices = shuffle(1:nrow(df))

# Compute the number of samples for the training set
n_training = round(Int, train_ratio * nrow(df))

# Split the shuffled indices into training and test
train_indices = indices[1:n_training]
test_indices = indices[n_training+1:end]

# Build the training and test DataFrames
train_df = df[train_indices, :]
test_df = df[test_indices, :]

# Función de regresión lineal

## Hipótesis

In [195]:
# Initial parameters for the hypothesis: eight weights and one bias, all 1.0
# NOTE(review): `bias` is not used by `hyp` as written in this cell — confirm how
# it is meant to enter the model.
theta = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
bias = [1.0]

"""
    hyp(X, theta)

Evaluate the linear hypothesis: return the matrix-vector product `X * theta`.

`X` may be any real-valued matrix and `theta` any real-valued vector; the element
type of the result follows the usual promotion of the two inputs (`Float64` when
both are `Float64`). A `DimensionMismatch` is thrown by the product when the
number of columns of `X` differs from the length of `theta`.
"""
function hyp(X::AbstractMatrix{<:Real}, theta::AbstractVector{<:Real})
    return X * theta
end

hyp (generic function with 1 method)

In [192]:
# Show the per-column summary of `df` once more.
describe(df)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Any,Any,Int64,DataType
1,Numero,320.709,2,322.5,638,0,Int64
2,Vehiculo,105.566,101,105.0,110,0,Int64
3,Odometro,812749.0,436624,1.0034e6,1142707,0,Int64
4,Fecha,,2024-04-01T12:39:00,2024-05-31T16:37:16,2024-08-06T17:33:18,0,DateTime
5,Tanque_Lleno,0.800633,false,1.0,true,0,Bool
6,Costo_Por_Volumen,25.2959,23.24,25.29,26.97,0,Float32
7,Cantidad,160.645,13.0,120.025,665.0,0,Float32
8,Costo_Total,4058.61,329.0,3055.5,16353.0,0,Float32
