# Imports

In [31]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [32]:
# Load the cleaned CSV into `df`. pandas is already imported in the imports
# cell at the top of the notebook, so it is not re-imported here.
# The path is relative to the directory the notebook is run from.
DATA_PATH = 'dataSets/damCombustible_cleaned.csv'
df = pd.read_csv(DATA_PATH)

df.head()

Unnamed: 0,Nro.,Vehículo,Odómetro,Horómetro,Fecha,Tanqueo Full,Costo por Volumen,Cant.,Unidad,Costo Total,Tipo,Unnamed: 11
0,634,101,1086986,,06/ago./2024 17:33:18,S,2495,20,Litros,499,DIESEL,
1,633,101,1086986,,06/ago./2024 17:29:56,S,2538,394,Litros,9999,DIESEL,
2,637,102,1023146,,06/ago./2024 6:23:00,S,2457,127,Litros,3120,DIESEL,
3,638,110,595357,,06/ago./2024 3:51:00,S,2551,196,Litros,5000,DIESEL,
4,636,105,492843,,05/ago./2024 23:00:00,S,2457,127,Litros,3120,DIESEL,


# Transformación de los datos

Separamos las columnas a usar y las convertimos a valores numéricos

- Odómetro
- Cantidad
- Vehículo

In [33]:
# Convert 'Odómetro' and 'Cant.' to numbers. Anything that cannot be parsed
# becomes NaN (errors='coerce') so that the dropna below discards the row
# instead of the cast aborting the whole cell.
df['Odómetro'] = pd.to_numeric(df['Odómetro'], errors='coerce')
# NOTE(review): commas are removed as if they were thousands separators. If the
# export actually uses a decimal comma (e.g. '39,4'), stripping it turns 39.4
# into 394 while whole numbers such as 20 stay unchanged — confirm against the
# raw CSV before trusting the scale of this column.
df['Cant.'] = pd.to_numeric(
    df['Cant.'].str.replace(',', '', regex=False),
    errors='coerce',
).astype(float)

# Drop rows where either numeric feature is missing. The explicit copy makes
# `df` an independent frame, so the column assignment below does not raise
# SettingWithCopyWarning.
df = df.dropna(subset=['Odómetro', 'Cant.']).copy()

# Encode 'Vehículo' as integer category codes (one code per distinct value).
# The code -> original identifier mapping is not kept anywhere.
df['Vehículo'] = df['Vehículo'].astype('category').cat.codes

# Drop the columns that are not used as features or target.
df = df.drop(columns=['Nro.', 'Horómetro', 'Fecha', 'Tanqueo Full', 'Costo por Volumen', 'Unidad', 'Costo Total', 'Tipo', 'Unnamed: 11'])

# Show the transformed DataFrame.
df.head()


Unnamed: 0,Vehículo,Odómetro,Cant.
0,0,1086986,20.0
1,0,1086986,394.0
2,1,1023146,127.0
3,9,595357,196.0
4,4,492843,127.0


## Normalizamos

In [34]:
# Columns that get rescaled with min-max scaling.
scaled_columns = ['Odómetro', 'Cant.']

scaler = MinMaxScaler()

# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed further down, so the test rows also influence the min/max.
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

df.head()

Unnamed: 0,Vehículo,Odómetro,Cant.
0,0,0.921084,0.000174
1,0,0.921084,0.009449
2,1,0.83067,0.002827
3,9,0.224808,0.004538
4,4,0.079621,0.002827


## Cambiamos nombres

In [35]:
# Map the original Spanish headers to the English names used from here on.
column_names = {
    'Vehículo': 'vehicle',
    'Odómetro': 'odometer',
    'Cant.': 'quantity',
}
df = df.rename(columns=column_names)
df

Unnamed: 0,vehicle,odometer,quantity
0,0,0.921084,0.000174
1,0,0.921084,0.009449
2,1,0.830670,0.002827
3,9,0.224808,0.004538
4,4,0.079621,0.002827
...,...,...,...
627,6,0.918250,0.096573
628,8,0.132618,0.092778
629,8,0.132618,0.009598
630,9,0.172328,0.099003


# Entrenamiento

## Separamos las variables independientes de la dependiente

In [43]:
# Independent variables (features).
feature_columns = ['odometer', 'quantity']
x = df[feature_columns]

# Dependent variable (target): the encoded vehicle.
y = df['vehicle']


## Separamos train y test - 90% - 10%

In [48]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
)

## Usamos Random Forest Classifier

In [49]:
# Seed the forest so that the fitted model — and therefore the predictions and
# the reported score — are the same on every Restart & Run All. The value
# matches the seed already used for the train/test split.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

## Hacemos las predicciones

In [50]:
# Predict the encoded vehicle for every held-out row.
y_pred = model.predict(x_test)

## Evaluamos la exactitud (accuracy)

In [52]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Precisión del modelo: {accuracy:.2f}')

Precisión del modelo: 0.89


# Realizamos un análisis de los resultados para compararlos con mi modelo

In [53]:
# Display the predicted labels, in the same row order as x_test.
y_pred

array([1, 6, 5, 6, 3, 8, 9, 8, 0, 1, 0, 9, 5, 3, 9, 5, 3, 6, 7, 1, 3, 4,
       3, 8, 1, 9, 7, 2, 0, 2, 4, 4, 0, 6, 1, 5, 4, 4, 1, 7, 4, 7, 1, 3,
       8, 8, 2, 0, 9, 0, 9, 4, 9, 5, 7, 8, 4, 0, 2, 9, 3, 0, 2, 6],
      dtype=int8)

In [61]:
# Display the first 30 true labels to compare by eye with the start of y_pred.
y_test.head(30)

328    1
247    6
570    5
145    6
496    3
405    8
165    9
77     8
533    0
163    1
271    0
31     8
55     5
90     3
575    9
76     5
2      1
256    6
311    7
333    3
131    3
291    4
49     3
628    8
482    3
110    9
367    7
278    2
627    6
261    2
Name: vehicle, dtype: int8