In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import tree

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline


In [2]:
data = pd.read_csv('./Dataset_dumarey/dataset_rounded.csv')

In [3]:
data.head()


Unnamed: 0,vm,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,-8.99632E+l8,01/01/2021 20:31,01/01/2021 20:38,443664,443947,564,549.0,trip,
1,-8.99632E+ 18,01/01/202120:39,02/01/202115:13,443947,443947,549,100.0,charge,240.0
2,-8.99632E+18,02/01/202115:13,02/01/202115:21,443947,444250,100,100.0,trip,
3,-8.99632E+18,02/01/202115:21,02/01/202117:46,444250,445713,100,945.0,trip,
4,-8.99632E+18,02/01/202117:46,02/01/202117:53,445713,446145,945,945.0,trip,


In [4]:
print(f"Element of the dataset: {len(data)}")
print(f"Columns of the dataset: {data.columns.to_list()}")
data.info()

Element of the dataset: 32
Columns of the dataset: ['vm', 'timestamp', 'end_time', 'odo', 'end_odo', 'soc', 'end_soc', 'event', 'charge_mode']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   vm           32 non-null     object 
 1   timestamp    32 non-null     object 
 2   end_time     32 non-null     object 
 3   odo          32 non-null     object 
 4   end_odo      32 non-null     object 
 5   soc          31 non-null     object 
 6   end_soc      31 non-null     float64
 7   event        32 non-null     object 
 8   charge_mode  8 non-null      object 
dtypes: float64(1), object(8)
memory usage: 2.4+ KB


In [5]:
data.describe()

Unnamed: 0,end_soc
count,31.0
mean,503.967742
std,283.321429
min,16.0
25%,437.0
50%,584.0
75%,654.0
max,945.0


In [6]:
def remove_comma(ts):
    if pd.isna(ts):
        return ts 
    ts = str(ts).strip()
    ts = ts.replace(",", ".")         
    ts = ts.replace(". ", ".")        
    return ts 

In [7]:
data['odo'] = data['odo'].apply(remove_comma)
data['end_odo'] = data['end_odo'].apply(remove_comma)
data['soc'] = data['soc'].apply(remove_comma)
data['end_soc'] = data['end_soc'].apply(remove_comma)

In [8]:
cols_to_float = ['odo', 'end_odo', 'soc', 'end_soc']
for col in cols_to_float:
    data[col] = pd.to_numeric(data[col], errors='coerce')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   vm           32 non-null     object 
 1   timestamp    32 non-null     object 
 2   end_time     32 non-null     object 
 3   odo          32 non-null     float64
 4   end_odo      32 non-null     float64
 5   soc          29 non-null     float64
 6   end_soc      31 non-null     float64
 7   event        32 non-null     object 
 8   charge_mode  8 non-null      object 
dtypes: float64(4), object(5)
memory usage: 2.4+ KB


In [10]:
import re

def fix_timestamp_format(ts):
    if pd.isna(ts):
        return ts  # lascia i NaN invariati
    ts = str(ts).strip()
    
    # Verifica se è nel formato corretto: "gg/mm/aaaa ora:minuti"
    if re.match(r"\d{2}/\d{2}/\d{4} \d{2}:\d{2}", ts):
        return ts
    # Se è nel formato senza spazio: "gg/mm/aaaaora:minuti"
    elif re.match(r"\d{2}/\d{2}/\d{4}\d{2}:\d{2}", ts):
        return ts[:10] + ' ' + ts[10:]
    else:
        return ts  # non modificare se il formato è diverso (potresti loggarlo)

# Applica la funzione
data['timestamp'] = data['timestamp'].apply(fix_timestamp_format)
data['end_time'] = data['end_time'].apply(fix_timestamp_format) 

In [11]:
data.head()

Unnamed: 0,vm,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,-8.99632E+l8,01/01/2021 20:31,01/01/2021 20:38,443.664,443.947,564.0,549.0,trip,
1,-8.99632E+ 18,01/01/2021 20:39,02/01/2021 15:13,443.947,443.947,549.0,100.0,charge,240.0
2,-8.99632E+18,02/01/2021 15:13,02/01/2021 15:21,443.947,444.25,100.0,100.0,trip,
3,-8.99632E+18,02/01/2021 15:21,02/01/2021 17:46,444.25,445.713,100.0,945.0,trip,
4,-8.99632E+18,02/01/2021 17:46,02/01/2021 17:53,445.713,446.145,945.0,945.0,trip,


In [12]:
data['timestamp'] = pd.to_datetime(data['timestamp'], errors='coerce')
data['end_time'] = pd.to_datetime(data['end_time'], errors='coerce')

In [13]:
data.head()

Unnamed: 0,vm,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,-8.99632E+l8,2021-01-01 20:31:00,2021-01-01 20:38:00,443.664,443.947,564.0,549.0,trip,
1,-8.99632E+ 18,2021-01-01 20:39:00,2021-02-01 15:13:00,443.947,443.947,549.0,100.0,charge,240.0
2,-8.99632E+18,2021-02-01 15:13:00,2021-02-01 15:21:00,443.947,444.25,100.0,100.0,trip,
3,-8.99632E+18,2021-02-01 15:21:00,2021-02-01 17:46:00,444.25,445.713,100.0,945.0,trip,
4,-8.99632E+18,2021-02-01 17:46:00,2021-02-01 17:53:00,445.713,446.145,945.0,945.0,trip,


In [14]:
nan_count = data.isna().sum()
print("| Features | NaN-counter |")
print(nan_count)
print("|----------|-------------|")

| Features | NaN-counter |
vm              0
timestamp       0
end_time        0
odo             0
end_odo         0
soc             3
end_soc         1
event           0
charge_mode    24
dtype: int64
|----------|-------------|


In [15]:
duplicates = data.duplicated()
print(f"Duplicates: {duplicates.sum()}")
data = data[~duplicates]

Duplicates: 0


In [16]:
print(f"Element of the dataset: {len(data)}")

Element of the dataset: 32


In [17]:
cols_to_convert = ['event', 'charge_mode']
for col in cols_to_convert:
    data[col] = data[col].astype('category')

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   vm           32 non-null     object        
 1   timestamp    32 non-null     datetime64[ns]
 2   end_time     32 non-null     datetime64[ns]
 3   odo          32 non-null     float64       
 4   end_odo      32 non-null     float64       
 5   soc          29 non-null     float64       
 6   end_soc      31 non-null     float64       
 7   event        32 non-null     category      
 8   charge_mode  8 non-null      category      
dtypes: category(2), datetime64[ns](2), float64(4), object(1)
memory usage: 2.2+ KB


In [19]:
ohe = OneHotEncoder(handle_unknown='ignore')
# Fit the one-hot encoder on training data
ohe.fit(data[cols_to_convert])
# Create a new DataFrame with only the one-hot encoded columns
temp_data = pd.DataFrame(data=ohe.transform(data[cols_to_convert]).toarray(),
                             columns=ohe.get_feature_names_out())
# Create a copy of the DataFrame
data_encoded = data.copy()

# Remove the old categorical columns from the original data
data_encoded.drop(columns=cols_to_convert, axis=1, inplace=True)
data_encoded = pd.concat([data_encoded.reset_index(drop=True), temp_data], axis=1)


In [25]:
data_encoded.head()
#remove the column charge_mode_nan
data_encoded.drop(columns=['charge_mode_nan'], axis=1, inplace=True)
data_encoded.head()

Unnamed: 0,vm,timestamp,end_time,odo,end_odo,soc,end_soc,event_charge,event_trip,charge_mode_240,charge_mode_DCCharging
0,-8.99632E+l8,2021-01-01 20:31:00,2021-01-01 20:38:00,443.664,443.947,564.0,549.0,0.0,1.0,0.0,0.0
1,-8.99632E+ 18,2021-01-01 20:39:00,2021-02-01 15:13:00,443.947,443.947,549.0,100.0,1.0,0.0,1.0,0.0
2,-8.99632E+18,2021-02-01 15:13:00,2021-02-01 15:21:00,443.947,444.25,100.0,100.0,0.0,1.0,0.0,0.0
3,-8.99632E+18,2021-02-01 15:21:00,2021-02-01 17:46:00,444.25,445.713,100.0,945.0,0.0,1.0,0.0,0.0
4,-8.99632E+18,2021-02-01 17:46:00,2021-02-01 17:53:00,445.713,446.145,945.0,945.0,0.0,1.0,0.0,0.0


In [30]:
# add a column duration
data_encoded['duration'] = (data_encoded['end_time'] - data_encoded['timestamp']).dt.total_seconds()/60

In [31]:
data_encoded.head()

Unnamed: 0,vm,timestamp,end_time,odo,end_odo,soc,end_soc,event_charge,event_trip,charge_mode_240,charge_mode_DCCharging,duration
0,-8.99632E+l8,2021-01-01 20:31:00,2021-01-01 20:38:00,443.664,443.947,564.0,549.0,0.0,1.0,0.0,0.0,7.0
1,-8.99632E+ 18,2021-01-01 20:39:00,2021-02-01 15:13:00,443.947,443.947,549.0,100.0,1.0,0.0,1.0,0.0,44314.0
2,-8.99632E+18,2021-02-01 15:13:00,2021-02-01 15:21:00,443.947,444.25,100.0,100.0,0.0,1.0,0.0,0.0,8.0
3,-8.99632E+18,2021-02-01 15:21:00,2021-02-01 17:46:00,444.25,445.713,100.0,945.0,0.0,1.0,0.0,0.0,145.0
4,-8.99632E+18,2021-02-01 17:46:00,2021-02-01 17:53:00,445.713,446.145,945.0,945.0,0.0,1.0,0.0,0.0,7.0


In [32]:
data_encoded.to_csv('./Dataset_dumarey/dataset_rounded_cleaned.csv', index=False)

- trasformare gli object in float64 (Done)
- capire come gestire le date (Done)
- data fine evento si può convertire in durata evento? (ho inserito la data duration)
- verificare se ci siano valori null (Done)
- verificare se ci sono duplicati (Done)
- fare feature correlation heatmap
- onehot encoding (Done)
- trasformare la colonna vm dato che si riferisce a singoli veicoli(ID) tipo in vec1 vec2 ecc
- Come gestisco i missing values?  
