In [139]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import tree

import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
import torch.nn as nn
from mpl_toolkits.mplot3d import Axes3D  
import plotly.graph_objects as go
from scipy.stats import gaussian_kde
import matplotlib.patches as mpatches
import matplotlib.dates as mdates
from matplotlib import pyplot as plt
from scipy import stats
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.colors as pc
from sklearn.ensemble import IsolationForest



# Dataset overview

In [140]:
# Load the raw CSV into a DataFrame; every later cell works on `data`.
data = pd.read_csv('./Dataset_dumarey/original_csv/tmp.csv')

In [141]:
# Preview the first rows of the table as loaded, before any cleaning.
data.head()

Unnamed: 0,vin,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,-8.99632e+18,13/02/2021 23:18,15/02/2021 11:09,10142.79688,10142.79688,56.0,100.0,charge,240.0
1,-8.99632e+18,15/02/2021 11:09,15/02/2021 12:05,10142.79688,10210.1875,100.0,83.1,trip,
2,-8.99632e+18,15/02/2021 12:05,15/02/2021 13:08,10210.1875,10210.21875,83.1,82.7,trip,
3,-8.99632e+18,15/02/2021 13:08,15/02/2021 13:15,10210.21875,10212.20313,82.7,82.7,trip,
4,-8.99632e+18,15/02/2021 15:30,15/02/2021 16:11,10212.20313,10272.75,81.9,65.4,trip,


In [142]:
# Column dtypes and non-null counts of the table as loaded.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   vin          64 non-null     float64
 1   timestamp    64 non-null     object 
 2   end_time     64 non-null     object 
 3   odo          64 non-null     float64
 4   end_odo      64 non-null     float64
 5   soc          64 non-null     float64
 6   end_soc      64 non-null     float64
 7   event        64 non-null     object 
 8   charge_mode  13 non-null     float64
dtypes: float64(6), object(3)
memory usage: 4.6+ KB


# Data Cleaning


In [143]:
def substitute_comma(ts):
    """Normalise the decimal separator of a value to a single ``.``.

    Parameters
    ----------
    ts : object
        Raw cell value. Missing values (as judged by ``pd.isna``) are
        returned unchanged; anything else is converted with ``str``.

    Returns
    -------
    object
        The input itself when it is missing, otherwise the stripped string
        in which every ``,`` or ``.`` -- together with any spaces directly
        around it -- is replaced by a single ``.``.
    """
    if pd.isna(ts):
        return ts
    # A regex handles any number of spaces around the separator; fixed-width
    # replaces leave spaces behind for inputs such as "83 ,  1".
    return re.sub(r" *[,.] *", ".", str(ts).strip())
def remove_comma_odo(ts):
    """Drop every ``,`` and ``.`` from a value, leaving a separator-free string.

    Missing values (per ``pd.isna``) are passed through untouched. Any other
    value is converted with ``str`` and stripped of surrounding whitespace
    before the two characters are deleted; inner spaces are left in place.
    """
    if pd.isna(ts):
        return ts
    # Deleting both characters in one translate pass gives the same result
    # as removing them one after the other.
    drop_marks = str.maketrans("", "", ",.")
    return str(ts).strip().translate(drop_marks)
def remove_comma(ts):
    """Remove commas and dots (and the single space after a dot) from a value.

    Missing values (per ``pd.isna``) are returned as they are. Other values
    are converted with ``str``, stripped, and cleaned with three successive
    removals.
    """
    if pd.isna(ts):
        return ts
    cleaned = str(ts).strip()
    # Order matters: ". " has to go before the bare "." pass, otherwise the
    # space that follows a dot would survive.
    for mark in (",", ". ", "."):
        cleaned = cleaned.replace(mark, "")
    return cleaned

def remove_spaces(ts):
    """Return the value as a string with every space character removed.

    Missing values (per ``pd.isna``) are passed through. Leading and trailing
    whitespace of any kind is stripped first; inside the string only the
    plain space character is removed.
    """
    if pd.isna(ts):
        return ts
    pieces = str(ts).strip().split(" ")
    return "".join(pieces)
def fix_timestamp_format(ts):
    """Put exactly one space between the date and time parts of a timestamp.

    Parameters
    ----------
    ts : object
        Raw timestamp value. Missing values (per ``pd.isna``) are returned
        unchanged; anything else is converted with ``str``.

    Returns
    -------
    object
        The input itself when it is missing, otherwise the timestamp with all
        spaces removed and a single space re-inserted between date and time.
        Strings too short to contain a time part come back without spaces.
    """
    if pd.isna(ts):
        return ts

    compact = str(ts).replace(" ", "")

    # Split on the d/m/yyyy pattern rather than on a fixed offset, so days,
    # months or hours written without a leading zero (e.g. 1/2/20219:05)
    # are separated correctly as well.
    date_then_time = re.fullmatch(r"(\d{1,2}/\d{1,2}/\d{4})(\d{1,2}:\d{2}.*)", compact)
    if date_then_time:
        return date_then_time.group(1) + ' ' + date_then_time.group(2)

    # Fallback for other layouts: assume a 10-character date,
    # e.g. 27/04/202121:27 or 2021-04-2721:27.
    if len(compact) >= 15:
        return compact[:10] + ' ' + compact[10:]
    return compact
def fix_soc(ts):
    """Scale a ``soc`` reading above 100 down by a factor of ten.

    Values up to and including 100.0 are returned unchanged; NaN compares
    false against 100.0 and is therefore returned as it is.
    NOTE(review): readings above 100 presumably lost their decimal
    separator -- confirm against the raw data.
    """
    return ts / 10 if ts > 100.0 else ts
def fix_odo(ts):
    """Re-insert the decimal point after the fifth digit of an odometer value.

    Parameters
    ----------
    ts : object
        Odometer reading whose separators were stripped, e.g. ``1014279688``.
        Missing values (per ``pd.isna``) are returned unchanged.

    Returns
    -------
    object
        * ``int`` when the integer form has five characters or fewer;
        * ``float`` with the decimal point placed after the fifth character
          otherwise (``1014279688`` -> ``10142.79688``);
        * the input itself when it cannot be converted to an integer.
    """
    if pd.isna(ts):
        return ts
    try:
        digits = str(int(ts))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures (free text, +/-inf, ...) are tolerated;
        # anything else, such as KeyboardInterrupt, must propagate.
        return ts
    if len(digits) <= 5:
        return int(digits)
    return float(digits[:5] + '.' + digits[5:])
def fix_vin(ts):
    """Map a raw ``vin`` value to a short label based on how its text begins.

    Missing values (per ``pd.isna``) are passed through. The value is
    converted with ``str`` and compared against known prefixes; the first
    match decides the label, and unmatched values come back as that string.
    """
    if pd.isna(ts):
        return ts
    text = str(ts)
    # The third prefix is the mis-encoded spelling seen for the second value;
    # it must stay byte-for-byte as it is.
    prefix_labels = (
        ('-8.9', '1.0'),
        ('-2.4', '2.0'),
        ('Â·2.4', '2.0'),
    )
    for prefix, label in prefix_labels:
        if text.startswith(prefix):
            return label
    return text


#### Fixing punctuation marks (decimal and thousands separators)

In [144]:
# Strip the separators from the odometer columns and normalise the decimal
# mark of the soc columns. Every cleaned value is a string at this point.
separator_fixers = {
    'odo': remove_comma_odo,
    'end_odo': remove_comma_odo,
    'soc': substitute_comma,
    'end_soc': substitute_comma,
}
for column_name, fixer in separator_fixers.items():
    data[column_name] = data[column_name].apply(fixer)


In [145]:
# Check the separator clean-up: odo / end_odo should now be digit-only strings.
data.head()

Unnamed: 0,vin,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,-8.99632e+18,13/02/2021 23:18,15/02/2021 11:09,1014279688,1014279688,56.0,100.0,charge,240.0
1,-8.99632e+18,15/02/2021 11:09,15/02/2021 12:05,1014279688,102101875,100.0,83.1,trip,
2,-8.99632e+18,15/02/2021 12:05,15/02/2021 13:08,102101875,1021021875,83.1,82.7,trip,
3,-8.99632e+18,15/02/2021 13:08,15/02/2021 13:15,1021021875,1021220313,82.7,82.7,trip,
4,-8.99632e+18,15/02/2021 15:30,15/02/2021 16:11,1021220313,1027275,81.9,65.4,trip,


In [146]:
cols_to_float = ['odo', 'end_odo', 'soc', 'end_soc']
# errors='coerce' turns anything that still cannot be parsed into NaN
# instead of raising.
converted = {col: pd.to_numeric(data[col], errors='coerce') for col in cols_to_float}
for col, numeric_values in converted.items():
    data[col] = numeric_values

#### Fixing timestamp format

In [147]:
# Repair the date/time spacing first, then parse with the day before the
# month; values that still cannot be parsed become NaT.
for time_col in ('timestamp', 'end_time'):
    repaired = data[time_col].apply(fix_timestamp_format)
    data[time_col] = pd.to_datetime(repaired, errors='coerce', dayfirst=True)

In [148]:
# fix_odo puts the decimal point after the fifth digit, so the odometer
# result is only right while the integer part stays <= 99999.
value_fixers = {
    'soc': fix_soc,
    'end_soc': fix_soc,
    'odo': fix_odo,
    'end_odo': fix_odo,
}
for column_name, fixer in value_fixers.items():
    data[column_name] = data[column_name].apply(fixer)


In [149]:
# Check that timestamps are parsed and odo / soc are back to decimal values.
data.head()

Unnamed: 0,vin,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,-8.99632e+18,2021-02-13 23:18:00,2021-02-15 11:09:00,10142.79688,10142.79688,56.0,100.0,charge,240.0
1,-8.99632e+18,2021-02-15 11:09:00,2021-02-15 12:05:00,10142.79688,10210.1875,100.0,83.1,trip,
2,-8.99632e+18,2021-02-15 12:05:00,2021-02-15 13:08:00,10210.1875,10210.21875,83.1,82.7,trip,
3,-8.99632e+18,2021-02-15 13:08:00,2021-02-15 13:15:00,10210.21875,10212.20313,82.7,82.7,trip,
4,-8.99632e+18,2021-02-15 15:30:00,2021-02-15 16:11:00,10212.20313,10272.75,81.9,65.4,trip,


In [150]:
# Text columns with few distinct values are stored as categoricals.
cols_to_convert = ['event']
for col in cols_to_convert:
    data[col] = data[col].astype('category')

# charge_mode: drop spaces, label blank or missing entries as '0', and write
# 240.0 / 120.0 without the trailing ".0" before converting to a categorical.
charge_mode = (
    data['charge_mode']
    .apply(remove_spaces)
    .replace('', '0')
    .fillna('0')
    .replace('240.0', '240')
    .replace('120.0', '120')
    .astype('category')
)
# Keep '0' as a known category even when no row currently carries it.
if '0' not in charge_mode.cat.categories:
    charge_mode = charge_mode.cat.add_categories('0')
data['charge_mode'] = charge_mode

# Replace the raw vin values with their short labels and store them as floats.
data['vin'] = data['vin'].apply(fix_vin).astype('float64')

In [151]:
def original_vin(ts):
    """Translate a short ``vin`` label back to its original textual form.

    Missing values (per ``pd.isna``) are passed through. The value is
    converted with ``str``; text starting with ``1.0`` or ``2.0`` is replaced
    by the matching original string, anything else is returned as that text.
    """
    if pd.isna(ts):
        return ts
    text = str(ts)
    label_to_original = (
        ('1.0', '-8.99632E+18'),
        ('2.0', '-2.47174E+18'),
    )
    for label, original in label_to_original:
        if text.startswith(label):
            return original
    return text


In [152]:
# Export copy: same table, with the short vin labels mapped back to their
# original text. `data` itself keeps the numeric labels.
data_to_csv = data.assign(vin=data['vin'].apply(original_vin))

In [153]:
# Write the cleaned table next to the raw file; index=False leaves the row index out.
data_to_csv.to_csv('./Dataset_dumarey/original_csv/tmp_cleaned.csv', index=False)

In [154]:
# Confirm the dtypes after cleaning (datetime, float and category columns).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   vin          64 non-null     float64       
 1   timestamp    64 non-null     datetime64[ns]
 2   end_time     64 non-null     datetime64[ns]
 3   odo          64 non-null     float64       
 4   end_odo      64 non-null     float64       
 5   soc          64 non-null     float64       
 6   end_soc      64 non-null     float64       
 7   event        64 non-null     category      
 8   charge_mode  64 non-null     category      
dtypes: category(2), datetime64[ns](2), float64(5)
memory usage: 4.0 KB


In [155]:
# Final look at the cleaned table.
data.head()

Unnamed: 0,vin,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,1.0,2021-02-13 23:18:00,2021-02-15 11:09:00,10142.79688,10142.79688,56.0,100.0,charge,240
1,1.0,2021-02-15 11:09:00,2021-02-15 12:05:00,10142.79688,10210.1875,100.0,83.1,trip,0
2,1.0,2021-02-15 12:05:00,2021-02-15 13:08:00,10210.1875,10210.21875,83.1,82.7,trip,0
3,1.0,2021-02-15 13:08:00,2021-02-15 13:15:00,10210.21875,10212.20313,82.7,82.7,trip,0
4,1.0,2021-02-15 15:30:00,2021-02-15 16:11:00,10212.20313,10272.75,81.9,65.4,trip,0
