In [67]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import tree

import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
import torch.nn as nn
from mpl_toolkits.mplot3d import Axes3D  
import plotly.graph_objects as go
from scipy.stats import gaussian_kde
import matplotlib.patches as mpatches
import matplotlib.dates as mdates
from matplotlib import pyplot as plt
from scipy import stats
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.colors as pc
from sklearn.ensemble import IsolationForest



# Dataset overview

In [68]:
# Load the CSV into `data`; the cleaning cells further down overwrite its columns in place.
data = pd.read_csv('./Dataset_dumarey/original_csv/tmp.csv')

In [69]:
# Preview the first rows exactly as loaded, before any cleaning.
data.head()


Unnamed: 0,vin,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,·2.47174E+l8,15/05/202 103:09,15/05/2021 17:21,458578125,458578125,27.4,44.7,charge,120
1,·2.47174E+l8,15/05/202117:21,15/05/2021 17:41,458578125,459521875,44.7,43.5,trip,
2,-2.47174E+l8,15/05/2021 17:42,15/05/202118:29,459521875,459521875,43.5,79.6,charge,DCCharging
3,·2.47174E+18,15/05/202 118:29,15/05/2021 18:53,459521875,4609,79.6,77.6,trip,
4,·2.47174E+18,15/05/202118:56,15/05/2021 19:22,4609,462434375,77.6,74.9,trip,


In [70]:
# Column dtypes and non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   vin          96 non-null     object 
 1   timestamp    96 non-null     object 
 2   end_time     96 non-null     object 
 3   odo          96 non-null     object 
 4   end_odo      96 non-null     object 
 5   soc          96 non-null     float64
 6   end_soc      96 non-null     float64
 7   event        96 non-null     object 
 8   charge_mode  32 non-null     object 
dtypes: float64(2), object(7)
memory usage: 6.9+ KB


# Data Cleaning


In [71]:
def substitute_comma(ts):
    """Normalise the decimal separator of a numeric string to a bare ``.``.

    Every comma or dot, together with any whitespace directly around it, is
    collapsed into a single ``.``; leading and trailing whitespace is dropped.

    Parameters
    ----------
    ts : object
        Raw cell value. It is converted with ``str()`` before cleaning.

    Returns
    -------
    object
        ``ts`` unchanged when ``pd.isna(ts)`` is true, otherwise the cleaned
        string (always a ``str``, even for numeric input).
    """
    if pd.isna(ts):
        return ts
    # A single regex pass replaces the former chain of str.replace calls: once
    # "," had been replaced, the ", " / " , " / " ," patterns could never match,
    # and runs of several spaces around the separator left a stray space behind.
    return re.sub(r"\s*[.,]\s*", ".", str(ts).strip())
def remove_comma_odo(ts):
    """Strip comma separators (and the whitespace around them) from a value.

    Parameters
    ----------
    ts : object
        Raw cell value. It is converted with ``str()`` before cleaning.

    Returns
    -------
    object
        ``ts`` unchanged when ``pd.isna(ts)`` is true, otherwise the string
        with every comma and its surrounding whitespace removed.
    """
    if pd.isna(ts):
        return ts
    # The comma is removed together with its neighbouring whitespace in one
    # pass. Previously "," was removed first, so the ", " and " ," patterns
    # were dead and the space survived (which later fails numeric parsing).
    return re.sub(r"\s*,\s*", "", str(ts).strip())
def remove_comma(ts):
    """Delete commas and dots from the string form of a value.

    Missing values (``pd.isna``) are returned untouched. Otherwise the value
    is stringified, trimmed, and the tokens ``","``, ``". "`` and ``"."`` are
    removed in that order.
    """
    if not pd.isna(ts):
        cleaned = str(ts).strip()
        # Order matters: ". " has to go before the bare "." so that the space
        # following a dot is removed along with it.
        for token in (",", ". ", "."):
            cleaned = cleaned.replace(token, "")
        return cleaned
    return ts

def remove_spaces(ts):
    """Return the string form of ``ts`` with every space character removed.

    Missing values (``pd.isna``) are passed through unchanged. Surrounding
    whitespace is trimmed first; inside the string only the plain space
    character is removed.
    """
    if not pd.isna(ts):
        trimmed = str(ts).strip()
        return trimmed.replace(" ", "")
    return ts
def fix_timestamp_format(ts):
    """Repair the spacing of a ``dd/mm/yyyy hh:mm`` style timestamp string.

    All spaces are removed, then a single space is put back after the
    10-character date part, e.g. ``"27/04/202121:27"`` and
    ``"27/04/202 121:27"`` both become ``"27/04/2021 21:27"``.

    Parameters
    ----------
    ts : object
        Raw cell value. It is converted with ``str()`` before cleaning.

    Returns
    -------
    object
        ``ts`` unchanged when ``pd.isna(ts)`` is true, otherwise the repaired
        string. Strings of 10 characters or fewer (no time part) are returned
        with their spaces removed.
    """
    if pd.isna(ts):
        return ts

    compact = str(ts).replace(" ", "")
    date_len = 10  # len("dd/mm/yyyy")

    # Restore the separator whenever anything follows the date. The previous
    # `len >= 15` guard stripped the valid space from short time parts such as
    # "15/05/2021 3:09" and never put it back, leaving an unparseable string.
    if len(compact) > date_len:
        return compact[:date_len] + ' ' + compact[date_len:]
    return compact
def fix_soc(ts):
    """Scale a state-of-charge reading back under 100.

    A value strictly greater than 100.0 is divided by 10; anything else
    (including NaN, for which the comparison is false) is returned as is.
    Presumably this undoes a lost decimal separator in the source - confirm
    against the raw data.
    """
    return ts / 10 if ts > 100.0 else ts
def fix_odo(ts):
    """Re-insert the decimal point into an odometer reading that lost it.

    The integer form of the value is taken as a digit string: the first four
    characters become the integer part and the rest the fraction, e.g.
    ``458578125`` -> ``4585.78125``. Readings of four characters or fewer are
    returned as a plain ``int``.

    Parameters
    ----------
    ts : object
        Odometer value, normally an integer-valued number.

    Returns
    -------
    object
        ``ts`` unchanged when it is missing, already has a fractional part, or
        cannot be converted to ``int``; otherwise the rescaled ``int``/``float``.

    Notes
    -----
    The fixed four-character split is only valid for readings up to 9999.
    For negative input the sign counts as one of those four characters.
    """
    if pd.isna(ts):
        return ts
    # A float with a fractional part still has its decimal point. Passing it
    # through int() would silently truncate it (e.g. when the cell is re-run).
    if isinstance(ts, float) and not ts.is_integer():
        return ts
    try:
        digits = str(int(ts))
    except (TypeError, ValueError, OverflowError):
        # Not interpretable as an integer: leave the value alone. Only the
        # conversion errors are caught, so KeyboardInterrupt still propagates.
        return ts
    if len(digits) <= 4:
        return int(digits)
    return float(digits[:4] + '.' + digits[4:])
def fix_vin(ts):
    """Map a raw VIN string to a short vehicle label.

    Values starting with ``-8.9`` become ``'1.0'`` and values starting with
    ``-2.4`` become ``'2.0'``. Some rows carry ``'·'`` in place of the leading
    minus sign; that variant is accepted for both vehicles.

    Parameters
    ----------
    ts : object
        Raw cell value. It is converted with ``str()`` before matching.

    Returns
    -------
    object
        ``ts`` unchanged when ``pd.isna(ts)`` is true, the label string on a
        prefix match, otherwise ``str(ts)`` unmodified.
    """
    if pd.isna(ts):
        return ts
    raw = str(ts)
    # The '·' variant is normalised for matching only. Previously it was
    # recognised for the -2.4 prefix but not for -8.9, so '·8.9...' fell
    # through as a string that cannot be cast to float.
    probe = '-' + raw[1:] if raw.startswith('·') else raw
    for prefix, label in (('-8.9', '1.0'), ('-2.4', '2.0')):
        if probe.startswith(prefix):
            return label
    return raw


#### Fixing punctuation marks (comma separators in the odo and soc columns)

In [72]:
# Odometer columns: commas are dropped outright.
for odo_col in ('odo', 'end_odo'):
    data[odo_col] = data[odo_col].apply(remove_comma_odo)

# State-of-charge columns: a comma is treated as the decimal separator and
# turned into a dot.
for soc_col in ('soc', 'end_soc'):
    data[soc_col] = data[soc_col].apply(substitute_comma)


In [73]:
# Preview after the separator clean-up of odo / end_odo / soc / end_soc.
data.head()

Unnamed: 0,vin,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,·2.47174E+l8,15/05/202 103:09,15/05/2021 17:21,458578125,458578125,27.4,44.7,charge,120
1,·2.47174E+l8,15/05/202117:21,15/05/2021 17:41,458578125,459521875,44.7,43.5,trip,
2,-2.47174E+l8,15/05/2021 17:42,15/05/202118:29,459521875,459521875,43.5,79.6,charge,DCCharging
3,·2.47174E+18,15/05/202 118:29,15/05/2021 18:53,459521875,4609,79.6,77.6,trip,
4,·2.47174E+18,15/05/202118:56,15/05/2021 19:22,4609,462434375,77.6,74.9,trip,


In [74]:
# Cast the four measurement columns to numeric in one go; any entry that still
# does not parse becomes NaN (errors='coerce') instead of raising.
cols_to_float = ['odo', 'end_odo', 'soc', 'end_soc']
data[cols_to_float] = data[cols_to_float].apply(pd.to_numeric, errors='coerce')

#### Fixing timestamp format

In [75]:
# For each time column: repair the date/time spacing first, then parse it
# day-first. Strings that still do not parse become NaT (errors='coerce').
for time_col in ('timestamp', 'end_time'):
    repaired = data[time_col].apply(fix_timestamp_format)
    data[time_col] = pd.to_datetime(repaired, errors='coerce', dayfirst=True)

In [76]:
# Bring state-of-charge readings above 100 back into range.
for soc_col in ('soc', 'end_soc'):
    data[soc_col] = data[soc_col].apply(fix_soc)

# Put the decimal point back into the odometer readings.
# fix_odo keeps the first four digits as the integer part, so this is only ok for odo <= 9999.
for odo_col in ('odo', 'end_odo'):
    data[odo_col] = data[odo_col].apply(fix_odo)


In [77]:
# Preview after timestamp parsing and the soc / odo rescaling.
data.head()

Unnamed: 0,vin,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,·2.47174E+l8,2021-05-15 03:09:00,2021-05-15 17:21:00,4585.78125,4585.78125,27.4,44.7,charge,120
1,·2.47174E+l8,2021-05-15 17:21:00,2021-05-15 17:41:00,4585.78125,4595.21875,44.7,43.5,trip,
2,-2.47174E+l8,2021-05-15 17:42:00,2021-05-15 18:29:00,4595.21875,4595.21875,43.5,79.6,charge,DCCharging
3,·2.47174E+18,2021-05-15 18:29:00,2021-05-15 18:53:00,4595.21875,4609.0,79.6,77.6,trip,
4,·2.47174E+18,2021-05-15 18:56:00,2021-05-15 19:22:00,4609.0,4624.34375,77.6,74.9,trip,


In [78]:
cols_to_convert = ['event']
for col in cols_to_convert:
    data[col] = data[col].astype('category')

# charge_mode: strip spaces, map empty and missing entries to the placeholder
# '0', then store the column as categorical.
data['charge_mode'] = (
    data['charge_mode']
    .apply(remove_spaces)
    .replace('', '0')
    .fillna('0')
    .astype('category')
)
# Guarantee that '0' is a declared category even if no row currently carries it.
if '0' not in data['charge_mode'].cat.categories:
    data['charge_mode'] = data['charge_mode'].cat.add_categories('0')

# vin: collapse the raw strings to the short labels from fix_vin and store them as floats.
data['vin'] = data['vin'].apply(fix_vin).astype('float64')

In [79]:
def original_vin(ts):
    """Translate a short vehicle label back to its VIN string.

    The string form of ``ts`` is matched by prefix: ``'1.0'`` maps to
    ``'-8.99632E+18'`` and ``'2.0'`` to ``'-2.47174E+18'``. Missing values
    (``pd.isna``) are returned untouched; any other value comes back as
    ``str(ts)``.
    """
    if pd.isna(ts):
        return ts
    label = str(ts)
    for prefix, vin_text in (('1.0', '-8.99632E+18'), ('2.0', '-2.47174E+18')):
        if label.startswith(prefix):
            return vin_text
    return label


In [80]:
# Work on a copy so that `data` keeps its short numeric vin labels;
# only the frame that gets exported has the VIN strings mapped back by original_vin.
data_to_csv = data.copy()
data_to_csv['vin'] = data_to_csv['vin'].apply(original_vin)

In [81]:
# Write the cleaned frame next to the source file; index=False leaves the row index out of the CSV.
data_to_csv.to_csv('./Dataset_dumarey/original_csv/tmp_cleaned.csv', index=False)

In [82]:
# Re-check dtypes and non-null counts after cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   vin          96 non-null     float64       
 1   timestamp    96 non-null     datetime64[ns]
 2   end_time     96 non-null     datetime64[ns]
 3   odo          96 non-null     float64       
 4   end_odo      95 non-null     float64       
 5   soc          96 non-null     float64       
 6   end_soc      96 non-null     float64       
 7   event        96 non-null     category      
 8   charge_mode  96 non-null     category      
dtypes: category(2), datetime64[ns](2), float64(5)
memory usage: 5.9 KB


In [83]:
# Final preview of the cleaned frame.
data.head()

Unnamed: 0,vin,timestamp,end_time,odo,end_odo,soc,end_soc,event,charge_mode
0,2.0,2021-05-15 03:09:00,2021-05-15 17:21:00,4585.78125,4585.78125,27.4,44.7,charge,120
1,2.0,2021-05-15 17:21:00,2021-05-15 17:41:00,4585.78125,4595.21875,44.7,43.5,trip,0
2,2.0,2021-05-15 17:42:00,2021-05-15 18:29:00,4595.21875,4595.21875,43.5,79.6,charge,DCCharging
3,2.0,2021-05-15 18:29:00,2021-05-15 18:53:00,4595.21875,4609.0,79.6,77.6,trip,0
4,2.0,2021-05-15 18:56:00,2021-05-15 19:22:00,4609.0,4624.34375,77.6,74.9,trip,0
