# Data pre-processing Notebook 

The following steps are needed to pre-process the raw datasets and make them ready for training an ML model:
1. Read the raw data files
2. Concatenate the separate dataframes, which reflect different battery simulation scenarios, into one dataframe
3. Create new columns to reflect the charging mode and electric car usage patterns
4. Encode the categorical columns

## Import libraries

In [19]:
import pandas as pd
import glob
import os
import re

## Read the raw data

In [3]:
# Collect the raw scenario files: every CSV in the raw-data folder whose
# name ends with "battery_data.csv". glob.glob returns the matching paths
# as a list of strings.
raw_files_pattern = "../data/raw/*battery_data.csv"
all_files = glob.glob(raw_files_pattern)

In [4]:
# Show the matched paths to confirm which scenario files were picked up
all_files

['../data/raw/low_usage_optimal_charging_battery_data.csv',
 '../data/raw/normal_usage_optimal_charging_battery_data.csv',
 '../data/raw/normal_usage_deep_discharge_charging_battery_data.csv',
 '../data/raw/high_usage_optimal_charging_battery_data.csv',
 '../data/raw/normal_usage_full_charging_battery_data.csv',
 '../data/raw/normal_usage_frequent_top_ups_charging_battery_data.csv']

## Concat dataframes

In [20]:
# File names encode the simulation scenario as
# "<usage>_usage_<charging>_charging..."; compile the pattern once rather
# than on every loop iteration.
scenario_pattern = re.compile(
    r'(low|normal|high)_usage_(optimal|full|frequent_top_ups|deep_discharge)_charging'
)

# One DataFrame per scenario file, each tagged with its scenario labels
dataframes = []

for file_path in all_files:
    # Only the base name carries the scenario labels
    filename = os.path.basename(file_path)

    match = scenario_pattern.match(filename)
    if match is None:
        # Appending an unlabelled frame would leave NaN in both pattern
        # columns after concatenation (and after the categorical encoding),
        # silently corrupting the training data -- so fail loudly instead.
        raise ValueError(
            f"Cannot extract usage/charging pattern from file name: {filename!r}"
        )
    usage_pattern, charging_pattern = match.groups()

    # Read the CSV file and tag every row with the scenario it came from
    df = pd.read_csv(file_path)
    df['Usage_Pattern'] = usage_pattern
    df['Charging_Pattern'] = charging_pattern

    dataframes.append(df)

In [26]:
# Stack the per-scenario frames row-wise into a single DataFrame;
# ignore_index=True discards each file's own row index and renumbers
# the rows consecutively from 0.
combined_df = pd.concat(objs=dataframes, axis=0, ignore_index=True)

# Preview the first five rows
combined_df.head(n=5)

Unnamed: 0,Date,Temperature,Charge_Start,Charge_End,Daily_Cycles,Total_Cycles,SOH,Usage_Pattern,Charging_Pattern
0,2021-09-04,15.157374,20,80,0.231811,0.231811,99.994349,low,optimal
1,2021-09-05,9.269284,20,100,0.3893,0.621111,99.985886,low,optimal
2,2021-09-06,11.597141,20,80,0.390959,1.01207,99.978053,low,optimal
3,2021-09-07,6.811304,20,80,0.254491,1.266561,99.973179,low,optimal
4,2021-09-08,7.221331,20,80,0.367787,1.634349,99.968001,low,optimal


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
combined_df.describe()
# Note: by default describe() only reports statistics for the numeric columns

Unnamed: 0,Temperature,Charge_Start,Charge_End,Daily_Cycles,Total_Cycles,SOH
count,6576.0,6576.0,6576.0,6576.0,6576.0,6576.0
mean,12.645496,22.5,87.177616,0.808221,442.658414,92.164178
std,13.879742,17.737846,8.91887,0.381455,334.85196,5.431924
min,-19.997573,5.0,80.0,0.200166,0.231811,78.080741
25%,3.8496,10.0,80.0,0.56318,184.267487,88.348713
50%,12.19629,20.0,80.0,0.74863,370.458671,93.19524
75%,21.032504,20.0,95.0,0.933777,640.737454,96.694674
max,49.98673,60.0,100.0,1.994392,1642.765464,99.994349


In [30]:
# List the column names to confirm the two scenario columns were added
combined_df.columns

Index(['Date', 'Temperature', 'Charge_Start', 'Charge_End', 'Daily_Cycles',
       'Total_Cycles', 'SOH', 'Usage_Pattern', 'Charging_Pattern'],
      dtype='object')

## Categorical encoding 

In [31]:
# Integer codes for the two categorical columns. Each code is the
# category's position in the corresponding tuple below.
usage_levels = ('low', 'normal', 'high')
charging_modes = ('optimal', 'full', 'frequent_top_ups', 'deep_discharge')

usage_mapping = {level: code for code, level in enumerate(usage_levels)}
charging_mapping = {mode: code for code, mode in enumerate(charging_modes)}

In [32]:
# Replace each category label with its integer code, column by column.
# Series.map yields NaN for any value that is not a key of the mapping.
for column, mapping in (
    ('Usage_Pattern', usage_mapping),
    ('Charging_Pattern', charging_mapping),
):
    combined_df[column] = combined_df[column].map(mapping)

In [34]:
# Inspect the last 10 rows to check that both pattern columns now hold integer codes
combined_df.tail(10)

Unnamed: 0,Date,Temperature,Charge_Start,Charge_End,Daily_Cycles,Total_Cycles,SOH,Usage_Pattern,Charging_Pattern
6566,2024-08-25,32.567768,60,80,0.735912,814.521245,85.149257,1,2
6567,2024-08-26,28.951197,60,80,0.672082,815.193328,85.134437,1,2
6568,2024-08-27,15.218613,60,80,0.772801,815.966129,85.118146,1,2
6569,2024-08-28,46.256955,60,80,0.886897,816.853026,85.101375,1,2
6570,2024-08-29,19.907714,60,80,0.689899,817.542925,85.086609,1,2
6571,2024-08-30,25.876485,60,80,0.762703,818.305627,85.071588,1,2
6572,2024-08-31,33.863431,60,100,0.820101,819.125728,85.059048,1,2
6573,2024-09-01,6.618893,60,80,0.563366,819.689095,85.047666,1,2
6574,2024-09-02,11.072825,60,80,0.962244,820.651339,85.031057,1,2
6575,2024-09-03,3.271074,60,80,0.707031,821.35837,85.016532,1,2


## Save the resulting dataframe

In [36]:
# Destination of the processed dataset: the combined frame, including the
# encoded Usage_Pattern and Charging_Pattern columns.
folder_path = '../data/processed'
file_name = 'combined_dataframe.csv'
output_path = f'{folder_path}/{file_name}'

# Make sure the target folder is there (no error if it already exists)
os.makedirs(folder_path, exist_ok=True)

# Write the CSV without the row index
combined_df.to_csv(output_path, index=False)