# 18650 Battery Life: Survival Analysis vs Machine Learning

### Aryan Bhardwaj, Cormac Dacker, Tyler Gomez Riddick, Avery Pike

## Data Preprocessing

### Data Loading

In [3]:
import os
import glob  # pip install glob2
import pandas as pd
import warnings
import time
from tqdm import tqdm  # pip install tqdm
import sklearn
import numpy as np


In [16]:
csv_paths = ['/battery_alt_dataset/regular_alt_batteries',
             './battery_alt_dataset/second_life_batteries',
             './battery_alt_dataset/recommissioned_batteries']

root = os.path.dirname(os.path.realpath('csv_combine.py'))

def concatinateCSVs(folderPath, ignore_list=[]):  # combines all csv files in a folder into df
    # print('concatinating csvs in', folderPath)
    root = os.path.dirname(os.path.realpath('csv_combine.py'))
    warnings.filterwarnings("ignore")
    os.chdir(folderPath)
    # all the filenames with a .csv format
    allFilenames = [i for i in glob.glob("*.{}".format("csv"))]
    combinedFilesData = []
    for file in tqdm.tqdm(allFilenames, desc='Combining ' + folderPath.split('/')[-1]):
        if file in ignore_list:
            continue
        try:
            df = pd.read_csv(file)
            # # drop all rows except the one with the greatest time value
            # df = df.drop_duplicates(subset=['time'], keep='last')
            combinedFilesData.append(df)
        except pd.errors.EmptyDataError:
            print(file, "is empty")
            # os.remove(f)
            # print(f, "has been deleted")
            continue
        except pd.errors.ParserError:
            print(file, "parse error")
            continue
    try:  # concatinate all csv data in a folder
        combinedFilesData = pd.concat([file for file in combinedFilesData], )
    except ValueError:
        combinedFilesData = []
    os.chdir(root)
    return pd.DataFrame(combinedFilesData)


def combine_csvs(csv_paths):  # builds the one files from folders
    for csv_path in csv_paths:
        (concatinateCSVs(root + csv_path,
                         ignore_list=['battery40.csv', 'battery41.csv',
                                      'battery50.csv', 'battery51.csv']).to_csv(root + csv_path + '.csv', index=False))

        # * 9.30A: Battery pack 0.1 and 1.1
        # * 12.9A: Battery pack 3.1 and 2.2
        # * 14.3A: Battery pack 2.3 and 5.2
        # * 16.0A: Battery pack 0.0 and 1.0
        # * 19.0A: Battery pack 2.0, 3.0 and 2.1


        return


def convert_to_preferred_format(sec):  # only to be fancy
    sec = sec % (24 * 3600)
    sec %= 3600
    min = sec // 60
    sec %= 60
    # print("seconds value in minutes:",min)
    return "%02d:%02d" % (min, sec)


if __name__ == '__main__':
    start_time = time.time()
    combine_csvs(csv_paths)
    print('Time taken:', convert_to_preferred_format(time.time() - start_time))


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\tyler\\.vscode\\Capstone\\env\\18650-Accelerated-Battery-Life-Testing\\battery_alt_dataset\\second_life_batteries/battery_alt_dataset/regular_alt_batteries'

In [16]:
combine_csvs('./battery_alt_dataset/second_life_batteries')

Combining C:\Users\tyler\.vscode\Capstone\env\18650-Accelerated-Battery-Life-Testing.: 100%|██████████| 3/3 [00:00<00:00, 115.01it/s]


### Data Combining

Because we are focusing on only the discharge phases, we want to break each of the battery datasets into their discharge phases, and then find the average voltage, temperature, and current for each discharge cycle. Starting, we have to load in the new datasets and process them.

In [None]:
# Numbering each distinct discharge phase
def battLabeller(df):
    '''
    df - battery dataset
    feed battery dataset, add column to denote discharge phase
    '''
    label = 1
    df['dischargePhase'] = 0
    # for time in range(len(df['start_time'])-1):
    for time in tqdm(range(len(df['start_time'])-1), total=len(df['start_time'])-1):
        if df['mode'][time] == -1:
            df.at[time, 'dischargePhase'] = label
            if df['mode'][time+1] == 0 or df['mode'][time+1] == 1:
                label += 1
    return df

In [None]:
# This function calculates the time it takes the battery pack in a phase to
# discharge its charge
def timeFinder(df):
    '''
    df - dataframe
    takes dataframe and finds time from start of phase to end of phase
    '''
    df['startTime'] = 0
    df['finishTime'] = 0
    #for phase in df['dischargePhase'].unique():
    for phase in tqdm(df['dischargePhase'].unique(), total=len(df['dischargePhase'].unique())):
        currPhase = df[df['dischargePhase'] == phase]
        #print(currPhase)
        #print(phase)
        # Find start and end time
        start = currPhase['time'].iloc[0] 
        finish = currPhase['time'].iloc[-1]
        
        # Update start and end time
        df.loc[df['dischargePhase'] == phase, 'startTime'] = start
        df.loc[df['dischargePhase'] == phase, 'finishTime'] = finish
    return df

In [None]:
def process_battery_data(df):
    '''
    Takes in df of battery data and reduces each phase
    to a single row, averaging temp, voltage, and current
    and finding the time to death for each discharge phase
    
    Parameters:
    df (dataFrame): input df of battery data sent through
    battLabeller and timeFinder
    
    Returns:
    df (transformed): averaged values and new column
    end_time which is the difference of finishTime and startTime
    '''
    # Run battery df through functions time_end via difference
    df = battLabeller(df)
    df = df[df['mode'] != 0]
    df = df[df['mode'] != 1]
    df = timeFinder(df)
    cols = ['voltage_charger','temperature_battery','current_load','dischargePhase','startTime','finishTime']
    df = df[cols]
    df = df.dropna(subset=['dischargePhase'])
    
    df['time_end'] = (df['finishTime'] - df['startTime'])
    
    # Cast columns to floats
    numeric_cols = ['voltage_charger','temperature_battery','current_load']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col],errors = 'coerce')
    
    # Group by dischargePhase and aggregate other columns
    aggregated_df = df.groupby('dischargePhase').agg({
        'voltage_charger':'mean',
        'temperature_battery':'mean',
        'current_load':'mean',
        'time_end':'first'
    }).reset_index()
    aggregated_df['mode'] = 1
    
    return aggregated_df

In [None]:
# Read in combined csvs
regular = pd.read_csv('.\\battery_alt_dataset\\regular_alt_batteries.csv')
second = pd.read_csv('.\\battery_alt_dataset\\second_life_batteries.csv')
recommissioned = pd.read_csv('.\\battery_alt_dataset\\recommissioned_batteries.csv')

# Run through the battery data processor
regular = process_battery_data(regular)
second = process_battery_data(second)
recommissioned = process_battery_data(recommissioned)

# Write to new csvs
regular.to_csv('regular.csv')
second.to_csv('second.csv')
recommissioned.to_csv('recommissioned')

## Survival Analysis

All of the survival analysis done on these data was performed in R, due to our group's familiarity with working in the language with survival analysis. Thus, the next sections of this notebook will not be able to render the work we did, but the R markdowns will be included as part of the submission.