In [27]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error

In [2]:
# Relative path of the CSV holding the training feature table.
features_train_dataset_filepath = './data/features/train/features_df.csv'

In [3]:
# Cycle-life class edges. pd.cut uses right-closed intervals by default, so
# A = (0, 600], B = (600, 1100], C = (1100, 2000]; values outside get NaN.
bins = [0, 600, 1100, 2000]
labels = ['A', 'B', 'C']

# Load the training features, turn +/-inf into NaN so that dropna(axis=1)
# removes every column containing a missing or non-finite value, then append
# the cycle-life class as the last column.
df_train = (
    pd.read_csv(features_train_dataset_filepath)
    .drop(columns=['Unnamed: 0'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
    .assign(**{'class': lambda frame: pd.cut(frame['cycle_life'], bins=bins, labels=labels)})
)

In [4]:
# Preview the first rows of the cleaned training frame, including the derived 'class' column.
df_train.head()

Unnamed: 0,cell,cycle,cycle_life,IR,end_of_charge_t,start_of_discharge_t,V_max,V_min,V_median,V_sum,...,discharge_temp_peak_t,discharge_temp_peak,V_sup_lim_reach_t,final_discharge_v,dV_peak,dQdV_valley,dQdV_valley_V,RUL,SOH,class
0,b1c14,0,426,0.017317,10.212414,42.892137,3.600088,1.999769,3.396744,3151.683218,...,57.964251,35.15638,8.803805,2.334505,2.1e-05,-6.988153,3.15297,425,98.431167,A
1,b1c14,1,426,0.017187,10.20307,42.408679,3.600065,1.999888,3.396222,3156.253267,...,57.19272,34.877557,8.745489,2.32896,2.1e-05,-6.916812,3.152241,424,98.382676,A
2,b1c14,2,426,0.017008,10.196563,41.470584,3.6001,1.999874,3.391148,3150.13795,...,56.252178,34.624506,8.69103,2.32354,2.5e-05,-7.034547,3.155947,423,98.3987,A
3,b1c14,3,426,0.016933,10.217278,42.249823,3.600106,1.999843,3.395058,3158.460539,...,57.023454,34.981057,8.629457,2.321956,1.4e-05,-6.644012,3.152074,422,98.420615,A
4,b1c14,4,426,0.01685,10.187401,41.638438,3.600076,1.999544,3.392202,3154.958304,...,56.816983,35.322489,9.02508,2.319709,2.2e-05,-7.28386,3.146466,421,98.359591,A


In [5]:
# Count how many distinct cells fall into each cycle-life class (train set).
cells_names = df_train['cell'].unique()

# One record per cell; the class is taken from the first distinct value seen
# among that cell's rows.
cell_class_dict = [
    {
        'cell': cell_id,
        'class': df_train.loc[df_train['cell'] == cell_id, 'class'].unique()[0],
    }
    for cell_id in cells_names
]

per_cell_classes = pd.DataFrame(cell_class_dict)
count_df = pd.DataFrame(per_cell_classes.value_counts('class'))

# Share of cells per class, rounded to two decimals.
total_cells = count_df['count'].sum()
count_df['prop'] = (count_df['count'] / total_cells).round(2)
count_df

Unnamed: 0_level_0,count,prop
class,Unnamed: 1_level_1,Unnamed: 2_level_1
A,48,0.77
B,9,0.15
C,5,0.08


In [6]:
# Classification target: the cycle-life class derived from 'cycle_life'.
y_train = df_train['class']

# Model inputs: every remaining column once the columns that are not used as
# features (including the target itself) are removed.
X_train = df_train.drop(
    columns=['cell', 'cycle', 'cycle_life', 'RUL', 'SOH', 'class'],
)

In [7]:
# Extra-trees classifier with 100 trees; random_state is fixed so repeated runs build the same forest.
clf = ExtraTreesClassifier(n_estimators=100, random_state=0)

In [8]:
# Fit the classifier on the training feature columns against the cycle-life class labels.
clf.fit(X_train, y_train)

In [9]:
# Relative path of the CSV holding the validation feature table.
features_val_dataset_filepath = './data/features/val/features_df.csv'

In [10]:
# Same class edges as used for the training frame. pd.cut uses right-closed
# intervals by default: A = (0, 600], B = (600, 1100], C = (1100, 2000];
# values outside those edges get NaN.
bins = [0, 600, 1100, 2000]
labels = ['A', 'B', 'C']

# Load the validation features, turn +/-inf into NaN so that dropna(axis=1)
# removes every column containing a missing or non-finite value, then append
# the cycle-life class as the last column.
df_val = (
    pd.read_csv(features_val_dataset_filepath)
    .drop(columns=['Unnamed: 0'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
    .assign(**{'class': lambda frame: pd.cut(frame['cycle_life'], bins=bins, labels=labels)})
)

In [11]:
# Count how many distinct cells fall into each cycle-life class (validation set).
cells_names = df_val['cell'].unique()

# One record per cell; the class is taken from the first distinct value seen
# among that cell's rows.
cell_class_dict = [
    {
        'cell': cell_id,
        'class': df_val.loc[df_val['cell'] == cell_id, 'class'].unique()[0],
    }
    for cell_id in cells_names
]

per_cell_classes = pd.DataFrame(cell_class_dict)
count_df = pd.DataFrame(per_cell_classes.value_counts('class'))

# Share of cells per class, rounded to two decimals.
total_cells = count_df['count'].sum()
count_df['prop'] = (count_df['count'] / total_cells).round(2)
count_df

Unnamed: 0_level_0,count,prop
class,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16,0.76
B,4,0.19
C,1,0.05


In [12]:
# Validation inputs for the classifier.
#
# The train and validation frames each drop NaN-containing columns
# independently, so their surviving columns are not guaranteed to match.
# Select exactly the columns the classifier was fitted on, in the same order:
# extra validation-only columns are ignored, and a column missing from the
# validation frame raises a KeyError here rather than a less specific error
# inside predict().
X_val = df_val[X_train.columns]

# Ground-truth cycle-life class for every validation row.
y_val = df_val['class']

In [13]:
# Predict a class for every validation row and score it against the true
# labels. Accuracy is computed over rows of the validation frame, not per cell.
y_pred = clf.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

1.0

In [14]:
# One independent RUL regressor per cycle-life class, all configured alike
# (100 trees, fixed seed). The generator builds three separate instances.
reg_A, reg_B, reg_C = (
    ExtraTreesRegressor(n_estimators=100, random_state=0) for _ in range(3)
)

In [15]:
# Split the training rows by cycle-life class so each regressor only sees
# rows of its own class.
df_train_A, df_train_B, df_train_C = (
    df_train[df_train['class'] == class_label] for class_label in ('A', 'B', 'C')
)

In [16]:
# Per-class regression inputs: drop the columns that are not used as features.
X_train_A, X_train_B, X_train_C = (
    class_frame.drop(columns=['cell', 'cycle', 'cycle_life', 'RUL', 'SOH', 'class'])
    for class_frame in (df_train_A, df_train_B, df_train_C)
)

# Per-class regression target: the 'RUL' column.
y_train_A, y_train_B, y_train_C = (
    class_frame['RUL'] for class_frame in (df_train_A, df_train_B, df_train_C)
)

In [17]:
# Fit each class-specific regressor on the rows of its own class only;
# the target in every case is that subset's 'RUL' column.
reg_A.fit(X_train_A, y_train_A)
reg_B.fit(X_train_B, y_train_B)
reg_C.fit(X_train_C, y_train_C)

In [18]:
# Split the validation rows by class. Note that this uses the true class
# derived from 'cycle_life', not the classifier's predictions, so the
# regressors below are evaluated with the correct class assumed known.
df_val_A, df_val_B, df_val_C = (
    df_val[df_val['class'] == class_label] for class_label in ('A', 'B', 'C')
)

In [19]:
# Per-class validation inputs and targets.
#
# The train and validation frames each drop NaN-containing columns
# independently, so their surviving columns are not guaranteed to match.
# Select exactly the columns each regressor was fitted on, in the same order:
# extra validation-only columns are ignored, and a column missing from the
# validation frame raises a KeyError here rather than a less specific error
# inside predict().
X_val_A = df_val_A[X_train_A.columns]
y_val_A = df_val_A['RUL']

X_val_B = df_val_B[X_train_B.columns]
y_val_B = df_val_B['RUL']

X_val_C = df_val_C[X_train_C.columns]
y_val_C = df_val_C['RUL']

In [20]:
# Predict RUL for each class subset with the regressor trained on that class.
y_pred_A, y_pred_B, y_pred_C = (
    regressor.predict(val_features)
    for regressor, val_features in (
        (reg_A, X_val_A),
        (reg_B, X_val_B),
        (reg_C, X_val_C),
    )
)

In [24]:
# Coefficient of determination per class.
r2_A, r2_B, r2_C = (
    r2_score(actual, predicted)
    for actual, predicted in (
        (y_val_A, y_pred_A),
        (y_val_B, y_pred_B),
        (y_val_C, y_pred_C),
    )
)

# Root-mean-squared error per class: square root of the MSE, so the value is
# in the same units as 'RUL'.
rmse_A, rmse_B, rmse_C = (
    mean_squared_error(actual, predicted) ** .5
    for actual, predicted in (
        (y_val_A, y_pred_A),
        (y_val_B, y_pred_B),
        (y_val_C, y_pred_C),
    )
)

In [25]:
# Report the per-class validation metrics, one per line.
print(
    f'R2 class A = {r2_A}',
    f'R2 class B = {r2_B}',
    f'R2 class C = {r2_C}',
    f'RMSE class A = {rmse_A}',
    f'RMSE class B = {rmse_B}',
    f'RMSE class C = {rmse_C}',
    sep='\n',
)

R2 class A = 0.9981742225370295
R2 class B = 0.8739537986384862
R2 class C = 0.9993607021015031
RMSE class A = 5.957984930680047
RMSE class B = 102.13838942891468
RMSE class C = 8.681018745470455


In [28]:
# Persist the fitted classifier and the three per-class regressors.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
os.makedirs('./models', exist_ok=True)

joblib.dump(clf, './models/classifier.pkl')
joblib.dump(reg_A, './models/regressor_A.pkl')
joblib.dump(reg_B, './models/regressor_B.pkl')
joblib.dump(reg_C, './models/regressor_C.pkl')

['./models/regressor_C.pkl']