In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.metrics import accuracy_score, r2_score, mean_squared_error

In [2]:
features_test_dataset_filepath = './data/features/test/features_df.csv'
models_folder = './models'

In [3]:
clf = joblib.load(f'{models_folder}/classifier.pkl')
reg_A = joblib.load(f'{models_folder}/regressor_A.pkl')
reg_B = joblib.load(f'{models_folder}/regressor_B.pkl')
reg_C = joblib.load(f'{models_folder}/regressor_C.pkl')

In [4]:
df_test = pd.read_csv(features_test_dataset_filepath).drop(columns=['Unnamed: 0'])

df_test.replace([np.inf, -np.inf], np.nan, inplace=True)
df_test = df_test.dropna(axis=1)

bins = [0, 600, 1100, 2000]
labels = ['A', 'B', 'C']
df_test['class'] = pd.cut(df_test['cycle_life'], bins=bins, labels=labels)

In [5]:
df_test.head()

Unnamed: 0,cell,cycle,cycle_life,IR,end_of_charge_t,start_of_discharge_t,V_max,V_min,V_median,V_sum,...,discharge_temp_peak_t,discharge_temp_peak,V_sup_lim_reach_t,final_discharge_v,dV_peak,dQdV_valley,dQdV_valley_V,RUL,SOH,class
0,b1c11,0,449,0.017229,10.209185,45.53737,3.60007,1.99984,3.420933,3167.423205,...,60.300291,37.369352,23.576606,2.344689,2.5e-05,-7.583251,3.154701,448,98.918192,A
1,b1c11,1,449,0.017362,10.204722,45.119448,3.600098,1.999774,3.420433,3166.274973,...,60.062076,37.067156,23.616642,2.335622,1.8e-05,-7.602911,3.152823,447,98.955033,A
2,b1c11,2,449,0.017189,10.210132,44.750791,3.600077,1.999832,3.417394,3165.93394,...,59.450485,37.840231,23.751229,2.334996,2e-05,-7.027643,3.158207,446,99.027497,A
3,b1c11,3,449,0.017122,10.155769,44.368468,3.600094,1.998165,3.415709,3163.465832,...,59.061921,37.274923,23.768822,2.332085,2e-05,-7.440042,3.153158,445,99.068025,A
4,b1c11,4,449,0.017046,10.162919,44.087029,3.600095,1.998842,3.414471,3164.457576,...,59.045128,37.548783,23.761191,2.32792,2.3e-05,-7.108435,3.156528,444,99.091039,A


In [6]:
cells_names = df_test['cell'].unique()
cell_class_dict = []
for name in cells_names:
    cell_class_dict.append({'cell': name, 'class': df_test[df_test['cell']==name]['class'].unique()[0]})
count_df = pd.DataFrame(pd.DataFrame(cell_class_dict).value_counts('class'))
count_df['prop'] = (count_df['count']/count_df['count'].sum()).round(2)
count_df

Unnamed: 0_level_0,count,prop
class,Unnamed: 1_level_1,Unnamed: 2_level_1
A,15,0.75
B,4,0.2
C,1,0.05


In [7]:
X_test = df_test.drop(columns=['cell', 'cycle', 'cycle_life', 'RUL', 'SOH', 'class'])
y_test = df_test[['RUL']]

In [8]:
y_test['class'] = clf.predict(X_test)
X_test['class'] = clf.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['class'] = clf.predict(X_test)


In [9]:
X_test_A = X_test[X_test['class']=='A'].drop(columns=['class'])
X_test_B = X_test[X_test['class']=='B'].drop(columns=['class'])
X_test_C = X_test[X_test['class']=='C'].drop(columns=['class'])

y_test_A = y_test[y_test['class']=='A'].drop(columns=['class'])
y_test_B = y_test[y_test['class']=='B'].drop(columns=['class'])
y_test_C = y_test[y_test['class']=='C'].drop(columns=['class'])

In [10]:
y_pred_A = reg_A.predict(X_test_A)
y_pred_B = reg_B.predict(X_test_B)
y_pred_C = reg_C.predict(X_test_C)

In [11]:
r2_A = r2_score(y_test_A, y_pred_A)
r2_B = r2_score(y_test_B, y_pred_B)
r2_C = r2_score(y_test_C, y_pred_C)

rmse_A = mean_squared_error(y_test_A, y_pred_A)**.5
rmse_B = mean_squared_error(y_test_B, y_pred_B)**.5
rmse_C = mean_squared_error(y_test_C, y_pred_C)**.5

In [12]:
print(f'R2 class A = {r2_A}')
print(f'R2 class B = {r2_B}')
print(f'R2 class C = {r2_C}')

print(f'RMSE class A = {rmse_A}')
print(f'RMSE class B = {rmse_B}')
print(f'RMSE class C = {rmse_C}')

R2 class A = 0.9954207316031947
R2 class B = 0.9093331102801046
R2 class C = 0.9970313776439821
RMSE class A = 9.41346324779537
RMSE class B = 85.9424751119332
RMSE class C = 18.104109839712198
