In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
import random
from Helper import prepare_train_data

# Location of the dataset folder and the three FD003 files read by this notebook.
data_link = './Data/'
train_link, test_link, rul_link = 'train_FD003.txt', 'test_FD003.txt', "RUL_FD003.txt"

# Seed both the stdlib and the NumPy global generators with the same value
# so that anything drawing from them is repeatable between runs.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(49)

# Data Import

In [None]:
# Column layout assigned to the raw file: two identifier columns followed by
# 24 numbered sensor columns (sensor1 ... sensor24).
columns = ['unit_number', 'time_in_cycles'] + [f'sensor{n}' for n in range(1, 25)]

# Read the space-separated training file (no header row) and discard every
# column that contains a NaN before the names are attached.
df = pd.read_csv(data_link + train_link, sep=" ", header=None).dropna(axis=1)
df.columns = columns
pd.set_option('display.max_columns', 100)

# Add RUL (prepare_train_data is a project helper; later cells rely on the
# 'RUL' column being present in its result).
df = prepare_train_data(df)
df.head()

# Data Descriptions

In [None]:
pd.set_option('display.max_columns', 100)

# Start from the standard describe() summary and append a few extra
# per-column statistics as additional rows.
des = df.describe()
extra_stats = {
    'median': df.median(),
    'coeffvariation': df.std() / df.mean(),
    'nunique': df.nunique(),
    'NullCount': df.isna().sum(),
}
for row_label, stat in extra_stats.items():
    # .values makes the assignment positional, one value per column of des.
    des.loc[row_label] = stat.values
des

In [None]:
# Check that every unit's time_in_cycles runs 1, 2, 3, ... without gaps or repeats.
# Units are taken from the data instead of assuming IDs 1..100, and each
# offending unit is reported once (with its ID and the number of mismatching
# rows) rather than printing one anonymous line per row.
for unit, tic in df.groupby('unit_number')['time_in_cycles']:
    expected = np.arange(1, len(tic) + 1)
    n_bad = int((tic.to_numpy() != expected).sum())
    if n_bad:
        print(f"TIC is discontinous for unit {unit}: {n_bad} cycle(s) out of sequence")

# Number of rows whose sensor9 reading falls below the 21.55 threshold.
print(f'Total analmolies in sensor9: {(df.sensor9 < 21.55).sum()}')

# Anomalies were noted in sensor11 and sensor16; list, per unit, how many rows
# have sensor11 below 2387.75.
print(df[df.sensor11 < 2387.75].unit_number.value_counts())

# Data plots

In [None]:
# for col in df.columns:
#   plot_data = df[df['unit_number']==1][col]
#   length = len(plot_data)
#   fig, ax = plt.subplots(1,4,figsize=(30,10),gridspec_kw={'width_ratios': [4,4,4,0.5]})
#   ax[0].plot(np.arange(length),plot_data,marker='.',alpha=0.4)
#   ax[1].scatter(np.arange(len(df[col])),df[col],marker='.',alpha=0.4)
#   sb.histplot(df[col],kde=True,ax=ax[2],)
#   #sb.histplot(df[col],ax=ax[3])
#   sb.boxplot(y=df[col],color='green',orient='v',ax=ax[3])
#   ax[0].set_ylabel(col)
#   ax[1].set_ylabel(col)
#   ax[2].set_xlabel(col)
#   ax[3].set_xlabel(col)
#   plt.show()

In [None]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler

# Compare the distribution of sensor12 after robust scaling (left panel)
# with the unscaled values (right panel).
rb = RobustScaler()
mm = MinMaxScaler()  # only referenced by the optional min-max step below
d = rb.fit_transform(df)
#d = np.sqrt(d)
#d = mm.fit_transform(d)

# fit_transform returns a plain array, so the column position is looked up by
# name. A hard-coded position silently plots a different column as soon as
# the frame's column order changes.
sensor12_pos = df.columns.get_loc('sensor12')

fig, ax = plt.subplots(1, 2, figsize=(10, 5))
sb.histplot(d[:, sensor12_pos], ax=ax[0])
ax[0].set_title('sensor12 (RobustScaler)')
sb.histplot(df['sensor12'], ax=ax[1])
ax[1].set_title('sensor12 (raw)')
plt.show()

# Display random unit_numbers

In [None]:
# # Correlation Heatmap
# sb.heatmap(df.corr(),annot=True,cmap='RdYlGn',linewidths=0.2,)
# fig = plt.gcf()
# fig.set_size_inches(25,25)
# plt.show()

In [None]:
# # Display random unit numbers
# a = [55,24,71,11]
# for col in df.columns:
#   fig, ax = plt.subplots(1,4,figsize=(30,10),gridspec_kw={'width_ratios': [1,1,1,1]})
#   for ID,un in zip(ax.ravel(),a):
#     plot_data = df[df['unit_number']==un][col]
#     length = len(plot_data)
#     ID.plot(np.arange(length),plot_data,marker='.',alpha=0.4)
#     ID.set_ylabel(col)
#     ID.set_title(un)
#   plt.show()

# RUL relations

In [None]:
# for col in df.columns:
#   plt.scatter(df['RUL'],df[col],marker='.')
#   plt.ylabel(col)
#   plt.xlabel('RUL')
#   plt.show()

# Scaling and trimming

In [None]:
# Dropping the anomalies of unit_number 10 according to findings in EDA of Sensor 9.
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignments below do not raise SettingWithCopyWarning.
df = df.loc[df.unit_number != 10].copy()

# Add 24 hour check (180) and 100 hour check (250) and week check (350 cycles):
# each flag is 1 once time_in_cycles exceeds its threshold, otherwise 0.
for flag, threshold in (('Acheck', 180), ('Bcheck', 250), ('Ccheck', 350)):
    df[flag] = (df['time_in_cycles'] > threshold).astype(int)

# Drop, in a single call, the columns marked as useless together with the
# ones marked as duplicates (sensor 15, 16, 17).
df = df.drop(columns=[
    'sensor3', 'sensor4', 'sensor8', 'sensor19', 'sensor21', 'sensor22',
    'sensor15', 'sensor16', 'sensor17',
])

df.head()

In [None]:
#scaling
from scaling import scaling
from describe import describe

# Every column except the identifiers and the target goes through the scalers.
cols_normalize = df.columns.difference(['unit_number','time_in_cycles','RUL'])

# Alternative that was tried: different scalers for different column groups.
# df_t = scaling(df,[cols_normalize,['sensor10','sensor11','sensor12']],scalers=['MinMax','Robust_Scaler'],single=False)

# Two passes through the project's scaling helper: 'Robust_Scaler' first,
# then 'MinMax' applied to the output of the first pass.
df_t = scaling(
    scaling(df, cols_normalize, scalers='Robust_Scaler', single=True),
    cols_normalize,
    scalers='MinMax',
    single=True,
)

# Alternative that was tried: square root of sensor 10-12 taken from df_t.
#df_t = df_t[df_t.columns.difference(['sensor10','sensor11','sensor12'])].join(df_t[['sensor10','sensor11','sensor12']].apply(np.sqrt))

# Re-attach the unscaled 'unit_number', 'time_in_cycles' and 'RUL' columns.
# join() aligns on the index; this assumes the helper keeps df's index — TODO confirm.
df = df_t.join(df[['unit_number','time_in_cycles','RUL']])

# Summary table from the project's describe helper.
des = describe(df)

# Keep unit_number aside as a one-column frame, then remove it from df.
unit_number = df['unit_number'].to_frame()
df = df.drop(columns='unit_number')

des

In [None]:
# Histogram of the sensor12 column of the current df, drawn on a fresh figure.
# If the cells above were run in order, df holds the scaled values at this point.
plt.figure()
sb.histplot(df['sensor12'])

In [None]:
#Set up test data
from scaling import scaling

# Same column layout as the training file: two identifier columns followed by
# 24 numbered sensor columns.
columns = ['unit_number', 'time_in_cycles'] + [f'sensor{n}' for n in range(1, 25)]
test_df = pd.read_csv(data_link + test_link, sep=" ", header=None).dropna(axis=1)
test_df.columns = columns

# Unlike the training frame, unit_number 10 is kept in the test set.
#test_df = test_df.loc[~(test_df.unit_number==10)]

# Add 24 hour check (180) and 100 hour check (250) and week check (350 cycles)
for flag, threshold in (('Acheck', 180), ('Bcheck', 250), ('Ccheck', 350)):
    test_df[flag] = (test_df['time_in_cycles'] > threshold).values.astype(int)

# Drop the columns marked as useless and the ones marked as duplicates
# (sensor 15, 16, 17) in one call.
test_df = test_df.drop(columns=[
    'sensor3', 'sensor4', 'sensor8', 'sensor19', 'sensor21', 'sensor22',
    'sensor15', 'sensor16', 'sensor17',
])

#scaling
# NOTE(review): the scaling helper is called on the test frame by itself; whether
# it re-fits the scalers here instead of reusing the training fit cannot be
# seen from this notebook — confirm.
cols_normalize = test_df.columns.difference(['unit_number','time_in_cycles'])
# df_t = scaling(test_df,[cols_normalize,['sensor10','sensor11','sensor12']],scalers=['MinMax','Robust_Scaler'],single=False)
df_t = scaling(
    scaling(test_df, cols_normalize, scalers='Robust_Scaler', single=True),
    cols_normalize,
    scalers='MinMax',
    single=True,
)
# Re-attach the unscaled 'unit_number' and 'time_in_cycles' columns.
test_df = df_t.join(test_df[['unit_number','time_in_cycles']])

# describe() summary of the test frame with extra per-column statistics
# appended as additional rows.
des = test_df.describe()
extra_stats = {
    'median': test_df.median(),
    'coeffvariation': test_df.std() / test_df.mean(),
    'nunique': test_df.nunique(),
    'NullCount': test_df.isna().sum(),
}
for row_label, stat in extra_stats.items():
    des.loc[row_label] = stat.values

# Keep, for each unit, only the row at its largest time_in_cycles.
dft = test_df
test_max = (
    dft.groupby('unit_number')['time_in_cycles']
    .max()
    .reset_index()
    .rename(columns={'time_in_cycles': 'max'})
)
dft = dft.merge(test_max, on=['unit_number'], how='left')
is_last_cycle = dft['time_in_cycles'] == dft['max']
test = (
    dft[is_last_cycle]
    .reset_index(drop=True)
    .drop(columns=['max', 'unit_number'])
)
# From here on test_df is a NumPy array; dft keeps the merged DataFrame.
test_df = test.to_numpy()

RUL = pd.read_csv(data_link + rul_link, sep=" ", header=None)

print(f"Test Dataset shape: {test_df.shape}")
print(f"Train Dataset Shape: {df.shape}")
des

In [None]:
# from Helper import train_models,score_func
# acc=[]
# eval_acc={}
# model_names = ['FOREST','LinR','LSVM','SVM','KNN','GNB','TREE','CAT']
# for model in model_names:
#     print(f'Algorithm: {model}')
#     model_1 = train_models(df,model)
#     y_pred = model_1.predict(test_df)

#     # Get y true
#     y_true = RUL[0].to_numpy()

#     RF_individual_scorelst = score_func(y_true, y_pred)
#     acc.append(RF_individual_scorelst)
     
# eval_acc={'Modelling Algorithm':model_names,'Accuracy':acc}
# eval_acc

# LSTM

In [None]:
# NOTE(review): the star import also supplies names used by later cells
# (presumably lstm_valid_evaluation and logger) — confirm before narrowing it.
from Helper import *

# Put the saved unit_number column back in front of the training frame
# for the LSTM path.
train_df = df
train_df_lstm = pd.concat([unit_number, train_df], axis=1)

# train_models receives the training frame, the merged test frame and a copy
# of the RUL table, and returns seven values that the following cells use.
lstm_inputs = [train_df_lstm, dft, RUL.copy()]
(
    model,
    history,
    lstm_test_df,
    seq_array,
    label_array,
    sequence_length,
    sequence_cols,
) = train_models(lstm_inputs, "LSTM", epoch=500)
lstm_test_evaluation_graphs(model, history, seq_array, label_array)

In [None]:
# Evaluate the trained model with the project helper; it returns three values,
# unpacked here as MAE, R2 and the predictions.
MAE, R2, y_pred = lstm_valid_evaluation(lstm_test_df, model, sequence_length, sequence_cols)
# Score list layout: [mae, rmse, r2]; the rmse slot is a literal 0 placeholder.
# R2 is rounded to two decimals and expressed as a percentage. The outer
# round() removes the float noise that multiplying after rounding introduces
# (e.g. round(0.58, 2) * 100 == 57.99999999999999).
LSTM_individual_scorelst = [round(MAE, 2), 0, round(round(R2, 2) * 100, 2)]

In [None]:
# Log a banner, then write the trained model to 'model/model3.h5'.
logger.info('-------------- Save Model ---------------')
model_path = 'model/model3.h5'
model.save(model_path)