In [None]:
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Setting seed for reproducibility
np.random.seed(1234)  
# NOTE(review): this only binds a Python variable named PYTHONHASHSEED; it does not
# set the PYTHONHASHSEED environment variable. Nothing in the visible cells reads it.
PYTHONHASHSEED = 0
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Activation, Conv1D, MaxPooling1D, Masking
%matplotlib inline

In [None]:
# Read the space-separated training file; it has no header row.
jet = pd.read_table('../train_data.txt', sep=' ', header=None)
# Discard columns 26 and 27, then label the 26 columns that remain.
jet = jet.drop([26, 27], axis=1)
train_sensor_names = ['s' + str(i) for i in range(1, 22)]
jet.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3'] + train_sensor_names
# train_df is a second name for the same DataFrame object, not a copy.
train_df = jet
train_df.head()

In [None]:
# Read the space-separated test file; it has no header row.
td = pd.read_table('../test_data.txt', sep=' ', header=None)
# Discard columns 26 and 27, then label the 26 columns that remain.
td = td.drop([26, 27], axis=1)
test_sensor_names = ['s' + str(i) for i in range(1, 22)]
td.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3'] + test_sensor_names
td.head()

In [None]:
from sklearn import cluster

In [None]:
# K-means model with 6 clusters. random_state is pinned so the cluster ids do not
# depend on the global NumPy RNG state or on the order in which cells were executed.
# n_init is given explicitly because its default differs between scikit-learn releases.
k_cluster = cluster.KMeans(n_clusters=6, n_init=10, random_state=1234)

In [None]:
def max_min_normalize(df, reference=None):
    """Min-max scale ``df`` so that the reference minimum maps to 0 and the maximum to 1.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Values to scale.
    reference : pandas.DataFrame or pandas.Series, optional
        Object whose min/max define the scaling. Defaults to ``df`` itself.
        Pass the training data here to scale another data set with the
        training statistics.

    Returns
    -------
    Same type as ``df``
        ``(df - min) / (max - min)``. Where ``max == min`` the divisor is
        replaced by 1, so the result is ``df - min`` instead of NaN (0/0).
    """
    source = df if reference is None else reference
    lower = source.min()
    span = source.max() - lower
    if hasattr(span, 'where'):
        # Per-column ranges (DataFrame input): avoid dividing by a zero range.
        span = span.where(span != 0, 1)
    elif span == 0:
        # Scalar range (Series input).
        span = 1
    return (df - lower) / span

In [None]:
# The three setting columns are the clustering inputs for both data sets.
setting_columns = ['setting1','setting2','setting3']
feature = jet[setting_columns]
test_feature = td[setting_columns]

In [None]:
# Training settings are scaled with their own min/max.
train_data = np.array(max_min_normalize(feature))
# Test settings are scaled with the *training* min/max so that they are expressed in
# the same coordinates as the data the k-means model is fitted on. Scaling the test set
# with its own min/max would shift the points whenever the two ranges differ.
train_min = feature.min()
train_range = feature.max() - train_min
test_data = np.array((test_feature - train_min) / train_range)

In [None]:
# Fit the k-means model on the normalized training setting columns.
k_cluster.fit(train_data)

In [None]:
# Cluster id assigned to each training row while fitting.
train_labels = k_cluster.labels_
train_mode = pd.DataFrame(train_labels)
# Cluster id predicted for each test row by the fitted model.
test_labels = k_cluster.predict(test_data)
test_mode = pd.DataFrame(test_labels)

In [None]:
# One-hot encode the cluster id of every training row into mode1..mode6:
# cluster 0 -> mode1, cluster 1 -> mode2, ..., cluster 5 -> mode6.
train_mode_columns = ['mode1','mode2','mode3','mode4','mode5','mode6']
train_cluster_ids = train_mode[0].values
for cluster_id, column in enumerate(train_mode_columns):
    # 1 where the row belongs to this cluster, 0 everywhere else
    train_df[column] = (train_cluster_ids == cluster_id).astype('int64')

In [None]:
# One-hot encode the predicted cluster id of every test row into mode1..mode6:
# cluster 0 -> mode1, cluster 1 -> mode2, ..., cluster 5 -> mode6.
test_mode_columns = ['mode1','mode2','mode3','mode4','mode5','mode6']
test_cluster_ids = test_mode[0].values
for cluster_id, column in enumerate(test_mode_columns):
    # 1 where the row belongs to this cluster, 0 everywhere else
    td[column] = (test_cluster_ids == cluster_id).astype('int64')

In [None]:
# Data labeling - add column RUL: for each id, the largest cycle recorded
# for that id minus the cycle of the current row.
rul = train_df.groupby('id', as_index=False)['cycle'].max()
rul = rul.rename(columns={'cycle': 'max'})
train_df = train_df.merge(rul, on='id', how='left')
# pop() removes the helper column 'max' and hands its values to the subtraction.
train_df['RUL'] = train_df.pop('max') - train_df['cycle']

train_df.head()

In [None]:
# Min-max scale every training column except id, cycle and RUL.
# cycle_norm starts as a copy of cycle and is scaled, while cycle itself stays raw.
train_df['cycle_norm'] = train_df['cycle']
cols_normalize = train_df.columns.difference(['id','cycle','RUL'])
min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(train_df[cols_normalize])
norm_train_df = pd.DataFrame(scaled_values, columns=cols_normalize, index=train_df.index)
# Columns that were left out of scaling, joined back with the scaled ones.
kept_cols = train_df.columns.difference(cols_normalize)
join_df = train_df[kept_cols].join(norm_train_df)
# Restore the original column order.
train_df = join_df.reindex(columns=train_df.columns)
train_df

In [None]:
# Normalize the columns in the test data with the scaler fitted on the training data.
# Fitting a second MinMaxScaler on the test set would map the same raw value to
# different scaled values in train and test, so only transform() is used here.
# NOTE: td is overwritten with the scaled values, so run this cell once per fresh
# kernel; running it again would scale the already-scaled columns a second time.
td['cycle_norm'] = td['cycle']
# Same column set (and sorted order) the training scaler was fitted on; td has no RUL column.
cols_normalize = td.columns.difference(['id','cycle'])
norm_test_df = pd.DataFrame(min_max_scaler.transform(td[cols_normalize]),
                            columns=cols_normalize,
                            index=td.index)
# Re-attach the unscaled columns and restore the original column order.
join_df = td[td.columns.difference(cols_normalize)].join(norm_test_df)
td = join_df.reindex(columns = td.columns)
td

In [None]:
# Write both processed frames to CSV without the index column.
export_targets = (
    (td, './data/processed_test.csv'),
    (train_df, './data/processed_train.csv'),
)
for frame, csv_path in export_targets:
    frame.to_csv(csv_path, index=False)