In [None]:
import json
import pickle
from pprint import pprint

import kfp
import kfp.components as comp
import kfp.dsl as dsl
import numpy as np
import pandas as pd
from google.cloud import storage
from sklearn import ensemble, metrics
from sklearn.cluster import KMeans
from sklearn.model_selection import RandomizedSearchCV, train_test_split

In [None]:
# Client used at the end of the pipeline cell to submit the run.
# NOTE(review): `host` is an empty placeholder — presumably it has to be
# filled with the KFP endpoint URL before submitting; confirm.
client = kfp.Client(host='')

In [None]:
def get_clean_data(raw_data_path: str, cleaned_data_path: str = None) -> str:
    '''Load the raw engine data, add the RUL label and save the cleaned frame.

    The raw frame must have 28 columns, which are renamed positionally to:
    engine id, cycle, three operational settings, 21 sensor readings and two
    trailing filler columns (both named 'NA'), which are dropped.  The
    Remaining Useful Lifetime of a row is
    ``max cycle of its engine - cycle + 1`` and is inserted as third column.

    Args:
        raw_data_path: Location of the raw parquet file.
        cleaned_data_path: Where the cleaned parquet file is written.  When
            omitted (or empty) the file is written next to the raw file with
            a ``_cleaned.parquet`` suffix.

    Returns:
        The path the cleaned parquet file was written to.

    Raises:
        ValueError: If the raw frame does not have exactly 28 columns
            (raised by pandas when the column names are assigned).
    '''
    # Imports live inside the function because KFP serialises only the
    # function body into the component.
    import pandas as pd

    if not cleaned_data_path:
        suffix = '.parquet'
        stem = raw_data_path[:-len(suffix)] if raw_data_path.endswith(suffix) else raw_data_path
        cleaned_data_path = stem + '_cleaned' + suffix

    # raw data import
    data = pd.read_parquet(raw_data_path)

    data.columns = (
        ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3']
        + ['s%d' % i for i in range(1, 22)]
        + ['NA', 'NA']
    )
    # Both filler columns share one name; a boolean mask removes them both.
    data = data.loc[:, data.columns != 'NA'].copy()

    # Computing the RUL per row with a group-wise transform keeps the label
    # aligned with its row regardless of how the frame is ordered.
    max_cycle = data.groupby('engine_id')['cycle'].transform('max')
    data.insert(2, 'RUL', max_cycle - data['cycle'] + 1)

    data.to_parquet(cleaned_data_path, compression='GZIP')

    return cleaned_data_path

In [None]:
# Wrap get_clean_data as a KFP component and also write its spec to YAML.
# Everything the function imports has to be pip-installed into the component
# container, so pandas is listed explicitly next to the parquet engine.
# `gcsfs` is the fsspec backend for gs:// paths (the list previously said
# `gcfs`).
get_clean_data_op = comp.create_component_from_func(
    get_clean_data,
    output_component_file='get_clean_data.yaml',
    packages_to_install=['pandas', 'fastparquet', 'fsspec', 'gcsfs'],
)

In [None]:
def feature_processing(raw_data_path: str, feature_data_path: str) -> str:
    '''Derive the model features from the cleaned engine data and save them.

    Adds a 3-cluster KMeans label of the operational settings
    (``settings_clusters``) and, for every feature column, the per-engine
    running maximum (``max_*``), running minimum (``min_*``) and
    cycle-to-cycle difference (``delta_*``, 0 for an engine's first row).

    Args:
        raw_data_path: Parquet file to read.  The first three columns are
            skipped when selecting features — presumably engine_id, cycle
            and RUL as written by ``get_clean_data``; confirm.
        feature_data_path: Where the feature parquet file is written.

    Returns:
        ``feature_data_path``.
    '''
    # Imports live inside the function because KFP serialises only the
    # function body into the component.
    import pandas as pd
    from sklearn.cluster import KMeans

    data = pd.read_parquet(raw_data_path)

    # Clustering the data on the operational settings; the seed makes the
    # cluster labels reproducible between runs.
    X_cluster = data[['setting1', 'setting2', 'setting3']]
    kmeans = KMeans(n_clusters=3, random_state=42).fit(X_cluster)
    data['settings_clusters'] = kmeans.predict(X_cluster)

    # Skip the three leading columns and the cluster label just appended.
    features = data.columns[3:-1]
    per_engine = data.groupby('engine_id')

    # Collect the derived columns first and attach them in a single concat;
    # column order stays max_/min_/delta_ per feature.
    derived = {}
    for feature in features:
        derived['max_' + feature] = per_engine[feature].cummax()
        derived['min_' + feature] = per_engine[feature].cummin()
        # diff() is NaN on each engine's first row; fill on the result
        # instead of a chained inplace fillna on a column selection.
        derived['delta_' + feature] = per_engine[feature].diff().fillna(0)
    data = pd.concat([data, pd.DataFrame(derived)], axis=1)

    data.to_parquet(feature_data_path, compression='GZIP')

    print('Created an saved features.')

    return feature_data_path

In [None]:
# Wrap feature_processing as a KFP component and also write its spec to YAML.
# The function imports pandas and scikit-learn, so both must be installed in
# the component container; `gcsfs` is the fsspec backend for gs:// paths.
feature_processing_op = comp.create_component_from_func(
    feature_processing,
    output_component_file='feature_processing.yaml',
    packages_to_install=['pandas', 'fastparquet', 'fsspec', 'gcsfs', 'scikit-learn'],
)

In [None]:
def train_vanilla_gbr(feature_data_path: str, vanilla_model_store_path: str, holdout_engine: int) -> None:
    '''Train a default GradientBoostingRegressor on RUL and upload it to GCS.

    All rows of ``holdout_engine`` are excluded from training.  The remaining
    rows are split 80/20; MAE and MSE on the 20 % test part are printed.  The
    fitted model is pickled and uploaded to ``vanilla_model_store_path``.

    Args:
        feature_data_path: Parquet file with the features.  The first two
            columns are dropped and ``RUL`` must be the third column.
        vanilla_model_store_path: Destination as ``scheme://bucket/object``
            (e.g. ``gs://my-bucket/models/vanilla.pickle``).
        holdout_engine: ``engine_id`` kept out of training.

    Raises:
        ValueError: If ``vanilla_model_store_path`` has no bucket or no
            object name.
    '''
    # Imports live inside the function because KFP serialises only the
    # function body into the component.
    import os
    import pickle
    import tempfile
    from urllib.parse import urlparse

    import pandas as pd
    from google.cloud import storage
    from sklearn import ensemble
    from sklearn import metrics
    from sklearn.model_selection import train_test_split

    # Validate the destination before spending time on training.
    parsed = urlparse(url=vanilla_model_store_path, allow_fragments=False)
    blob_name = parsed.path.lstrip('/')
    if not parsed.netloc or not blob_name:
        raise ValueError(
            'vanilla_model_store_path must look like gs://bucket/object, got %r'
            % (vanilla_model_store_path,))

    data = pd.read_parquet(feature_data_path)

    # Drop the two leading columns; RUL then leads the frame.
    RUL_df = data.loc[data.engine_id != holdout_engine].iloc[:, 2:].copy()

    labels = RUL_df['RUL']
    features = RUL_df.iloc[:, 1:]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42)

    gbr_non_opt = ensemble.GradientBoostingRegressor()
    gbr_non_opt.fit(X_train, y_train)

    pred_non_opt = gbr_non_opt.predict(X_test)
    print('MAE: %s' % metrics.mean_absolute_error(y_test, pred_non_opt))
    print('MSE: %s' % metrics.mean_squared_error(y_test, pred_non_opt))

    # Pickle into a throw-away directory and upload from there.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_model_path = os.path.join(tmp_dir, 'model.pickle')
        with open(local_model_path, 'wb') as f:
            pickle.dump(gbr_non_opt, f, pickle.HIGHEST_PROTOCOL)

        client = storage.Client()
        bucket = client.get_bucket(parsed.netloc)
        bucket.blob(blob_name).upload_from_filename(local_model_path)
        

In [None]:
# Wrap train_vanilla_gbr as a KFP component and also write its spec to YAML.
# The function imports pandas, scikit-learn and google.cloud.storage, so all
# of them are installed into the component container; `gcsfs` is the fsspec
# backend for gs:// paths.
train_vanilla_gbr_op = comp.create_component_from_func(
    train_vanilla_gbr,
    output_component_file='train_vanilla_gbr.yaml',
    packages_to_install=['pandas', 'fastparquet', 'fsspec', 'gcsfs',
                         'scikit-learn', 'google-cloud-storage'],
)

In [None]:
def hyp_tune_train_gbr(feature_data_path: str, tuned_model_store_path: str, metrics_path: str, holdout_engine: int, random_iterations: int, random_params: str) -> str:
    '''Tune a GradientBoostingRegressor with a randomized search and upload it.

    All rows of ``holdout_engine`` are excluded.  The remaining rows are split
    80/20; a 3-fold ``RandomizedSearchCV`` is fitted on the 80 % part and MAE
    and MSE of the best estimator on the 20 % part are printed, written to
    ``/mlpipeline-metrics.json`` and uploaded to ``metrics_path``.  The best
    estimator is pickled and uploaded to ``tuned_model_store_path``.

    Args:
        feature_data_path: Parquet file with the features.  The first two
            columns are dropped and ``RUL`` must be the third column.
        tuned_model_store_path: Model destination as
            ``scheme://bucket/object``.
        metrics_path: Metrics destination.  Either ``scheme://bucket/object``
            or a plain object name inside the model's bucket.  Nothing is
            uploaded when it is empty.
        holdout_engine: ``engine_id`` kept out of training.
        random_iterations: Number of parameter settings the search samples.
        random_params: JSON object mapping estimator parameter names to
            lists of candidate values.

    Returns:
        ``tuned_model_store_path``.

    Raises:
        ValueError: If ``tuned_model_store_path`` has no bucket or no
            object name.
    '''
    # Imports live inside the function because KFP serialises only the
    # function body into the component.
    import json
    import os
    import pickle
    import tempfile
    from urllib.parse import urlparse

    import pandas as pd
    from google.cloud import storage
    from sklearn import ensemble
    from sklearn import metrics
    from sklearn.model_selection import train_test_split, RandomizedSearchCV

    # Validate the destination before spending time on the search.
    model_url = urlparse(url=tuned_model_store_path, allow_fragments=False)
    model_blob_name = model_url.path.lstrip('/')
    if not model_url.netloc or not model_blob_name:
        raise ValueError(
            'tuned_model_store_path must look like gs://bucket/object, got %r'
            % (tuned_model_store_path,))

    data = pd.read_parquet(feature_data_path)

    # Drop the two leading columns; RUL then leads the frame.
    RUL_df = data.loc[data.engine_id != holdout_engine].iloc[:, 2:].copy()

    labels = RUL_df['RUL']
    features = RUL_df.iloc[:, 1:]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42)

    random_grid = json.loads(random_params)

    gbr = ensemble.GradientBoostingRegressor()
    gbr_random = RandomizedSearchCV(
        estimator=gbr,
        param_distributions=random_grid,
        n_iter=int(random_iterations),  # was hard-coded to 10
        cv=3,
        verbose=2,
        random_state=42,  # same seed as the split, for repeatable sampling
    )
    gbr_random.fit(X_train, y_train)

    val_pred_random = gbr_random.predict(X_test)
    MAE_random = metrics.mean_absolute_error(y_test, val_pred_random)
    MSE_random = metrics.mean_squared_error(y_test, val_pred_random)
    print('MAE: %s' % MAE_random)
    print('MSE: %s' % MSE_random)

    # Named differently from the sklearn `metrics` module imported above.
    # `numberValue` is the key spelling used by the KFP metrics file format.
    pipeline_metrics = {
        'metrics': [
            {'name': 'tuned-mae-score', 'numberValue': float(MAE_random), 'format': 'RAW'},
            {'name': 'tuned-mse-score', 'numberValue': float(MSE_random), 'format': 'RAW'},
        ]
    }

    temp_metrics_path = '/mlpipeline-metrics.json'
    # json.dump writes text, so the file must be opened in text mode.
    with open(temp_metrics_path, 'w') as f:
        json.dump(pipeline_metrics, f)

    client = storage.Client()
    bucket = client.get_bucket(model_url.netloc)

    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_model_path = os.path.join(tmp_dir, 'model.pickle')
        with open(temp_model_path, 'wb') as f:
            pickle.dump(gbr_random.best_estimator_, f, pickle.HIGHEST_PROTOCOL)
        # Model file -> model object.
        bucket.blob(model_blob_name).upload_from_filename(temp_model_path)

    if metrics_path:
        metrics_url = urlparse(url=metrics_path, allow_fragments=False)
        if metrics_url.netloc:
            metrics_bucket = client.get_bucket(metrics_url.netloc)
            metrics_blob_name = metrics_url.path.lstrip('/')
        else:
            # Plain object name: keep it in the model's bucket.
            metrics_bucket, metrics_blob_name = bucket, metrics_path
        # Metrics file -> metrics object.
        metrics_bucket.blob(metrics_blob_name).upload_from_filename(temp_metrics_path)

    return tuned_model_store_path

In [None]:
# Wrap hyp_tune_train_gbr as a KFP component and also write its spec to YAML.
# The function imports pandas, scikit-learn and google.cloud.storage, so all
# of them are installed into the component container; `gcsfs` is the fsspec
# backend for gs:// paths.
hyp_tune_train_gbr_op = comp.create_component_from_func(
    hyp_tune_train_gbr,
    output_component_file='hyp_tune_train_gbr.yaml',
    packages_to_install=['pandas', 'fastparquet', 'fsspec', 'gcsfs',
                         'scikit-learn', 'google-cloud-storage'],
)

In [None]:
# Candidate values sampled by the randomized hyper-parameter search.
n_estimators = [150, 250, 300, 400]
max_depth = [5, 10, 50, 100, None]
# None means "use all features", which is what 'auto' stood for on
# GradientBoostingRegressor; 'auto' itself is rejected by current
# scikit-learn releases.
max_features = [None, 'sqrt']
min_samples_split = [2, 3, 5, 10]
min_samples_leaf = [1, 2, 4]

random_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

In [None]:
@dsl.pipeline(
    name='RUL gbr',
    description='Predicting the Remaining Usefull Lifetime of aircraft engines.'
)
def RUL_pipeline(raw_data_path, feature_data_path, vanilla_model_store_path, tuned_model_store_path, metrics_path, holdout_engine, random_iterations, random_params):
    '''Clean the data, build features, then train a default and a tuned GBR.

    Both training steps consume the output of the feature step, so they do
    not depend on each other.

    Args:
        raw_data_path: Raw parquet file read by the cleaning step.
        feature_data_path: Where the feature step writes its parquet file.
        vanilla_model_store_path: Upload target of the default model.
        tuned_model_store_path: Upload target of the tuned model.
        metrics_path: Upload target of the tuned model's metrics.
        holdout_engine: engine_id excluded from both training steps.
        random_iterations: Number of settings sampled by the search.
        random_params: JSON string with the search space.
    '''
    get_data_task = get_clean_data_op(raw_data_path)

    feature_processing_task = feature_processing_op(get_data_task.output, feature_data_path)

    train_vanilla_gbr_task = train_vanilla_gbr_op(feature_processing_task.output, vanilla_model_store_path, holdout_engine)

    hyp_tune_train_gbr_task = hyp_tune_train_gbr_op(feature_processing_task.output, tuned_model_store_path, metrics_path, holdout_engine, random_iterations, random_params)


# Run arguments live at module level so the submit call below can see them.
# Only parameters declared by RUL_pipeline are passed (`cleaned_data_path`
# is not one of them).  The empty strings are placeholders to fill in.
arguments = {
    'raw_data_path': '',
    'feature_data_path': '',
    'vanilla_model_store_path': '',
    'tuned_model_store_path': '',
    'metrics_path': '',
    'holdout_engine': 62,
    'random_iterations': 3,
    # The tuning component json.loads() this value, so it must be JSON text
    # rather than the str() of a dict.
    'random_params': random_grid if isinstance(random_grid, str) else json.dumps(random_grid),
}

client.create_run_from_pipeline_func(RUL_pipeline, arguments=arguments)

In [11]:
# Scratch version of the cleaning step: name the columns, drop the two
# filler columns and add the RUL label.
# NOTE(review): `data` is not loaded anywhere in the visible notebook — the
# raw frame has to be read before this cell can run on a fresh kernel.
data.columns = (
    ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3']
    + ['s%d' % i for i in range(1, 22)]
    + ['NA', 'NA']
)
# Both filler columns share one name; a boolean mask removes them both.
data = data.loc[:, data.columns != 'NA'].copy()

# Creating and adding the RUL to the dataframe.  The group-wise transform
# keeps every label aligned with its own row regardless of row order.
max_cycle = data.groupby('engine_id')['cycle'].transform('max')
data.insert(2, 'RUL', max_cycle - data['cycle'] + 1)

dataset = data
dataset.head()

Unnamed: 0,engine_id,cycle,RUL,setting1,setting2,setting3,s1,s2,s3,s4,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,192,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,191,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,190,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,189,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,188,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [24]:
# Scratch version of the feature step.
# NOTE(review): `engine`/`cycle` are not used in any visible cell and
# `feature_path` is not defined in any visible cell — confirm both.
engine = data.iloc[:, 0].to_list()
cycle = data.iloc[:, 1].to_list()

# Clustering the data on the operational settings; seeded so the cluster
# labels are reproducible between runs.
X_cluster = data[['setting1', 'setting2', 'setting3']]
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_cluster)
data['settings_clusters'] = kmeans.predict(X_cluster)

# Skip engine_id/cycle/RUL and the cluster label just appended.
features = data.columns[3:-1]
per_engine = data.groupby('engine_id')

# Creating min, max and delta variables; collected first and attached in
# one concat, keeping the max_/min_/delta_ order per feature.
derived = {}
for feature in features:
    derived['max_' + feature] = per_engine[feature].cummax()
    derived['min_' + feature] = per_engine[feature].cummin()
    # diff() is NaN on each engine's first row; fill on the result instead
    # of a chained inplace fillna on a column selection.
    derived['delta_' + feature] = per_engine[feature].diff().fillna(0)
data = pd.concat([data, pd.DataFrame(derived)], axis=1)

data.to_parquet(feature_path + 'RUL_features.parquet', compression='GZIP')

In [39]:
# Hold one engine out completely for the final validation.  The id is drawn
# from the ids actually present (randint(1, 100) could never return 100 and
# could return an absent id) and the draw is seeded so reruns pick the same
# engine.
rng = np.random.default_rng(42)
random_engine = int(rng.choice(data['engine_id'].unique()))
# Drop engine_id and cycle; RUL becomes the first column of both frames.
val_data = data.loc[data.engine_id == random_engine].iloc[:, 2:].copy()
RUL_df = data.loc[data.engine_id != random_engine].iloc[:, 2:].copy()

In [44]:
# Target is the RUL column; every column after the first is a predictor.
labels, features = RUL_df['RUL'], RUL_df.iloc[:, 1:]
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Baseline: gradient boosting regressor constructed with no arguments.
gbr_non_opt = ensemble.GradientBoostingRegressor()
# fit() is the cell's last expression, so the estimator's repr is displayed.
gbr_non_opt.fit(X_train, y_train)

GradientBoostingRegressor()

In [48]:
# Score the baseline model on the 20 % test part of the split.
pred_non_opt = gbr_non_opt.predict(X_test)
mae_non_opt = metrics.mean_absolute_error(y_test, pred_non_opt)
mse_non_opt = metrics.mean_squared_error(y_test, pred_non_opt)
print('MAE: %s' % mae_non_opt)
print('MSE: %s' % mse_non_opt)

MAE: 9.200445326669886
MSE: 142.63671398695553


In [49]:
# Candidate values sampled by the randomized hyper-parameter search.
n_estimators = [150, 250, 300, 400]
max_depth = [5, 10, 50, 100, None]
# None means "use all features", which is what 'auto' stood for on
# GradientBoostingRegressor; 'auto' itself is rejected by current
# scikit-learn releases.
max_features = [None, 'sqrt']
min_samples_split = [2, 3, 5, 10]
min_samples_leaf = [1, 2, 4]

# Stored as a JSON string (None is encoded as null).
random_grid = json.dumps({
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
})

In [None]:
gbr = ensemble.GradientBoostingRegressor()
# `random_grid` is a JSON string after the previous cell; the search needs
# the decoded dict of candidate lists.
param_distributions = json.loads(random_grid) if isinstance(random_grid, str) else random_grid
gbr_random = RandomizedSearchCV(estimator = gbr, param_distributions = param_distributions, n_iter = 10, cv = 3, verbose = 2)
gbr_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] n_estimators=300, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=None 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# Save the selected hyper-parameters; the file name has to be a string
# literal (unquoted it was an attribute lookup on an undefined name).
with open('best_param.json', 'w') as outfile:
    json.dump(gbr_random.best_params_, outfile)

In [None]:
# Pretty-print the hyper-parameter combination selected by the search.
pprint(gbr_random.best_params_)

In [None]:
# Score both models on the held-out engine.  RUL is the first column of
# val_data; everything after it is a predictor.
X_val, y_val = val_data.iloc[:,1:], val_data.iloc[:,0]

val_pred_non_opt = gbr_non_opt.predict(X_val)
val_pred_random = gbr_random.predict(X_val)
# First MAE/MSE pair: baseline model; second pair: tuned model.
print('MAE: %s' % metrics.mean_absolute_error(y_val, val_pred_non_opt))
print('MSE: %s' % metrics.mean_squared_error(y_val, val_pred_non_opt))
print('MAE: %s' % metrics.mean_absolute_error(y_val, val_pred_random))
print('MSE: %s' % metrics.mean_squared_error(y_val, val_pred_random))

In [None]:
# Keep whichever model has the lower MAE on the held-out engine.
val_mae_non_opt = metrics.mean_absolute_error(y_val, val_pred_non_opt)
val_mae_random = metrics.mean_absolute_error(y_val, val_pred_random)

# The tuned model wins only when ITS error is the smaller one (the original
# comparison was the other way around); the fitted attribute is
# `best_estimator_`, with a trailing underscore.
if val_mae_random < val_mae_non_opt:
    gbr_best = gbr_random.best_estimator_
    print('optimized version is best')
else:
    gbr_best = gbr_non_opt
    print('non optimized version is best')

In [None]:
# Persist the selected model at the exact path the upload cell reads from
# (`tmp_dir` is not defined in any visible cell).
with open('/tmp/model.pickle', 'wb') as f:
    pickle.dump(gbr_best, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Upload the pickled model.  A separate name is used for the storage client
# so the KFP `client` created at the top of the notebook is not overwritten.
# NOTE(review): `model_path` is not defined in any visible cell, and the
# bucket name contains upper-case letters, which Cloud Storage bucket names
# do not normally allow — confirm both.
storage_client = storage.Client()
bucket = storage_client.get_bucket('RUL_prediction')
blob = bucket.blob(model_path+'model.pickle')
blob.upload_from_filename('/tmp/model.pickle')