In [255]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc
import seaborn as sns; sns.set()
%matplotlib inline

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
path ='C:/Users/Asger/data/predictive_maintenance/'

from tpot import TPOTClassifier

## Import data

In [367]:
features = pd.read_csv(path + 'feature.csv', header=[0,1], index_col=0, sep=';')
labels = pd.read_csv(path + 'train_label.csv')
df = features.join(labels['label'])
df.rename(columns={('Unnamed: 1_level_0','date'):'date'}, inplace=True)

# Using only data with a label
df.dropna(subset=['label'], inplace=True)

df['date'] = pd.to_datetime(df.date, infer_datetime_format=True)
date1 = df.date.sort_values().tolist()[0]
date2 = df.date.sort_values().tolist()[-1]

print('Start date: {}'.format(date1))
print('End Date: {}'.format(date2))

print(df.shape)
df.head()

## Feature engineering

In [370]:
count_cols = [col for col in df.columns if col[0] == 'count']  # Columns of count
mean_cols = [col for col in df.columns if col[0] == 'mean']  # Columns of mean
max_cols = [col for col in df.columns if col[0] == 'max']  # Columns of the last occurence timestamp
min_cols = [col for col in df.columns if col[0] == 'min']  # Columns of the first occurence timestamp
std_cols = [col for col in df.columns if col[0] == 'std']  # Columns of standard deviation

cols = count_cols + mean_cols + max_cols + min_cols + std_cols

# Removing collums with no informaiton or very little information
emt_columns = [col for col in cols if df[col].sum()<2] 
df.drop(emt_columns, axis=1, inplace=True)

# Adding new date fratures
df['day_of_week'] = df['date'].apply(lambda x: x.dayofweek + 1)
df['month'] = df['date'].apply(lambda x: x.month)
df['week_of_month'] = df['date'].apply(lambda x: x.day//7+1)

date_features = ['day_of_week', 'month', 'week_of_month']
for d in date_features:
    df = df.join(pd.get_dummies(df[d], prefix=d)).drop(d, axis=1)

# Adding z-scores of min and max
error_ids = [col[1] for col in df.columns if col[0] == 'count']
error_ids.remove('136222250') #min and max is removed for this error, hence no z-score can be calculated

for e in error_ids:    
    df[('z_score_min', e)]=(df[('min', e)]-df[('mean', e)])/df[('std', e)]
    df[('z_score_max', e)]=(df[('max', e)]-df[('mean', e)])/df[('std', e)]

In [371]:
# Handling missing values
count_cols = [col for col in df.columns if col[0] == 'count']  # Columns of count
mean_cols = [col for col in df.columns if col[0] == 'mean']  # Columns of mean
max_cols = [col for col in df.columns if col[0] == 'max']  # Columns of the last occurence timestamp
min_cols = [col for col in df.columns if col[0] == 'min']  # Columns of the first occurence timestamp
std_cols = [col for col in df.columns if col[0] == 'std']  # Columns of standard deviation
z_score_min_cols = [col for col in df.columns if col[0] == 'z_score_min']  # Columns of standard deviation
z_score_max_cols = [col for col in df.columns if col[0] == 'z_score_max']  # Columns of standard deviation

cols = count_cols + mean_cols + max_cols + min_cols + std_cols+z_score_min_cols+z_score_max_cols 
# Fill NaN values with -1
df.update(df[cols].fillna(-1))

In [372]:
df.head()

Unnamed: 0,date,"(count, 136088194)","(count, 136088202)","(count, 136088802)","(count, 136089546)","(count, 136110468)","(count, 136216674)","(count, 136222202)","(count, 136222210)","(count, 136222234)",...,"(z_score_min, 136676666)","(z_score_max, 136676666)","(z_score_min, 136676682)","(z_score_max, 136676682)","(z_score_min, 136676698)","(z_score_max, 136676698)","(z_score_min, 136676714)","(z_score_max, 136676714)","(z_score_min, 136676754)","(z_score_max, 136676754)"
1,2015-04-05,0,0,0,0,2250,0,1,2,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,2015-05-05,0,0,0,0,1700,0,2,5,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,2015-06-05,0,0,0,0,2261,0,10,7,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,2015-07-05,0,0,0,0,2950,0,5,8,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,2015-08-05,0,0,0,0,2810,0,7,1,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


### Label imbalance

In [373]:
df['label'].sum()/len(df_train)

0.07028753993610223

In [403]:
#Create one-day in advance label
df['label_1'] = df['label'].shift(-1)

# Using only data with a label
df.dropna(subset=['label_1'], inplace=True) #shift(-1) causes last observation to be without a label

#Splitting data
test_date= df.date.tolist()[-130]
df_train = df[df.date<val_date]
df_test = df[df.date>=val_date]

#Define inputs
train_x = df_train.drop(['label','label_1','date'],axis=1)
train_y = df_train['label_1']

test_x = df_test.drop(['label','label_1','date'],axis=1)
test_y = df_test['label_1']

## Using AutoML to create pipeline

In [378]:
tp_clf = TPOTClassifier(verbosity=2, generations=10, max_eval_time_mins=2, population_size=40, cv=5,scoring='roc_auc')
tp_clf.fit(train_x,train_y)

                                                                                                                       

Generation 1 - Current best internal CV score: 0.5426150121065374


                                                                                                                       

Generation 2 - Current best internal CV score: 0.5430992736077481


                                                                                                                       

Generation 3 - Current best internal CV score: 0.5806295399515738


                                                                                                                       

Generation 4 - Current best internal CV score: 0.5806295399515738


                                                                                                                       

Generation 5 - Current best internal CV score: 0.5806295399515738


                                                                                                                       

Generation 6 - Current best internal CV score: 0.6174334140435834


                                                                                                                       

Generation 7 - Current best internal CV score: 0.6217917675544793


                                                                                                                       

Generation 8 - Current best internal CV score: 0.6569007263922518


                                                                                                                       

Generation 9 - Current best internal CV score: 0.6569007263922518


                                                                                                                       

Generation 10 - Current best internal CV score: 0.6569007263922518


                                                           


Best pipeline: LogisticRegression(RFE(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.3, min_samples_leaf=17, min_samples_split=11, n_estimators=100), criterion=entropy, max_features=0.35000000000000003, n_estimators=100, step=0.6500000000000001), C=10.0, dual=True, penalty=l2)


TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=10, max_eval_time_mins=2,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=40, periodic_checkpoint_folder=None,
        population_size=40, random_state=None, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)

In [383]:
tp_clf.export('C:/Users/Asger/Dropbox/projects/tpot_exports/tpot_export_110119.py')

True

In [409]:
tpot_pred = tp_clf.predict_proba(test_x)[:,:1]
roc_auc_score(test_y, tpot_pred)

0.41087962962962965

## Final pipeline

In [420]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# Score on the training set was:0.6569007263922518
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=0.3, min_samples_leaf=17, min_samples_split=11, n_estimators=100)),
    RFE(estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.35000000000000003, n_estimators=100), step=0.6500000000000001),
    LogisticRegression(C=10.0, dual=True, penalty="l2")
)

exported_pipeline.fit(train_x, train_y)
results = exported_pipeline.predict(test_x)

tpot_pred = exported_pipeline.predict_proba(test_x)[:,:1]
roc_auc_score(test_y, tpot_pred)

0.5462962962962963