In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Loading dataFrame

In [2]:
df = pd.read_csv('predictive_maintenance_dataset.csv')
df.head()

Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,1/1/2015,S1F01085,0,215630672,55,0,52,6,407438,0,0,7
1,1/1/2015,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,1/1/2015,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,1/1/2015,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,1/1/2015,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


# Data Engineering

In [3]:
# we add the activedays and note that 0 = sunday
df.date = pd.to_datetime(df.date)

df['activedays']=df.date-df.date[0]

df['month']=df['date'].dt.month
df['week_day']=df.date.dt.weekday
df['week_day'].replace(0,7,inplace=True)
df.head()

Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9,activedays,month,week_day
0,2015-01-01,S1F01085,0,215630672,55,0,52,6,407438,0,0,7,0 days,1,3
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0,0 days,1,3
2,2015-01-01,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0,0 days,1,3
3,2015-01-01,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0,0 days,1,3
4,2015-01-01,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3,0 days,1,3


In [4]:
# we create the column "max_date"
df_date = df.groupby('device').agg({'date':max})
df_date.date.to_dict()
df['max_date']=df.device.map(df_date.date.to_dict())
df.head()

Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9,activedays,month,week_day,max_date
0,2015-01-01,S1F01085,0,215630672,55,0,52,6,407438,0,0,7,0 days,1,3,2015-01-06
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0,0 days,1,3,2015-01-06
2,2015-01-01,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0,0 days,1,3,2015-02-17
3,2015-01-01,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0,0 days,1,3,2015-01-06
4,2015-01-01,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3,0 days,1,3,2015-08-24


In [5]:
df1 = df.groupby('device').agg({'date':max})
df1.head()

Unnamed: 0_level_0,date
device,Unnamed: 1_level_1
S1F01085,2015-01-06
S1F013BB,2015-05-11
S1F0166B,2015-01-06
S1F01E6Y,2015-02-17
S1F01JE0,2015-01-06


In [6]:
# we create a new dataframe that contains unique value for each device. the date value is the max date value.
# which means, we have the values records related to the top date value .
df1=df1.reset_index()

df=df.reset_index(drop=True) 

df2= pd.merge(df1,df,how='left',on=['device','date'])

df2.head()

Unnamed: 0,device,date,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9,activedays,month,week_day,max_date
0,S1F01085,2015-01-06,0,128832128,56,0,52,6,409404,0,0,7,5 days,1,1,2015-01-06
1,S1F013BB,2015-05-11,0,115676688,0,0,0,5,689161,0,0,0,130 days,5,7,2015-05-11
2,S1F0166B,2015-01-06,0,7441792,0,3,0,6,404786,0,0,0,5 days,1,1,2015-01-06
3,S1F01E6Y,2015-02-17,0,147350000,0,0,0,12,259491,0,0,0,47 days,2,1,2015-02-17
4,S1F01JE0,2015-01-06,0,185424928,0,0,0,6,412151,0,0,0,5 days,1,1,2015-01-06


In [7]:
# create the "failure_before" column
df2['failure_before']=0
df2.head()

Unnamed: 0,device,date,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9,activedays,month,week_day,max_date,failure_before
0,S1F01085,2015-01-06,0,128832128,56,0,52,6,409404,0,0,7,5 days,1,1,2015-01-06,0
1,S1F013BB,2015-05-11,0,115676688,0,0,0,5,689161,0,0,0,130 days,5,7,2015-05-11,0
2,S1F0166B,2015-01-06,0,7441792,0,3,0,6,404786,0,0,0,5 days,1,1,2015-01-06,0
3,S1F01E6Y,2015-02-17,0,147350000,0,0,0,12,259491,0,0,0,47 days,2,1,2015-02-17,0
4,S1F01JE0,2015-01-06,0,185424928,0,0,0,6,412151,0,0,0,5 days,1,1,2015-01-06,0


In [8]:
df2.loc[df2.device == 'S1F136J0','failure_before'] = 1
df2.loc[df2.device == 'W1F0KCP2','failure_before'] = 1
df2.loc[df2.device == 'W1F0M35B','failure_before'] = 1
df2.loc[df2.device == 'S1F0GPFZ','failure_before'] = 1
df2.loc[df2.device == 'W1F11ZG9','failure_before'] = 1

# Data Transformation

In [9]:
cat_ftrs = ['metric3','metric4', 'metric5', 'metric7', 'metric9'] 
for col in cat_ftrs:
    df2[col]=df2[col].astype('object')

In [9]:
def str_to_num(str):
    return str.split(' ')[0]
df2.activedays = df2.activedays.astype('str')
df2.activedays=df2.activedays.apply(str_to_num)
df2.activedays = df2.activedays.astype('int')
df2.head()

Unnamed: 0,device,date,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9,activedays,month,week_day,max_date,failure_before
0,S1F01085,2015-01-06,0,128832128,56,0,52,6,409404,0,0,7,5,1,1,2015-01-06,0
1,S1F013BB,2015-05-11,0,115676688,0,0,0,5,689161,0,0,0,130,5,7,2015-05-11,0
2,S1F0166B,2015-01-06,0,7441792,0,3,0,6,404786,0,0,0,5,1,1,2015-01-06,0
3,S1F01E6Y,2015-02-17,0,147350000,0,0,0,12,259491,0,0,0,47,2,1,2015-02-17,0
4,S1F01JE0,2015-01-06,0,185424928,0,0,0,6,412151,0,0,0,5,1,1,2015-01-06,0


In [10]:
# convert month and weekday to categorical value
for col in ['month','week_day']:
    df2[col]=df2[col].astype('object')

In [11]:
# drop the metric8 because it's the same as metric7
df2.drop('metric8',axis=1,inplace=True)

# data for the pipeline and the pipeline

In [12]:
df_pipeline = df2.copy()
df_pipeline.head()

Unnamed: 0,device,date,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric9,activedays,month,week_day,max_date,failure_before
0,S1F01085,2015-01-06,0,128832128,56,0,52,6,409404,0,7,5,1,1,2015-01-06,0
1,S1F013BB,2015-05-11,0,115676688,0,0,0,5,689161,0,0,130,5,7,2015-05-11,0
2,S1F0166B,2015-01-06,0,7441792,0,3,0,6,404786,0,0,5,1,1,2015-01-06,0
3,S1F01E6Y,2015-02-17,0,147350000,0,0,0,12,259491,0,0,47,2,1,2015-02-17,0
4,S1F01JE0,2015-01-06,0,185424928,0,0,0,6,412151,0,0,5,1,1,2015-01-06,0


In [13]:
len(['metric1', 'metric2', 'metric3', 'metric4', 'metric5', 'metric6',
       'metric7', 'metric9', 'activedays', 'failure_before','device_S1F0', 'device_S1F1',
       'device_W1F0', 'device_W1F1', 'device_Z1F0', 'device_Z1F1',
       'device_Z1F2', 'month_1' , 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11','week_day_1', 'week_day_2',
       'week_day_3', 'week_day_4', 'week_day_5', 'week_day_6', 'week_day_7'])

35

In [42]:
from datetime import datetime
def pipeline(base,array,scaler):

    # our input array : array
    """
    [date d'aujourd'hui ,device name, 
    'metric1', 'metric2', 'metric3', 'metric4', 'metric5', 'metric6', 'metric7', 'metric9']
    """
    # our output array
    length = len(['metric1', 'metric2', 'metric3', 'metric4', 'metric5', 'metric6',
       'metric7', 'metric9', 'activedays', 'failure_before','device_S1F0', 'device_S1F1',
       'device_W1F0', 'device_W1F1', 'device_Z1F0', 'device_Z1F1',
       'device_Z1F2', 'month_1' , 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11','week_day_1', 'week_day_2',
       'week_day_3', 'week_day_4', 'week_day_5', 'week_day_6', 'week_day_7'])

    output_array = [0 for i in range(length)]

    # insert the device name
    match array[1][:4] :
        case "S1F0" : output_array[10] = 1
        case "S1F1" : output_array[11] = 1
        case "W1F0" : output_array[12] = 1
        case "W1F1" : output_array[13] = 1
        case "Z1F0" : output_array[14] = 1
        case "Z1F1" : output_array[15] = 1
        case "Z1F2" : output_array[16] = 1

    # get the month and weekday
    temp = array[0]
    array[0] = datetime.strptime(array[0], "%Y-%m-%d")
    month = array[0].month
    #print(f"month = {month}")
    day = array[0].weekday() +1 # LUNDI = 0 donc on ajoute 1 pour avoir lundi =1
    #print(f"day = {day}")


    # insert the weekday
    match day :
        case 1 : output_array[28] = 1
        case 2 : output_array[29] = 1
        case 3 : output_array[30] = 1
        case 4 : output_array[31] = 1
        case 5 : output_array[32] = 1
        case 6 : output_array[33] = 1
        case 7 : output_array[34] = 1

    # insert the weekday
    match month :
        case 1 : output_array[17] = 1
        case 2 : output_array[18] = 1
        case 3 : output_array[19] = 1
        case 4 : output_array[20] = 1
        case 5 : output_array[21] = 1
        case 6 : output_array[22] = 1
        case 7 : output_array[23] = 1
        case 8 : output_array[24] = 1
        case 9 : output_array[25] = 1
        case 10 : output_array[26] = 1
        case 11 : output_array[27] = 1

    #2 get the activeday attribute
    for i in base.device :
        if array[1] == i:
        #the dataframe column must be datetime type
            time = base[base.device == array[1]].date.values
            time = np.datetime_as_string(time, unit='D')[0]
            time = datetime.strptime(time,"%Y-%m-%d")
            output_array[8] = time.day
            #print(output_array[8])
            # add the days between today and 1st month 10
            new_days = datetime.strptime(temp, "%Y-%m-%d") - datetime.strptime('2015-10-01', "%Y-%m-%d")
            #print(f"difference = {new_days}")
            output_array[8] = output_array[8] + new_days.days 
            #output_array[8] = output_array[8] + new_days.days
            #print(f"active days = {output_array[8]}")
            break


    #3 did the device fail before. i didn't take in cosideration the failed_before column
    failures = base.groupby('device').agg({'failure_before':lambda x: np.sum(x)})
    for i in failures.index :
        if i == array[1] : 
            output_array[9] = failures.loc[i].failure_before
            
            #print(f"failure = {output_array[9]}") 

    #7 data standarization
    array = np.array(array)
    output_array = np.array(output_array,np.float64)
    val = scaler.transform(array[2:].reshape(1, -1))
    output_array[:8] = val.flatten()

    return output_array.reshape(1, -1)

# Data for training the model

In [16]:
df_train = df2.copy()

In [17]:
# standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# won't add metric8 because we wil dropt it later
num_ftrs =['metric1','metric2','metric3','metric4','metric5','metric6','metric7','metric9'] 
df_train[num_ftrs]=scaler.fit_transform(df_train[num_ftrs])
df_train.head()

Unnamed: 0,device,date,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric9,activedays,month,week_day,max_date,failure_before
0,S1F01085,2015-01-06,0,0.094795,-0.136309,-0.042339,0.534665,-0.521389,1.333502,-0.101656,-0.047396,5,1,1,2015-01-06,0
1,S1F013BB,2015-05-11,0,-0.092146,-0.14566,-0.042339,-0.124295,-0.60229,4.008798,-0.101656,-0.050645,130,5,7,2015-05-11,0
2,S1F0166B,2015-01-06,0,-1.630184,-0.14566,-0.038274,-0.124295,-0.521389,1.289341,-0.101656,-0.050645,5,1,1,2015-01-06,0
3,S1F01E6Y,2015-02-17,0,0.357937,-0.14566,-0.042339,-0.124295,-0.035987,-0.100105,-0.101656,-0.050645,47,2,1,2015-02-17,0
4,S1F01JE0,2015-01-06,0,0.898989,-0.14566,-0.042339,-0.124295,-0.521389,1.359772,-0.101656,-0.050645,5,1,1,2015-01-06,0


In [18]:
# drop date and max_date columns
df_train.drop(['date','max_date'],axis=1,inplace=True)

In [19]:
Id = df_train.device.values.tolist()
Id1 = [] 
for i in Id:
    i = i[:4]
    Id1.append(i)

df_train.device=Id1
df_train.head()

Unnamed: 0,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric9,activedays,month,week_day,failure_before
0,S1F0,0,0.094795,-0.136309,-0.042339,0.534665,-0.521389,1.333502,-0.101656,-0.047396,5,1,1,0
1,S1F0,0,-0.092146,-0.14566,-0.042339,-0.124295,-0.60229,4.008798,-0.101656,-0.050645,130,5,7,0
2,S1F0,0,-1.630184,-0.14566,-0.038274,-0.124295,-0.521389,1.289341,-0.101656,-0.050645,5,1,1,0
3,S1F0,0,0.357937,-0.14566,-0.042339,-0.124295,-0.035987,-0.100105,-0.101656,-0.050645,47,2,1,0
4,S1F0,0,0.898989,-0.14566,-0.042339,-0.124295,-0.521389,1.359772,-0.101656,-0.050645,5,1,1,0


In [20]:
df_train = pd.get_dummies(df_train)

  df_train = pd.get_dummies(df_train)


In [21]:
df_train.head()

Unnamed: 0,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric9,activedays,...,month_9,month_10,month_11,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6,week_day_7
0,0,0.094795,-0.136309,-0.042339,0.534665,-0.521389,1.333502,-0.101656,-0.047396,5,...,0,0,0,1,0,0,0,0,0,0
1,0,-0.092146,-0.14566,-0.042339,-0.124295,-0.60229,4.008798,-0.101656,-0.050645,130,...,0,0,0,0,0,0,0,0,0,1
2,0,-1.630184,-0.14566,-0.038274,-0.124295,-0.521389,1.289341,-0.101656,-0.050645,5,...,0,0,0,1,0,0,0,0,0,0
3,0,0.357937,-0.14566,-0.042339,-0.124295,-0.035987,-0.100105,-0.101656,-0.050645,47,...,0,0,0,1,0,0,0,0,0,0
4,0,0.898989,-0.14566,-0.042339,-0.124295,-0.521389,1.359772,-0.101656,-0.050645,5,...,0,0,0,1,0,0,0,0,0,0


In [22]:
df_train.columns

Index(['failure', 'metric1', 'metric2', 'metric3', 'metric4', 'metric5',
       'metric6', 'metric7', 'metric9', 'activedays', 'failure_before',
       'device_S1F0', 'device_S1F1', 'device_W1F0', 'device_W1F1',
       'device_Z1F0', 'device_Z1F1', 'device_Z1F2', 'month_1', 'month_2',
       'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8',
       'month_9', 'month_10', 'month_11', 'week_day_1', 'week_day_2',
       'week_day_3', 'week_day_4', 'week_day_5', 'week_day_6', 'week_day_7'],
      dtype='object')

In [23]:
len(df_train.columns)

36

In [24]:
# getting our X & Y
X = df_train.drop('failure',axis=1)
Y = df_train.failure

In [25]:
X.head(1)

Unnamed: 0,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric9,activedays,failure_before,...,month_9,month_10,month_11,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6,week_day_7
0,0.094795,-0.136309,-0.042339,0.534665,-0.521389,1.333502,-0.101656,-0.047396,5,0,...,0,0,0,1,0,0,0,0,0,0


In [26]:
Y.head(1)

0    0
Name: failure, dtype: int64

In [27]:
indexes_train = df_pipeline[df_pipeline.date < "2015-10-01"].index
X.iloc[indexes_train].head()

Unnamed: 0,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric9,activedays,failure_before,...,month_9,month_10,month_11,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6,week_day_7
0,0.094795,-0.136309,-0.042339,0.534665,-0.521389,1.333502,-0.101656,-0.047396,5,0,...,0,0,0,1,0,0,0,0,0,0
1,-0.092146,-0.14566,-0.042339,-0.124295,-0.60229,4.008798,-0.101656,-0.050645,130,0,...,0,0,0,0,0,0,0,0,0,1
2,-1.630184,-0.14566,-0.038274,-0.124295,-0.521389,1.289341,-0.101656,-0.050645,5,0,...,0,0,0,1,0,0,0,0,0,0
3,0.357937,-0.14566,-0.042339,-0.124295,-0.035987,-0.100105,-0.101656,-0.050645,47,0,...,0,0,0,1,0,0,0,0,0,0
4,0.898989,-0.14566,-0.042339,-0.124295,-0.521389,1.359772,-0.101656,-0.050645,5,0,...,0,0,0,1,0,0,0,0,0,0


In [28]:
indexes_test = df_pipeline[df_pipeline.date >= "2015-10-01"].index
X.iloc[indexes_test].head()

Unnamed: 0,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric9,activedays,failure_before,...,month_9,month_10,month_11,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6,week_day_7
60,-0.037285,-0.14566,-0.042339,-0.124295,-0.359588,1.431379,-0.101656,-0.049717,291,0,...,0,1,0,0,0,0,0,0,0,1
61,1.384632,-0.14566,-0.042339,-0.124295,-0.60229,0.882199,-0.101656,-0.050645,286,0,...,0,1,0,0,1,0,0,0,0,0
72,0.191214,0.400737,-0.042339,-0.048261,-0.359588,1.453268,-0.101656,-0.050645,284,0,...,0,1,0,0,0,0,0,0,0,1
79,1.183773,-0.14566,-0.042339,0.027773,-0.116887,0.797338,-0.101656,-0.050645,305,0,...,0,0,1,0,0,0,0,0,0,1
81,0.53581,-0.14566,-0.042339,0.040445,-0.116887,0.738315,0.630489,-0.050645,305,0,...,0,0,1,0,0,0,0,0,0,1


In [29]:
# split data
x_train , y_train , x_test , y_test = X.iloc[indexes_train] , Y.iloc[indexes_train] , X.iloc[indexes_test] , Y.iloc[indexes_test]

# ML MODEL

In [30]:
from TVSVM import TwinSVMClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

### TWIN SVM

In [213]:
params3 = {'Epsilon1': 0.1, 'Epsilon2': 0.1, 'C1': 0.1, 'C2': 0.1,'kernel_type':3,'kernel_param': 3,'fuzzy' :0}

names = "Twin SVM with RBF Kernel"
classifier = TwinSVMClassifier(**params3)
classifier.fit(x_train.values, y_train.values)
#clf = OneVsOneClassifier(classifier).fit(x_train, y_train) # or OneVsRestClassifier
y_pred = classifier.predict(x_test.values)
print(metrics.accuracy_score(y_test, y_pred))

0.02054794520547945


### TWIN SVM PCA

In [215]:
from sklearn.decomposition import PCA 
from sklearn.pipeline import Pipeline

pca = PCA(10)
x_train_pca= pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [223]:
params3 = {'Epsilon1': 0.1, 'Epsilon2': 0.1, 'C1': 0.1, 'C2': 0.1,'kernel_type':3,'kernel_param': 3,'fuzzy' :0}

names = "Twin SVM with RBF Kernel"
classifier = TwinSVMClassifier(**params3)
classifier.fit(x_train_pca, y_train.values)
#clf = OneVsOneClassifier(classifier).fit(x_train, y_train) # or OneVsRestClassifier
y_pred = classifier.predict(x_test_pca)
print(f"{metrics.accuracy_score(y_test, y_pred)*100} %")

2.054794520547945 %


### KNN

In [31]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

print(f"{metrics.accuracy_score(y_test, y_pred)*100} %")

97.94520547945206 %


## trying the pipeline (ps : it works)

In [43]:
l = ["2015-01-06","S1F01085",128832128,56,0,52,6,409404,0,7]
out = pipeline(df2,l,scaler)



In [47]:
y_pred = knn.predict(out)
y_pred



array([0], dtype=int64)