# Preprocessing

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
machine_failure = pd.read_csv('data/ai4i2020.csv')

In [3]:
machine_failure.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


# Pipeline para preprocesamiento

## Quita columnas de ID

In [4]:
machine_failure = machine_failure.drop(columns=['UDI', 'Product ID'])

## One-Hot Encoding para variables categóricas

In [5]:
machine_failure = pd.get_dummies(machine_failure, drop_first=True, dtype=int)

In [6]:
machine_failure.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_L,Type_M
0,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,0,1
1,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,1,0
2,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,1,0
3,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,1,0
4,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,1,0


## Label Encoding para clase de falla

In [7]:
not_failure_type = (
    (machine_failure['TWF'] == 0)
    & (machine_failure['HDF'] == 0)
    & (machine_failure['PWF'] == 0)
    & (machine_failure['OSF'] == 0)
    & (machine_failure['RNF'] == 0)
)

In [8]:
machine_failure[(machine_failure['Machine failure']) == 1 & not_failure_type]

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_L,Type_M
1221,297.0,308.3,1399,46.4,132,0,0,0,0,0,1,0,1
1302,298.6,309.8,1505,45.7,144,0,0,0,0,0,1,1,0
1437,298.8,309.9,1439,45.2,40,1,0,0,0,0,0,0,0
1748,298.4,307.7,1626,31.1,166,0,0,0,0,0,1,0,0
2072,299.6,309.5,1570,35.5,189,0,0,0,0,0,1,1,0
2559,299.3,309.0,1447,50.4,140,0,0,0,0,0,1,1,0
2749,299.7,309.2,1685,28.9,179,1,0,0,0,0,0,0,1
3065,300.1,309.2,1687,27.7,95,0,0,0,0,0,1,0,1
3452,301.6,310.5,1602,32.3,2,0,0,0,0,0,1,0,0
4044,301.9,310.9,1419,47.7,20,1,0,0,0,0,0,0,1


In [9]:
machine_failure[machine_failure['RNF'] == 1]

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_L,Type_M
1221,297.0,308.3,1399,46.4,132,0,0,0,0,0,1,0,1
1302,298.6,309.8,1505,45.7,144,0,0,0,0,0,1,1,0
1748,298.4,307.7,1626,31.1,166,0,0,0,0,0,1,0,0
2072,299.6,309.5,1570,35.5,189,0,0,0,0,0,1,1,0
2559,299.3,309.0,1447,50.4,140,0,0,0,0,0,1,1,0
3065,300.1,309.2,1687,27.7,95,0,0,0,0,0,1,0,1
3452,301.6,310.5,1602,32.3,2,0,0,0,0,0,1,0,0
3611,301.7,310.9,1405,46.4,207,1,1,0,0,0,1,1,0
5471,302.7,312.3,1346,61.2,170,0,0,0,0,0,1,1,0
5489,302.6,312.1,1499,35.0,215,0,0,0,0,0,1,1,0


In [10]:
machine_failure['Flags quantity'] = (
    machine_failure['Machine failure']
    + machine_failure['TWF']
    + machine_failure['HDF']
    + machine_failure['PWF']
    + machine_failure['OSF']
    + machine_failure['RNF']
)

In [11]:
machine_failure['Flags quantity'].value_counts()

Flags quantity
0    9643
2     306
1      27
3      23
4       1
Name: count, dtype: int64

In [12]:
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    display(machine_failure[machine_failure['Flags quantity'] == 1])

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_L,Type_M,Flags quantity
1221,297.0,308.3,1399,46.4,132,0,0,0,0,0,1,0,1,1
1302,298.6,309.8,1505,45.7,144,0,0,0,0,0,1,1,0,1
1437,298.8,309.9,1439,45.2,40,1,0,0,0,0,0,0,0,1
1748,298.4,307.7,1626,31.1,166,0,0,0,0,0,1,0,0,1
2072,299.6,309.5,1570,35.5,189,0,0,0,0,0,1,1,0,1
2559,299.3,309.0,1447,50.4,140,0,0,0,0,0,1,1,0,1
2749,299.7,309.2,1685,28.9,179,1,0,0,0,0,0,0,1,1
3065,300.1,309.2,1687,27.7,95,0,0,0,0,0,1,0,1,1
3452,301.6,310.5,1602,32.3,2,0,0,0,0,0,1,0,0,1
4044,301.9,310.9,1419,47.7,20,1,0,0,0,0,0,0,1,1


In [13]:
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    display(machine_failure[machine_failure['Flags quantity'] == 3])

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_L,Type_M,Flags quantity
69,298.9,309.0,1410,65.7,191,1,0,0,1,1,0,1,0,3
1324,298.8,310.1,1243,74.5,194,1,0,0,1,1,0,0,1,3
1496,298.0,308.7,1268,69.4,189,1,0,0,1,1,0,1,0,3
3611,301.7,310.9,1405,46.4,207,1,1,0,0,0,1,1,0,3
3854,302.4,311.0,1338,67.6,194,1,0,0,1,1,0,1,0,3
3943,302.3,311.4,1333,66.7,205,1,0,0,1,1,0,1,0,3
4254,302.6,311.0,1284,68.0,114,1,0,1,1,0,0,1,0,3
4342,301.7,309.8,1284,68.2,111,1,0,1,1,0,0,0,1,3
4370,302.0,309.9,1308,57.6,197,1,0,1,0,1,0,1,0,3
4383,301.7,309.5,1298,65.5,229,1,0,1,0,1,0,1,0,3


In [14]:
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    display(machine_failure[machine_failure['Flags quantity'] == 4])

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_L,Type_M,Flags quantity
5909,300.7,310.2,1364,65.3,208,1,1,0,1,1,0,0,0,4


In [15]:
print(f'TWF failures {machine_failure["TWF"].sum()}')
print(f'HDF failures {machine_failure["HDF"].sum()}')
print(f'PWF failures {machine_failure["PWF"].sum()}')
print(f'OSF failures {machine_failure["OSF"].sum()}')
print(f'RNF failures {machine_failure["RNF"].sum()}')

TWF failures 46
HDF failures 115
PWF failures 95
OSF failures 98
RNF failures 19


In [16]:
machine_failure['Flags quantity 5'] = (
    machine_failure['TWF']
    + machine_failure['HDF']
    + machine_failure['PWF']
    + machine_failure['OSF']
    + machine_failure['RNF']
)

In [17]:
machine_failure['Failure type'] = 0

In [18]:
machine_failure['Failure type'].value_counts()

Failure type
0    10000
Name: count, dtype: int64

In [19]:
one_failure_mask = machine_failure['Flags quantity 5'] == 1

In [20]:
machine_failure.loc[one_failure_mask & machine_failure['TWF'] == 1, 'Failure type'] = 1
machine_failure.loc[one_failure_mask & machine_failure['HDF'] == 1, 'Failure type'] = 2
machine_failure.loc[one_failure_mask & machine_failure['PWF'] == 1, 'Failure type'] = 3
machine_failure.loc[one_failure_mask & machine_failure['OSF'] == 1, 'Failure type'] = 4
machine_failure.loc[one_failure_mask & machine_failure['RNF'] == 1, 'Failure type'] = 5

In [21]:
machine_failure['Failure type'].value_counts()

Failure type
0    9676
2     106
3      80
4      78
1      42
5      18
Name: count, dtype: int64

In [22]:
two_failure_mask = machine_failure['Flags quantity 5'] == 2
two_failure_with_rnf_mask = two_failure_mask & machine_failure['RNF'] == 1

In [25]:
machine_failure[two_failure_with_rnf_mask]

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_L,Type_M,Flags quantity,Flags quantity 5,Failure type
3611,301.7,310.9,1405,46.4,207,1,1,0,0,0,1,1,0,3,2,1


In [23]:
machine_failure.loc[two_failure_with_rnf_mask & machine_failure['TWF'] == 1, 'Failure type'] = 1
machine_failure.loc[two_failure_with_rnf_mask & machine_failure['HDF'] == 1, 'Failure type'] = 2
machine_failure.loc[two_failure_with_rnf_mask & machine_failure['PWF'] == 1, 'Failure type'] = 3
machine_failure.loc[two_failure_with_rnf_mask & machine_failure['OSF'] == 1, 'Failure type'] = 4

In [24]:
machine_failure['Failure type'].value_counts()

Failure type
0    9675
2     106
3      80
4      78
1      43
5      18
Name: count, dtype: int64

In [32]:
two_failure_without_rnf_mask = (machine_failure['Flags quantity 5'] == 2) & (machine_failure['RNF'] == 0)

In [33]:
machine_failure[two_failure_without_rnf_mask]

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_L,Type_M,Flags quantity,Flags quantity 5,Failure type
69,298.9,309.0,1410,65.7,191,1,0,0,1,1,0,1,0,3,2,0
1324,298.8,310.1,1243,74.5,194,1,0,0,1,1,0,0,1,3,2,0
1496,298.0,308.7,1268,69.4,189,1,0,0,1,1,0,1,0,3,2,0
3854,302.4,311.0,1338,67.6,194,1,0,0,1,1,0,1,0,3,2,0
3943,302.3,311.4,1333,66.7,205,1,0,0,1,1,0,1,0,3,2,0
4254,302.6,311.0,1284,68.0,114,1,0,1,1,0,0,1,0,3,2,0
4342,301.7,309.8,1284,68.2,111,1,0,1,1,0,0,0,1,3,2,0
4370,302.0,309.9,1308,57.6,197,1,0,1,0,1,0,1,0,3,2,0
4383,301.7,309.5,1298,65.5,229,1,0,1,0,1,0,1,0,3,2,0
4417,302.6,310.4,1365,66.8,80,1,0,1,1,0,0,1,0,3,2,0


In [34]:
# 6 is the class for multi-failure
machine_failure.loc[two_failure_without_rnf_mask, 'Failure type'] = 6

In [35]:
machine_failure['Failure type'].value_counts()

Failure type
0    9653
2     106
3      80
4      78
1      43
6      22
5      18
Name: count, dtype: int64

In [36]:
multi_failure_mask = machine_failure['Flags quantity 5'] > 2

In [39]:
machine_failure[multi_failure_mask]

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_L,Type_M,Flags quantity,Flags quantity 5,Failure type
5909,300.7,310.2,1364,65.3,208,1,1,0,1,1,0,0,0,4,3,6


In [37]:
machine_failure.loc[multi_failure_mask, 'Failure type'] = 6

In [38]:
machine_failure['Failure type'].value_counts()

Failure type
0    9652
2     106
3      80
4      78
1      43
6      23
5      18
Name: count, dtype: int64

In [40]:
machine_failure['Failure type'].value_counts().sum()

10000

## Train-Test splitting

## StandardScaler para variables numéricas