In [241]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Preparation & Preprocessing

The target feature ( machine failure ) is in unbalance state and the goal of this work is not find solutions for this issue but only to explore and study the SSL & AL techniques , that's why we'll extract the dataset to a balanced version which we'll use on this following work.

In [242]:
data_labeled = pd.read_csv("data/labeled.csv")
data_unlabeled = pd.read_csv("data/unlabeled.csv")
columns = data_labeled.columns
data_labeled

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136424,136424,M22284,M,300.1,311.4,1530,37.5,210,0,0,0,0,0,0
136425,136425,H38017,H,297.5,308.5,1447,49.1,2,0,0,0,0,0,0
136426,136426,L54690,L,300.5,311.8,1524,38.5,214,0,0,0,0,0,0
136427,136427,L53876,L,301.7,310.9,1447,46.3,42,0,0,0,0,0,0


In [243]:
print("labeledset")
print("Machine failure (from): ",np.unique(data_labeled.loc[:,"Machine failure"],return_counts=True)[1])

# number of rows to keep in majority class
count_to_extract = np.unique(data_labeled.loc[:,"Machine failure"],return_counts=True)[1][1]

# extracting from the majority class
new_data_labeled_0 = np.array(data_labeled[data_labeled.loc[:,"Machine failure"] == 0])
np.random.shuffle(new_data_labeled_0)
new_data_labeled_0 = new_data_labeled_0[0:count_to_extract]

# concatenation to get the ne dataset
new_data_labeled_1 = np.array(data_labeled[data_labeled.loc[:,"Machine failure"] == 1])
new_data_labeled = np.vstack((new_data_labeled_1,new_data_labeled_0))
data_labeled = pd.DataFrame(new_data_labeled,columns=columns)

print("Machine failure (to): ",np.unique(data_labeled.loc[:,"Machine failure"],return_counts=True)[1])

labeledset
Machine failure (from):  [134281   2148]
Machine failure (to):  [2148 2148]


In [244]:
# drop ID's columns
data_labeled = data_labeled.drop(['Product ID','id'],axis=1)
data_unlabeled = data_unlabeled.drop(['Product ID','id'],axis=1)

# one hot to categorial features
data_labeled = pd.get_dummies(data_labeled,columns=['Type'])
data_unlabeled = pd.get_dummies(data_unlabeled,columns=['Type'])
data_labeled

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_H,Type_L,Type_M
0,303.9,312.8,1345,56.5,21,1,0,0,0,0,0,0,1,0
1,302.5,310.4,1307,54.8,174,1,0,1,0,0,0,0,1,0
2,300.7,309.7,1878,27.9,20,1,0,0,0,0,0,0,0,1
3,297.3,308.6,1258,61.8,144,1,0,0,0,1,0,0,1,0
4,303.0,311.3,1341,51.0,174,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4291,298.3,308.8,1556,39.2,92,0,0,0,0,0,0,0,0,1
4292,298.3,309.4,1573,33.8,143,0,0,0,0,0,0,1,0,0
4293,295.8,306.5,1509,49.2,76,0,0,0,0,0,0,0,1,0
4294,300.7,311.2,1334,51.9,12,0,0,0,0,0,0,0,1,0


In [245]:
# normalization min-max to a column
def normalize_min_max(column):
    return (column - column.min()) / (column.max() - column.min())

# to all dataset
data_labeled = data_labeled.apply(normalize_min_max)
data_unlabeled = data_unlabeled.apply(normalize_min_max)
data = pd.concat((data_labeled,data_unlabeled),ignore_index=True)
data

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_H,Type_L,Type_M
0,0.94382,0.883117,0.096188,0.723901,0.083004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.786517,0.571429,0.0739,0.700549,0.687747,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.58427,0.480519,0.408798,0.331044,0.079051,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.202247,0.337662,0.045161,0.796703,0.56917,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.842697,0.688312,0.093842,0.648352,0.687747,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95245,0.769231,0.703704,0.183935,0.502747,0.059289,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95246,0.285714,0.506173,0.217695,0.412088,0.12253,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95247,0.032967,0.061728,0.19383,0.516484,0.73913,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95248,0.307692,0.259259,0.213038,0.501374,0.272727,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [247]:
# extracting inputs data and target feature
inputs = data.drop('Machine failure',axis=1)
target = data.loc[:,'Machine failure']

After this step of preprocessing we get a complete dataset with a target column labeled at 4.5% (4296 rows with labels and 90954 without)

# 1. Semi-Supervised Learning SSL

## 1.1 Self training (pseudo-labeling)

cette technique revient à considérer les prédictions faites par le modèle sur les données non labellisées comme de nouveaux labels dont le modèle pourra se servir pour affiner son apprentissage.

In [249]:
###################################################################

#Fonctions

def labeled_unlabeled_split(data,target):
    labeled_data = data[target.isna() == False] 
    unlabeled_data = data[target.isna()]
    
    return labeled_data,unlabeled_data,target[target.isna() == False]

###################################################################

In [250]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, Y_train, Y_test = train_test_split(inputs,target, test_size=0.3, random_state=42)

X_train_labeled , X_train_unlabeled , Y_train_labeled = labeled_unlabeled_split(X_train,Y_train)
Y_train_labeled = Y_train_labeled.astype(int)

X_test_labeled , X_test_unlabeled , Y_test_labeled = labeled_unlabeled_split(X_test,Y_test)
Y_train_labeled = Y_train_labeled.astype(int)

In [254]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

tree_model = DecisionTreeClassifier()
grid_search = GridSearchCV(tree_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_labeled, Y_train_labeled)

best_params = grid_search.best_params_

In [262]:
# first step 

# fiting the model with the dataset labeled
best_tree_model = DecisionTreeClassifier(**best_params)
best_tree_model.fit(X_train_labeled, Y_train_labeled)
prediction = best_tree_model.predict(X_train_labeled)
confidence_score = accuracy_score(prediction, Y_train_labeled)
print(confidence_score)

0.9366032699366033


In [266]:
# second step

# if the confidence score is higher than a supposed threshold we can annotate the unlabeled dataset
new_Y_train_unlabeled = best_tree_model.predict(X_train_unlabeled)

In [272]:
new_target = np.hstack((new_Y_train_labeled,Y_train_unlabeled))

In [273]:
new_target.shape

(66675,)

In [274]:
X_train.shape

(66675, 13)

In [281]:
new_inputs = np.vstack((X_train_labeled , X_train_unlabeled))
new_inputs.shape

(66675, 13)

In [284]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

# Entraîner le modèle sur les données d'entraînement
model.fit(new_inputs,new_target)

# Faire des prédictions sur l'ensemble de test
y_pred = model.predict(new_inputs)

# Calculer l'accuracy du modèle
accuracy = accuracy_score(new_target, y_pred)
accuracy

0.9165054368203974

In [285]:
best_tree_model = DecisionTreeClassifier(**best_params)
best_tree_model.fit(new_inputs, new_target)
prediction = best_tree_model.predict(new_inputs)
accuracy_score(prediction, new_target)

0.9167304086989126

In [278]:
print(target)

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
95245    NaN
95246    NaN
95247    NaN
95248    NaN
95249    NaN
Name: Machine failure, Length: 95250, dtype: object
