In [2]:
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [9]:
import pandas as pd

df = pd.read_csv("../data/ai4i2020.csv")
df.head()


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [58]:
# Target column
y = df["Machine failure"]

# Drop unnecessary columns
X = df.drop(["Machine failure", "UDI", "Product ID"], axis=1)


In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (8000, 11)
Test size: (2000, 11)


In [78]:
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix


In [82]:
print(X_train.dtypes[X_train.dtypes == 'object'])


Type    object
dtype: object


In [92]:
print(X_train.dtypes[X_train.dtypes == 'object'])


Series([], dtype: object)


In [94]:
print(X_train.dtypes)


Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
TWF                          int64
HDF                          int64
PWF                          int64
OSF                          int64
RNF                          int64
Type_H                        bool
Type_L                        bool
Type_M                        bool
dtype: object


In [96]:
for col in X_train.columns:
    print(f"{col}: {X_train[col].dtype}, unique values: {X_train[col].unique()[:5]}")


Air temperature [K]: float64, unique values: [298.1 298.2 298.3 298.5 298.4]
Process temperature [K]: float64, unique values: [308.6 308.7 308.5 309.  308.9]
Rotational speed [rpm]: int64, unique values: [1551 1408 1498 1433 1425]
Torque [Nm]: float64, unique values: [42.8 46.3 49.4 39.5 40. ]
Tool wear [min]: int64, unique values: [0 3 5 7 9]
TWF: int64, unique values: [0 1]
HDF: int64, unique values: [0 1]
PWF: int64, unique values: [0 1]
OSF: int64, unique values: [0 1]
RNF: int64, unique values: [0 1]
Type_H: bool, unique values: [False  True]
Type_L: bool, unique values: [False  True]
Type_M: bool, unique values: [ True False]


In [98]:
print(X_train.isnull().sum())


Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
Type_H                     0
Type_L                     0
Type_M                     0
dtype: int64


In [102]:
print(X_train.dtypes)


Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
TWF                          int64
HDF                          int64
PWF                          int64
OSF                          int64
RNF                          int64
Type_H                        bool
Type_L                        bool
Type_M                        bool
dtype: object


In [106]:
# Remove brackets and units from column names
X_train.columns = X_train.columns.str.replace(r"[\[\]<>()]", "", regex=True)
X_test.columns = X_test.columns.str.replace(r"[\[\]<>()]", "", regex=True)

print(X_train.columns)


Index(['Air temperature K', 'Process temperature K', 'Rotational speed rpm',
       'Torque Nm', 'Tool wear min', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF',
       'Type_H', 'Type_L', 'Type_M'],
      dtype='object')


In [116]:


import xgboost as xgb

model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

model.fit(X_train, y_train)



In [120]:
predictions = model.predict(X_test)


In [122]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")
print("Confusion Matrix:")
print(conf_matrix)


Precision: 1.000
Recall: 0.949
F1 Score: 0.974
Confusion Matrix:
[[1961    0]
 [   2   37]]


In [128]:
feature_stats = pd.DataFrame({
    'count': df.count(),
    'unique': df.nunique(),
    'missing_values': df.isnull().sum(),
    'mean': df.mean(numeric_only=True),
    'std': df.std(numeric_only=True),
    'min': df.min(numeric_only=True),
    '25%': df.quantile(0.25, numeric_only=True),
    '50%': df.median(numeric_only=True),
    '75%': df.quantile(0.75, numeric_only=True),
    'max': df.max(numeric_only=True),
    'skewness': df.skew(numeric_only=True)
})

feature_stats.to_csv('feature_stats.csv')

print("Detailed feature_stats.csv created successfully!")


Detailed feature_stats.csv created successfully!
