# HW2 of DAML
- Dataset: maintenance prediction

## Preprocessing

In [70]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('./maintenance_prediction.csv')

In [71]:
print("Missing values in each column:")
print(df.isnull().sum())

# 2. Check for duplicates
print(f"Number of duplicate rows: {df.duplicated().sum()}")
df.drop_duplicates(inplace=True)

Missing values in each column:
date       0
device     0
failure    0
metric1    0
metric2    0
metric3    0
metric4    0
metric5    0
metric6    0
metric7    0
metric8    0
metric9    0
dtype: int64
Number of duplicate rows: 1


## Q1

In [72]:
# Count unique device IDs
unique_devices = df['device'].nunique()
print(f'Number of unique device IDs: {unique_devices}')

Number of unique device IDs: 1169


## Q2

In [73]:
# Summary statistics
summary_stats = df.describe()
print(summary_stats)

# Failure rate
failure_rate = df['failure'].mean()
print(f'Failure rate: {failure_rate}')

# Correlation analysis (excluding non-numeric columns)
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
correlation_matrix = df[numeric_columns].corr()
print(correlation_matrix)

             failure       metric1        metric2        metric3  \
count  124493.000000  1.244930e+05  124493.000000  124493.000000   
mean        0.000851  1.223875e+08     159.493988       9.940977   
std         0.029167  7.045934e+07    2179.686488     185.748875   
min         0.000000  0.000000e+00       0.000000       0.000000   
25%         0.000000  6.128346e+07       0.000000       0.000000   
50%         0.000000  1.227971e+08       0.000000       0.000000   
75%         0.000000  1.833091e+08       0.000000       0.000000   
max         1.000000  2.441405e+08   64968.000000   24929.000000   

             metric4        metric5        metric6        metric7  \
count  124493.000000  124493.000000  124493.000000  124493.000000   
mean        1.741134      14.222719  260173.031022       0.292531   
std        22.908598      15.943082   99151.389285       7.436954   
min         0.000000       1.000000       8.000000       0.000000   
25%         0.000000       8.000000  22145

Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,1/1/2015,S1F01085,0,215630672,55,0,52,6,407438,0,0,7
1,1/1/2015,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,1/1/2015,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,1/1/2015,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,1/1/2015,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
124489,11/2/2015,Z1F0MA1S,0,18310224,0,0,0,10,353705,8,8,0
124490,11/2/2015,Z1F0Q8RT,0,172556680,96,107,4,11,332792,0,0,13
124491,11/2/2015,Z1F0QK05,0,19029120,4832,0,0,11,350410,0,0,0
124492,11/2/2015,Z1F0QL3N,0,226953408,0,0,0,12,358980,0,0,0


## Q3

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Features and target
X = df.drop(columns=['date', 'device', 'failure'])
y = df['failure']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

X

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     37319
           1       0.00      0.00      0.00        29

    accuracy                           1.00     37348
   macro avg       0.50      0.50      0.50     37348
weighted avg       1.00      1.00      1.00     37348



Unnamed: 0,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,215630672,55,0,52,6,407438,0,0,7
1,61370680,0,3,0,6,403174,0,0,0
2,173295968,0,0,0,12,237394,0,0,0
3,79694024,0,0,0,6,410186,0,0,0
4,135970480,0,0,0,15,313173,0,0,3
...,...,...,...,...,...,...,...,...,...
124489,18310224,0,0,0,10,353705,8,8,0
124490,172556680,96,107,4,11,332792,0,0,13
124491,19029120,4832,0,0,11,350410,0,0,0
124492,226953408,0,0,0,12,358980,0,0,0


In [75]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define oversampling and undersampling strategies
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)

# Create a pipeline
steps = [('o', over), ('u', under), ('model', RandomForestClassifier(random_state=42))]
pipeline = Pipeline(steps=steps)

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("Random Forest Classification Report (with SMOTE and undersampling):")
print(classification_report(y_test, y_pred))
df


Random Forest Classification Report (with SMOTE and undersampling):
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     37319
           1       0.02      0.17      0.04        29

    accuracy                           0.99     37348
   macro avg       0.51      0.58      0.52     37348
weighted avg       1.00      0.99      1.00     37348



Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,1/1/2015,S1F01085,0,215630672,55,0,52,6,407438,0,0,7
1,1/1/2015,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,1/1/2015,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,1/1/2015,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,1/1/2015,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
124489,11/2/2015,Z1F0MA1S,0,18310224,0,0,0,10,353705,8,8,0
124490,11/2/2015,Z1F0Q8RT,0,172556680,96,107,4,11,332792,0,0,13
124491,11/2/2015,Z1F0QK05,0,19029120,4832,0,0,11,350410,0,0,0
124492,11/2/2015,Z1F0QL3N,0,226953408,0,0,0,12,358980,0,0,0


In [76]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Initialize and train the SVM model
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions
svm_y_pred = svm_model.predict(X_test)

# Evaluate the model
print("SVM Classification Report:")
print(classification_report(y_test, svm_y_pred))


SVM Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     37319
           1       0.00      0.00      0.00        29

    accuracy                           1.00     37348
   macro avg       0.50      0.50      0.50     37348
weighted avg       1.00      1.00      1.00     37348



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [77]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Initialize and train the GBM model
gbm_model = GradientBoostingClassifier(random_state=42)
gbm_model.fit(X_train, y_train)

# Make predictions
gbm_y_pred = gbm_model.predict(X_test)

# Evaluate the model
print("Gradient Boosting Classification Report:")
print(classification_report(y_test, gbm_y_pred))


Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     37319
           1       0.00      0.00      0.00        29

    accuracy                           1.00     37348
   macro avg       0.50      0.50      0.50     37348
weighted avg       1.00      1.00      1.00     37348



In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize and train the Logistic Regression model
log_reg_model = LogisticRegression(random_state=42, max_iter=10000)
log_reg_model.fit(X_train, y_train)

# Make predictions
log_reg_y_pred = log_reg_model.predict(X_test)

# Evaluate the model
print("Logistic Regression Classification Report:")
print(classification_report(y_test, log_reg_y_pred))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     37319
           1       0.00      0.00      0.00        29

    accuracy                           1.00     37348
   macro avg       0.50      0.50      0.50     37348
weighted avg       1.00      1.00      1.00     37348



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize and train the Logistic Regression model with class weights
log_reg_model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=10000)
log_reg_model.fit(X_train, y_train)

# Make predictions
log_reg_y_pred = log_reg_model.predict(X_test)

# Evaluate the model
print("Logistic Regression Classification Report (with class weight):")
print(classification_report(y_test, log_reg_y_pred))
df

Logistic Regression Classification Report (with class weight):
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     37319
           1       0.01      0.38      0.02        29

    accuracy                           0.97     37348
   macro avg       0.50      0.67      0.50     37348
weighted avg       1.00      0.97      0.98     37348



Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,1/1/2015,S1F01085,0,215630672,55,0,52,6,407438,0,0,7
1,1/1/2015,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,1/1/2015,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,1/1/2015,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,1/1/2015,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
124489,11/2/2015,Z1F0MA1S,0,18310224,0,0,0,10,353705,8,8,0
124490,11/2/2015,Z1F0Q8RT,0,172556680,96,107,4,11,332792,0,0,13
124491,11/2/2015,Z1F0QK05,0,19029120,4832,0,0,11,350410,0,0,0
124492,11/2/2015,Z1F0QL3N,0,226953408,0,0,0,12,358980,0,0,0


In [86]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train the Logistic Regression model with class weights
# log_reg_model = LogisticRegression(random_state=42, max_iter=10000)
log_reg_model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=10000)
log_reg_model.fit(X_train_resampled, y_train_resampled)

# Make predictions
log_reg_y_pred = log_reg_model.predict(X_test)

# Evaluate the model
print("Logistic Regression Classification Report (with SMOTE):")
print(classification_report(y_test, log_reg_y_pred))

joblib.dump(log_reg_model, 'log_reg_model.pkl')
# You can apply SMOTE for other models similarly
df

Logistic Regression Classification Report (with SMOTE):
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     37319
           1       0.01      0.41      0.02        29

    accuracy                           0.97     37348
   macro avg       0.50      0.69      0.50     37348
weighted avg       1.00      0.97      0.98     37348



Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,1/1/2015,S1F01085,0,215630672,55,0,52,6,407438,0,0,7
1,1/1/2015,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,1/1/2015,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,1/1/2015,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,1/1/2015,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
124489,11/2/2015,Z1F0MA1S,0,18310224,0,0,0,10,353705,8,8,0
124490,11/2/2015,Z1F0Q8RT,0,172556680,96,107,4,11,332792,0,0,13
124491,11/2/2015,Z1F0QK05,0,19029120,4832,0,0,11,350410,0,0,0
124492,11/2/2015,Z1F0QL3N,0,226953408,0,0,0,12,358980,0,0,0


## Q4

In [82]:
# Feature importance
feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importances)

metric1    0.339165
metric6    0.264325
metric4    0.090679
metric5    0.087963
metric2    0.085172
metric8    0.051276
metric7    0.040269
metric9    0.029875
metric3    0.011275
dtype: float64


## Q5

In [87]:
# Data from the image
new_data = {
    'metric1': [127175526, 4527376],
    'metric2': [4109.433, 0],
    'metric3': [3.90566, 0],
    'metric4': [54.63208, 3],
    'metric5': [15.46226, 24],
    'metric6': [258303.5, 0],
    'metric7': [30.62264, 0],
    'metric8': [30.62264, 0],
    'metric9': [23.08491, 0]
}

new_df = pd.DataFrame(new_data)

# Load the saved model
log_reg_model = joblib.load('log_reg_model.pkl')

# Make predictions for the new data
new_predictions = log_reg_model.predict(new_df)
print("Predictions for new data:", new_predictions)

Predictions for new data: [1 0]
