In [128]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.utils import to_categorical
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import KMeansSMOTE
from sklearn.cluster import MiniBatchKMeans
from imblearn.over_sampling import SVMSMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df_train = pd.read_csv('training_set.csv')
df_train.head()

Unnamed: 0,Timesteps,ChassisId_encoded,af1__0,af1__1,af1__2,af1__3,af1__4,af1__5,af1__6,af1__7,...,f__239,f__240,f__241,f__242,RUL,risk_level_High,risk_level_Low,risk_level_Medium,f__22,af2__3
0,0.0,4953.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.495562,-0.928794,-0.214402,-0.09067,0.0,1,0,0,-0.009128,0
1,0.0,4955.0,0.000322,2e-05,0.057749,8.627059e-08,0.029966,0.030469,0.019837,0.000113,...,-0.484087,0.320285,2.311401,-0.026043,28.0,0,1,0,-0.009128,0
2,1.0,4955.0,0.000322,2e-05,0.057749,8.627059e-08,0.029966,0.030469,0.019837,0.000113,...,-0.472325,0.322941,2.284491,-0.026732,27.0,0,1,0,-0.009128,0
3,2.0,4955.0,0.000322,2e-05,0.057749,8.627059e-08,0.029966,0.030469,0.019837,0.000113,...,-0.509273,0.306841,2.266275,-0.027429,26.0,0,1,0,-0.009128,0
4,3.0,4955.0,0.000322,2e-05,0.057749,8.627059e-08,0.029966,0.030469,0.019837,0.000113,...,-0.513796,0.309057,2.237995,-0.02815,25.0,0,1,0,-0.009128,0


In [3]:
X = df_train.drop(['Timesteps', 'ChassisId_encoded', 'risk_level_High', 'risk_level_Low', 'risk_level_Medium'], axis=1)

In [4]:
X.shape

(157379, 289)

In [5]:
y = df_train[['risk_level_High', 'risk_level_Low', 'risk_level_Medium']]

In [6]:
y_int = np.argmax(y.values, axis=1)

In [7]:
y_int = pd.Series(y_int)

In [81]:
class_distribution = y_int.value_counts()

In [82]:
class_distribution

1    143384
0      8356
2      5639
dtype: int64

In [77]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_int, test_size=0.33, shuffle=True)

In [78]:
X_train.shape

(105443, 289)

In [79]:
y_train.shape

(105443,)

In [83]:
minority_classes = class_distribution[class_distribution < class_distribution.max()].index
majority_class = class_distribution.idxmax()

print(minority_classes) 
print(majority_class) 

Int64Index([0, 2], dtype='int64')
1


In [84]:
def upsample(X_train, y_train):    

    resampled_X_train = X_train.copy()
    resampled_y_train = y_train.copy()

    for minority_class in minority_classes:
        minority_samples = X_train[y_train == minority_class] 
        oversampled_minority_samples = minority_samples.sample(n=class_distribution.max(), replace=True, random_state=42)
        resampled_X_train = pd.concat([resampled_X_train, oversampled_minority_samples])
        resampled_y_train = pd.concat([resampled_y_train, pd.Series([minority_class] * class_distribution.max())])
    
    return resampled_X_train, resampled_y_train

In [85]:
resampled_X_train, resampled_y_train = upsample(X_train,  y_train)

In [16]:
resampled_y_train.shape

(392211,)

In [20]:
input_shape = (17, 17, 1)

model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())

model.add(Dense(64, activation='relu')) 

feature_extractor = Model(inputs=model.input, outputs=model.get_layer('dense_3').output)

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_6 (Conv2D)           (None, 15, 15, 32)        320       
                                                                 
 max_pooling2d_6 (MaxPooling  (None, 7, 7, 32)         0         
 2D)                                                             
                                                                 
 dropout_6 (Dropout)         (None, 7, 7, 32)          0         
                                                                 
 conv2d_7 (Conv2D)           (None, 5, 5, 64)          18496     
                                                                 
 max_pooling2d_7 (MaxPooling  (None, 2, 2, 64)         0         
 2D)                                                             
                                                                 
 dropout_7 (Dropout)         (None, 2, 2, 64)         

In [54]:
X_train_reshaped = resampled_X_train.values.reshape((resampled_X_train.shape[0], 17, 17, 1))

X_train_features = feature_extractor.predict(X_train_reshaped)



In [43]:
X_train_features.shape

(392211, 64)

In [23]:
X_test_reshaped = X_test.values.reshape((X_test.shape[0], 17, 17, 1))

X_test_features = feature_extractor.predict(X_test_reshaped)



In [44]:
xgb_model_ups = XGBClassifier(max_depth=4, n_estimators=100, n_jobs=-1, verbose=2, learning_rate=0.1)
xgb_model_ups.fit(X_train_features, resampled_y_train)  

Parameters: { "verbose" } are not used.



XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=-1,
              num_parallel_tree=None, objective='multi:softprob', ...)

In [45]:
xgb_predictions_ups = xgb_model_ups.predict(X_test_features)

print("XGBoost Classification Report:")
print(classification_report(y_test, xgb_predictions_ups))
print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test, xgb_predictions_ups))

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.70      0.51      2768
           1       0.99      0.86      0.92     47285
           2       0.18      0.59      0.28      1883

    accuracy                           0.84     51936
   macro avg       0.53      0.72      0.57     51936
weighted avg       0.93      0.84      0.88     51936

XGBoost Confusion Matrix:
[[ 1936   101   731]
 [ 2304 40725  4256]
 [  612   156  1115]]


In [106]:
X_test_sub = pd.read_csv('public_X_test.csv')

rul_test = pd.DataFrame(X_test_sub.groupby('ChassisId_encoded')['Timesteps'].max()).reset_index()
rul_test.columns = ['ChassisId_encoded', 'max']
X_test_sub = X_test_sub.merge(rul_test, on=['ChassisId_encoded'], how='left')
X_test_sub['RUL'] = X_test_sub['max'] - X_test_sub['Timesteps']

remove_cols = X_test_sub.columns.difference(['Timesteps','ChassisId_encoded', 'max', 'gen', 'af2__5', 'af2__6', 'af2__13', 'af2__19', 'af2__20', 'af2__22', 'af2__18', 'af2__9', 'af1__28', 'f__51','f__52','f__65','f__117','f__119','f__123','f__133'])

X_test_sub = X_test_sub[remove_cols]

input_shape = (17, 17, 1) 
X_test_sub = X_test_sub.values.reshape(-1, *input_shape)

X_test_sub_flattened = feature_extractor.predict(X_test_sub)

In [46]:
x_test_sub_predicted_classes = xgb_model_ups.predict(X_test_sub_flattened)

predictions = pd.DataFrame()
predictions['pred'] = x_test_sub_predicted_classes.ravel()

predictions['pred'] = predictions['pred'].replace({0: 'High', 1: 'Low', 2: 'Medium'})

print(predictions['pred'].value_counts())

Low       27331
Medium     3313
High       2946
Name: pred, dtype: int64


In [47]:
predictions.to_csv('prediction.csv',index=False)

In [28]:
sm = KMeansSMOTE(kmeans_estimator=MiniBatchKMeans(n_init=1, random_state=0), random_state=42, k_neighbors=5, cluster_balance_threshold=0.01)

X_res_kmeans_smote, y_res_kmeans_smote = sm.fit_resample(X_train, y_train)



In [29]:
X_train_reshaped = X_res_kmeans_smote.values.reshape((X_res_kmeans_smote.shape[0], 17, 17, 1))

X_train_features = feature_extractor.predict(X_train_reshaped)



In [30]:
y_res_kmeans_smote.shape

(288304,)

In [31]:
X_train_features.shape

(288304, 64)

In [32]:
ann_model = Sequential()
ann_model.add(Dense(64, activation='relu', input_shape=(X_train_features.shape[1],)))
ann_model.add(Dropout(0.25))
ann_model.add(Dense(32, activation='relu'))
ann_model.add(Dropout(0.25))
ann_model.add(Dense(3, activation='softmax')) 

ann_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = ann_model.fit(X_train_features, to_categorical(y_res_kmeans_smote), epochs=50, batch_size=128, validation_split=0.33, verbose=1)

loss, accuracy = ann_model.evaluate(X_test_features, to_categorical(y_test, num_classes=3), verbose=1)
print(f'Test Accuracy: {accuracy:.4f}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 0.9481


In [33]:
x_test_sub_predicted_classes = ann_model.predict(X_test_sub_flattened)

x_test_sub_predicted_classes = np.argmax(x_test_sub_predicted_classes, axis=1)

predictions = pd.DataFrame()
predictions['pred'] = x_test_sub_predicted_classes.ravel()

predictions['pred'] = predictions['pred'].replace({0: 'High', 1: 'Low', 2: 'Medium'})

print(predictions['pred'].value_counts())

Low       31873
High       1687
Medium       30
Name: pred, dtype: int64


In [34]:
xgb_model_ups.fit(X_train_features, y_res_kmeans_smote)  

xgb_predictions_ups = xgb_model_ups.predict(X_test_features)

print("XGBoost Classification Report:")
print(classification_report(y_test, xgb_predictions_ups))
print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test, xgb_predictions_ups))

Parameters: { "verbose" } are not used.



XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.58      0.55      2768
           1       0.98      0.96      0.97     47285
           2       0.35      0.47      0.40      1883

    accuracy                           0.92     51936
   macro avg       0.62      0.67      0.64     51936
weighted avg       0.93      0.92      0.93     51936

XGBoost Confusion Matrix:
[[ 1610   466   692]
 [  980 45361   944]
 [  491   507   885]]


In [35]:
x_test_sub_predicted_classes = xgb_model_ups.predict(X_test_sub_flattened)

predictions = pd.DataFrame()
predictions['pred'] = x_test_sub_predicted_classes.ravel()

predictions['pred'] = predictions['pred'].replace({0: 'High', 1: 'Low', 2: 'Medium'})

print(predictions['pred'].value_counts())

Low       29144
Medium     2789
High       1657
Name: pred, dtype: int64


In [36]:
class_distribution = y_train.value_counts().to_dict()

total_samples = sum(class_distribution.values())
class_weights = {class_label: total_samples / (len(class_distribution) * class_count) for class_label, class_count in class_distribution.items()}

minority_classes = list(class_distribution.keys())
for class_label in minority_classes:
    class_weights[class_label] *= 5  

In [37]:
X_train_reshaped = X_train.values.reshape((X_train.shape[0], 17, 17, 1))

X_train_features = feature_extractor.predict(X_train_reshaped)



In [39]:
xgb_model = XGBClassifier(class_weight = class_weights)

xgb_model.fit(X_train_features, y_train.values)  

xgb_predictions_ups = xgb_model.predict(X_test_features)

print("XGBoost Classification Report:")
print(classification_report(y_test, xgb_predictions_ups))
print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test, xgb_predictions_ups))

Parameters: { "class_weight" } are not used.



XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      2768
           1       0.98      1.00      0.99     47285
           2       0.76      0.50      0.60      1883

    accuracy                           0.97     51936
   macro avg       0.85      0.77      0.80     51936
weighted avg       0.96      0.97      0.97     51936

XGBoost Confusion Matrix:
[[ 2246   371   151]
 [   75 47072   138]
 [  452   497   934]]


In [40]:
x_test_sub_predicted_classes = xgb_model.predict(X_test_sub_flattened)

predictions = pd.DataFrame()
predictions['pred'] = x_test_sub_predicted_classes.ravel()

predictions['pred'] = predictions['pred'].replace({0: 'High', 1: 'Low', 2: 'Medium'})

print(predictions['pred'].value_counts())

Low       32803
High        724
Medium       63
Name: pred, dtype: int64


## PCA - Upsampling - CNN

In [116]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
                               
pca = PCA(n_components=100) 
X_pca = pca.fit_transform(X_scaled)

X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X_pca), y_int, test_size=0.2, random_state=42, shuffle=True)

In [90]:
y_train

15296     1
141507    1
118851    1
147290    1
54134     1
         ..
119879    1
103694    1
131932    1
146867    1
121958    1
Length: 125903, dtype: int64

In [93]:
input_shape = (10, 10, 1)

model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())

model.add(Dense(64, activation='relu')) 

feature_extractor = Model(inputs=model.input, outputs=model.get_layer('dense_8').output)

model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_10 (Conv2D)          (None, 8, 8, 32)          320       
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 4, 4, 32)         0         
 g2D)                                                            
                                                                 
 dropout_12 (Dropout)        (None, 4, 4, 32)          0         
                                                                 
 conv2d_11 (Conv2D)          (None, 2, 2, 64)          18496     
                                                                 
 max_pooling2d_11 (MaxPoolin  (None, 1, 1, 64)         0         
 g2D)                                                            
                                                                 
 dropout_13 (Dropout)        (None, 1, 1, 64)         

In [117]:
resampled_X_train, resampled_y_train = upsample(X_train,  y_train)

X_train_reshaped = resampled_X_train.values.reshape((resampled_X_train.shape[0], 10, 10, 1))

X_train_features = feature_extractor.predict(X_train_reshaped)

X_test_reshaped = X_test.values.reshape((X_test.shape[0], 10, 10, 1))

X_test_features = feature_extractor.predict(X_test_reshaped)



In [118]:
X_test_features.shape

(31476, 64)

In [119]:
xgb_model_ups.fit(X_train_features, resampled_y_train)  

xgb_predictions_ups = xgb_model_ups.predict(X_test_features)

print("XGBoost Classification Report:")
print(classification_report(y_test, xgb_predictions_ups))
print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test, xgb_predictions_ups))

Parameters: { "verbose" } are not used.



XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.67      0.42      1610
           1       0.99      0.82      0.90     28808
           2       0.15      0.57      0.23      1058

    accuracy                           0.80     31476
   macro avg       0.48      0.69      0.52     31476
weighted avg       0.93      0.80      0.85     31476

XGBoost Confusion Matrix:
[[ 1081    97   432]
 [ 2096 23638  3074]
 [  332   120   606]]


In [129]:
knn_ups = KNeighborsClassifier(n_neighbors=5)

knn_ups.fit(X_train_features, resampled_y_train)

y_pred_ups = knn_ups.predict(X_test_features)

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_ups))
print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_ups))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      1610
           1       1.00      0.99      0.99     28808
           2       0.72      0.88      0.79      1058

    accuracy                           0.98     31476
   macro avg       0.87      0.93      0.90     31476
weighted avg       0.98      0.98      0.98     31476

XGBoost Confusion Matrix:
[[ 1498     9   103]
 [   91 28450   267]
 [  110    16   932]]


In [120]:
X_test_sub.shape

(33590, 289)

In [121]:
scaler = StandardScaler()
X_test_sub_scaled = scaler.fit_transform(X_test_sub)
                               
pca = PCA(n_components=100) 
X_test_sub_pca = pca.fit_transform(X_test_sub_scaled)

In [123]:
X_test_sub_pca.shape

(33590, 100)

In [124]:
X_test_sub_reshaped = X_test_sub_pca.reshape((X_test_sub_pca.shape[0], 10, 10, 1))

X_test_sub_features = feature_extractor.predict(X_test_sub_reshaped)



In [125]:
X_test_sub_features.shape

(33590, 64)

In [126]:
x_test_sub_predicted_classes = xgb_model_ups.predict(X_test_sub_features)

predictions = pd.DataFrame()
predictions['pred'] = x_test_sub_predicted_classes.ravel()

predictions['pred'] = predictions['pred'].replace({0: 'High', 1: 'Low', 2: 'Medium'})

print(predictions['pred'].value_counts())

Low       24365
Medium     4669
High       4556
Name: pred, dtype: int64


In [127]:
predictions.to_csv('prediction.csv',index=False)

In [130]:
x_test_sub_predicted_classes = knn_ups.predict(X_test_sub_features)

predictions = pd.DataFrame()
predictions['pred'] = x_test_sub_predicted_classes.ravel()

predictions['pred'] = predictions['pred'].replace({0: 'High', 1: 'Low', 2: 'Medium'})

print(predictions['pred'].value_counts())

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Low       29604
High       2297
Medium     1689
Name: pred, dtype: int64
