In [18]:
import pandas as pd

file_path = 'modified_data_test.csv'
data = pd.read_csv(file_path)
print(data.head())


   faultNumber  simulationRun  sample  xmeas_1  xmeas_2  xmeas_3  xmeas_4  \
0            0            2.0      21  0.25673   3668.3   4510.3   9.3270   
1            0            2.0      22  0.25728   3644.0   4482.8   9.3429   
2            0            2.0      23  0.30579   3692.9   4515.2   9.2993   
3            0            2.0      24  0.30538   3729.0   4512.3   9.3040   
4            0            2.0      25  0.27300   3678.1   4517.3   9.2570   

   xmeas_5  xmeas_6  xmeas_7  ...  xmeas_38  xmeas_39  xmeas_40  xmeas_41  \
0   26.645   42.457   2702.8  ...   0.84462   0.11533    53.435    43.598   
1   26.699   42.166   2704.1  ...   0.84462   0.11533    53.435    43.598   
2   26.911   42.110   2704.3  ...   0.84462   0.11533    53.435    43.598   
3   26.702   42.511   2704.7  ...   0.84462   0.11533    53.435    43.598   
4   26.836   42.018   2707.7  ...   0.84462   0.11533    53.435    43.598   

    xmv_1      xmv_2   xmv_3   xmv_4   xmv_5  xmv_10  
0  62.260  54.10976

#### Logistics Model "Binary  Classification"

In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Load training and testing data
train_data = pd.read_csv('modified_data_train.csv')
test_data  = pd.read_csv('modified_data_test.csv')

# 2. Convert fault numbers to binary (0 = No Fault, 1 = Faulty)
train_data['binary_fault'] = train_data['faultNumber'].apply(lambda x: 0 if x == 0 else 1)
test_data['binary_fault'] = test_data['faultNumber'].apply(lambda x: 0 if x == 0 else 1)

# 3. Drop non-predictive columns
drop_cols = ['faultNumber', 'simulationRun', 'sample', 'binary_fault']

# 4. Separate features (X) and target (y)
X_train = train_data.drop(columns=drop_cols, errors='ignore')
y_train = train_data['binary_fault']

X_test = test_data.drop(columns=drop_cols, errors='ignore')
y_test = test_data['binary_fault']

# 5. Align columns: Keep only common features
common_cols = sorted(list(set(X_train.columns).intersection(set(X_test.columns))))
X_train = X_train[common_cols]
X_test  = X_test[common_cols]

# 6. Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# 7. Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# 8. Make Predictions
y_pred = model.predict(X_test_scaled)

# 9. Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.9440110323089046
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2820
           1       0.94      1.00      0.97     47940

    accuracy                           0.94     50760
   macro avg       0.47      0.50      0.49     50760
weighted avg       0.89      0.94      0.92     50760



#### Logistics Model for "Multiclass Classification"

In [20]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Load your training and testing data
train_data = pd.read_csv('modified_data_train.csv')
test_data  = pd.read_csv('modified_data_test.csv')

# 2. Identify columns to drop (non-predictive or target columns)
drop_cols = ['faultNumber', 'simulationRun', 'sample']

# Separate features (X) and target (y)
X_train = train_data.drop(columns=drop_cols, errors='ignore')
y_train = train_data['faultNumber']

X_test = test_data.drop(columns=drop_cols, errors='ignore')
y_test = test_data['faultNumber']

# 3. Find the intersection of columns so that train and test have the same features
common_cols = sorted(list(set(X_train.columns).intersection(set(X_test.columns))))

# Keep only the common columns in the same order
X_train = X_train[common_cols]
X_test  = X_test[common_cols]

# 4. Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# 5. Create and train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# 6. Evaluate on the test set
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print('Test Accuracy:', accuracy)
print('Classification Report:')
print(classification_report(y_test, y_pred))




import numpy as np


new_data = np.array([[12.5, 4.3, 18.6, 22.1, 7.9, 5.5, 16.2, 3.8, 11.9, 14.0,
                      20.3, 9.8, 13.1, 17.5, 15.6, 19.2, 10.4, 6.7, 8.9, 12.3,
                      11.5, 14.8, 16.9, 7.3, 9.5, 15.0, 18.2, 19.8, 13.9, 10.7,
                      16.1, 12.2, 7.8, 14.4, 11.0, 13.7, 17.3, 14.6]])  

# Scale data
new_data_scaled = scaler.transform(new_data)

# Predict
fault_prediction = model.predict(new_data_scaled)

# Result
if fault_prediction[0] == 0:
    print("The data point is Non-Faulty ")
else:
    print(f"The data point is Faulty  with Fault Number: {fault_prediction[0]}")



Test Accuracy: 0.3854609929078014
Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.04      0.06      2820
           1       0.92      0.84      0.88      2820
           2       0.94      0.84      0.89      2820
           4       0.62      0.85      0.72      2820
           5       0.12      0.19      0.14      2820
           6       1.00      0.22      0.36      2820
           7       1.00      0.85      0.92      2820
           8       0.26      0.49      0.34      2820
          10       0.19      0.12      0.15      2820
          11       0.08      0.03      0.05      2820
          12       0.14      0.26      0.18      2820
          13       0.10      0.19      0.13      2820
          14       0.08      0.04      0.05      2820
          16       0.12      0.03      0.05      2820
          17       0.69      0.70      0.69      2820
          18       0.61      0.58      0.59      2820
          19       0.08 



#### Naive Bayes  "Binary Classification"

In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the training and testing data
train_data = pd.read_csv('modified_data_train.csv')
test_data  = pd.read_csv('modified_data_test.csv')

#  Convert fault numbers to binary (0 = No Fault, 1 = Faulty)
train_data['binary_fault'] = train_data['faultNumber'].apply(lambda x: 0 if x == 0 else 1)
test_data['binary_fault'] = test_data['faultNumber'].apply(lambda x: 0 if x == 0 else 1)

#  Drop non-predictive columns
drop_cols = ['faultNumber', 'simulationRun', 'sample']

# Separate features (X) and target (y)
X_train = train_data.drop(columns=drop_cols + ['binary_fault'], errors='ignore')
y_train = train_data['binary_fault']

X_test = test_data.drop(columns=drop_cols + ['binary_fault'], errors='ignore')
y_test = test_data['binary_fault']

#  Align columns between train and test
common_cols = sorted(list(set(X_train.columns).intersection(set(X_test.columns))))
X_train = X_train[common_cols]
X_test = X_test[common_cols]

#  Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  Initialize and train Gaussian Naive Bayes
model = GaussianNB()
model.fit(X_train_scaled, y_train)

#  Make predictions
y_pred = model.predict(X_test_scaled)

#  Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("🔍 Accuracy:", accuracy)
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))


🔍 Accuracy: 0.5521079590228526

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.11      1.00      0.20      2820
           1       1.00      0.53      0.69     47940

    accuracy                           0.55     50760
   macro avg       0.56      0.76      0.44     50760
weighted avg       0.95      0.55      0.66     50760



#### Naive Bayes  "Multiclass"

In [22]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# 1. Load training and testing data
train_data = pd.read_csv('modified_data_train.csv')
test_data  = pd.read_csv('modified_data_test.csv')

# 2. Define columns to drop (target and non-predictive columns)
drop_cols = ['faultNumber', 'simulationRun', 'sample']

# 3. Separate features and target
X_train = train_data.drop(columns=drop_cols, errors='ignore')
y_train = train_data['faultNumber']

X_test = test_data.drop(columns=drop_cols, errors='ignore')
y_test = test_data['faultNumber']

# 4. Align columns: select only common features in both train and test
common_cols = sorted(set(X_train.columns).intersection(set(X_test.columns)))
X_train = X_train[common_cols]
X_test  = X_test[common_cols]

# 5. Create a Pipeline with StandardScaler and GaussianNB
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('nb', GaussianNB())
])

# 6. Define hyperparameter grid for GaussianNB
param_grid = {
    'nb__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

# 7. Use GridSearchCV to find the best hyperparameters with 5-fold CV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',  # You can choose other metrics like 'f1_macro'
    n_jobs=-1,
    verbose=1
)

# 8. Fit grid search on the training data
grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)

# 9. Get the best model and evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters: {'nb__var_smoothing': 1e-06}
Test Accuracy: 0.5765957446808511
Classification Report:
               precision    recall  f1-score   support

           0       0.17      0.80      0.28      2820
           1       0.95      0.82      0.88      2820
           2       1.00      0.81      0.90      2820
           4       0.94      0.83      0.88      2820
           5       0.24      0.11      0.15      2820
           6       1.00      0.79      0.88      2820
           7       1.00      0.80      0.89      2820
           8       0.70      0.48      0.57      2820
          10       0.54      0.21      0.30      2820
          11       0.66      0.66      0.66      2820
          12       0.46      0.36      0.40      2820
          13       0.76      0.54      0.63      2820
          14       0.79      0.74      0.77      2820
          16       0.36      0.20      0.26      2820
          17       0