Step 1: Getting and Transforming the Data
Download the dataset and split it:

In [45]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
file_path = r"C:\Users\mozhdeh\Desktop\programming 4\sensor.csv"
sensor_data = pd.read_csv(file_path)

# Drop the unnamed index column if it exists
sensor_data = sensor_data.loc[:, ~sensor_data.columns.str.contains('^Unnamed')]

# Check if 'timestamp' column exists before attempting to drop it
if 'timestamp' in sensor_data.columns:
    # Drop the timestamp column
    sensor_data = sensor_data.drop(columns=['timestamp'])

# Ensure all columns are numeric
sensor_data = sensor_data.apply(pd.to_numeric, errors='coerce')

# Save the column names before imputation
columns_before_imputation = sensor_data.columns

# Check which columns have NaN values
columns_with_nan = sensor_data.columns[sensor_data.isnull().any()]

# Print columns with NaN values
print(f"Columns with NaN values: {columns_with_nan}")

# Fill NaN values using SimpleImputer on relevant columns
imputer = SimpleImputer(strategy='median')
sensor_data = pd.DataFrame(imputer.fit_transform(sensor_data), columns=columns_before_imputation)

# Check if there are still any NaN values after imputation
if sensor_data.isnull().values.any():
    raise ValueError("There are still NaN values in the features after imputation. Please check your data preprocessing steps.")




Columns with NaN values: Index(['sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04',
       'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09',
       'sensor_10', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14',
       'sensor_15', 'sensor_16', 'sensor_17', 'sensor_18', 'sensor_19',
       'sensor_20', 'sensor_21', 'sensor_22', 'sensor_23', 'sensor_24',
       'sensor_25', 'sensor_26', 'sensor_27', 'sensor_28', 'sensor_29',
       'sensor_30', 'sensor_31', 'sensor_32', 'sensor_33', 'sensor_34',
       'sensor_35', 'sensor_36', 'sensor_37', 'sensor_38', 'sensor_39',
       'sensor_40', 'sensor_41', 'sensor_42', 'sensor_43', 'sensor_44',
       'sensor_45', 'sensor_46', 'sensor_47', 'sensor_48', 'sensor_49',
       'sensor_50', 'sensor_51', 'machine_status'],
      dtype='object')




ValueError: Shape of passed values is (220320, 51), indices imply (220320, 53)

Step 2: Create the Model and the Drawer
Train the model and persist it:

In [47]:
# Split the data into features and target
X = sensor_data.drop(columns=['machine_status'])
y = sensor_data['machine_status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature columns
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

# Save the model to a file
model_filename = r"C:\Users\mozhdeh\Desktop\programming 4\sensor_model.joblib"
joblib.dump(model, model_filename)

# Save the scaler to a file for consistent preprocessing
scaler_filename = r"C:\Users\mozhdeh\Desktop\programming 4\scaler.joblib"
joblib.dump(scaler, scaler_filename)

print(f'Model and scaler saved to {model_filename} and {scaler_filename}')

# Load the model and scaler for example prediction
loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load(scaler_filename)



# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values