# In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import joblib
import yaml

class DataPreparation:
    """Load sensor readings named in a YAML config, preprocess them, and train a classifier.

    Typical use is ``DataPreparation(cfg).preprocess_data()`` followed by
    ``train_model()``.  The column named by ``TARGET_COLUMN`` is treated as
    the class label: it is carried through preprocessing unchanged and used
    as ``y`` when training.
    """

    # Label column used as the classification target by ``train_model``.
    TARGET_COLUMN = 'machine_status'

    def __init__(self, config_file):
        """Read *config_file* (YAML) and load the CSV named by its ``sensor_file`` key.

        Args:
            config_file: Path to a YAML file containing a ``sensor_file`` entry.

        Raises:
            ValueError: If the config file is not valid YAML.
            KeyError: If the config does not define ``sensor_file``.
        """
        with open(config_file, 'r') as f:
            try:
                config = yaml.safe_load(f)
            except yaml.YAMLError as exc:
                # Fail loudly: swallowing the error would leave the instance
                # without ``sensor_data`` and break every later call with an
                # unrelated AttributeError.
                raise ValueError(
                    f"Could not parse config file {config_file!r}: {exc}"
                ) from exc

        if not isinstance(config, dict) or 'sensor_file' not in config:
            raise KeyError(f"Config file {config_file!r} must define 'sensor_file'")

        self.file_path = config['sensor_file']
        self.sensor_data = pd.read_csv(self.file_path)

        # Populated by preprocess_data() so the same fitted transformation
        # can be re-applied to new data later.
        self.feature_columns = None
        self.imputer = None
        self.scaler = None

    def preprocess_data(self, columns_to_drop=None):
        """Coerce, impute and standardize the feature columns of ``sensor_data``.

        Every column except ``TARGET_COLUMN`` is converted to numeric
        (unparseable values become NaN), median-imputed and standardized.
        The target column, when present, is kept as-is in its original
        position: scaling it would turn class labels into continuous values
        that a classifier cannot be fitted on.

        Args:
            columns_to_drop: Optional iterable of column names to remove
                before preprocessing.  Defaults to dropping nothing.

        Returns:
            A ``(sensor_data, columns)`` tuple: the preprocessed DataFrame
            (also stored on ``self.sensor_data``) and its column index.

        Raises:
            ValueError: If there is no data, no feature column, or a feature
                column has no numeric value at all (the median imputer cannot
                fill such a column).
        """
        drop_list = [] if columns_to_drop is None else list(columns_to_drop)
        data = self.sensor_data.drop(columns=drop_list)

        if data.empty:
            raise ValueError("Sensor data has no rows or no columns to preprocess")

        columns = data.columns
        target = self.TARGET_COLUMN
        is_feature = columns != target

        # Only the features are forced to numeric; the labels stay untouched.
        features = data.loc[:, is_feature].apply(pd.to_numeric, errors='coerce')
        if features.shape[1] == 0:
            raise ValueError("Sensor data contains no feature columns to preprocess")

        # SimpleImputer drops columns without any observed value by default,
        # which would misalign values and column names; report them by name.
        all_missing = features.columns[features.isna().all().to_numpy()]
        if len(all_missing) > 0:
            raise ValueError(
                "Columns contain no numeric values and cannot be imputed: "
                f"{list(all_missing)}"
            )

        imputer = SimpleImputer(strategy='median')
        scaler = StandardScaler()
        feature_values = scaler.fit_transform(imputer.fit_transform(features.values))

        # Defensive check kept from the original implementation.
        if feature_values.shape[1] != features.shape[1]:
            raise ValueError(
                f"Number of columns after preprocessing ({feature_values.shape[1]}) "
                f"does not match original number of columns ({features.shape[1]})"
            )

        prepared = pd.DataFrame(feature_values, columns=features.columns)
        if not is_feature.all():
            # Features keep their relative order, so re-inserting the target
            # at its original position restores the original column order.
            target_position = int((~is_feature).argmax())
            prepared.insert(target_position, target, data[target].to_numpy())

        self.sensor_data = prepared
        self.feature_columns = list(features.columns)
        self.imputer = imputer
        self.scaler = scaler

        return self.sensor_data, columns

    def train_model(self, model_filename="trained_model.joblib"):
        """Fit a RandomForestClassifier on ``sensor_data`` and save it with joblib.

        Args:
            model_filename: Where to write the fitted model.  Defaults to
                ``"trained_model.joblib"``.

        Returns:
            The filename the model was written to.

        Raises:
            KeyError: If ``sensor_data`` has no ``TARGET_COLUMN`` column.
            ValueError: If the target column contains missing values.
        """
        target = self.TARGET_COLUMN
        if target not in self.sensor_data.columns:
            raise KeyError(f"Target column {target!r} not found in sensor data")

        y = self.sensor_data[target]
        if y.isna().any():
            raise ValueError(f"Target column {target!r} contains missing values")
        X = self.sensor_data.drop(columns=[target])

        # Fixed seed keeps training reproducible between runs.
        model = RandomForestClassifier(random_state=42)
        model.fit(X, y)

        joblib.dump(model, model_filename)

        return model_filename
