In [None]:
# The goal of this project is to develop a data pipeline application for anomaly detection using a machine learning model.
# The whole process includes:

# Data Loading: Load the data from Kaggle and split the dataset into training, validation, and test sets.
# Training data: April, May, and June 2018
# Validation data: July 2018
# Test data: August 2018
# Model Training: Train a machine learning model on the training data and save the model to a file.
# Data Transformation: Transform the training data.
# Anomaly Detection: Use the trained model to detect anomalies in the validation and test datasets.
# Plotting: Create and save plots of sensor data anomalies. Ensure that the plotting function is independent of global variables and returns the plot file path.
# Monitoring New Data: Create a component that listens to a directory for new files, processes them using the trained model, and performs necessary data transformations.
# Output Handling (Logging): Log the processing steps, including any errors that occur, to a log file.






import os
import logging
from dataset_loader import DatasetDownloader
from pipeline_model import ModelPipeline
from production_pipeline import ProductionPipeline

def setup_logging():
    """
    Configure application-wide logging.

    Installs a single console (stream) handler on the root logger at INFO
    level, using a "timestamp - level - message" layout, and then emits a
    confirmation message.
    """
    console_handler = logging.StreamHandler()
    log_format = '%(asctime)s - %(levelname)s - %(message)s'

    logging.basicConfig(
        level=logging.INFO,
        format=log_format,
        handlers=[console_handler],
    )
    logging.info("Logging setup complete.")

def main():
    """
    Main function to execute the data pipeline, including dataset download, model training,
    and production pipeline monitoring.

    Steps:
    1. Setup logging configuration.
    2. Define file paths and URLs for dataset and model.
    3. Download the dataset.
    4. Verify the dataset file exists.
    5. Initialize and run the model pipeline if the model file is found.
    6. Initialize and run the production pipeline if the configuration file is found.

    A failure in steps 3-5 is logged and ends the run early; a failure in
    step 6 is logged as well. Exceptions are logged together with their
    traceback. The function returns None in every case.
    """
    # Setup logging configuration
    setup_logging()

    # Defining paths and URLs
    # NOTE(review): these are machine-specific absolute paths; consider moving
    # them into configuration so the script can run on other machines.
    credentials_file = 'C:/Users/mozhdeh/Desktop/programming 4/kaggle.json'
    api_url = 'https://www.kaggle.com/api/v1/datasets/download/nphantawee/pump-sensor-data'
    destination_folder = 'C:/Users/mozhdeh/Desktop/programming 4'
    dataset_file_name = 'sensor.csv'
    dataset_path = os.path.join(destination_folder, dataset_file_name)
    model_path = os.path.join(destination_folder, 'model.pkl')
    config_path = 'application.json'

    # Creating destination folder if it doesn't exist
    os.makedirs(destination_folder, exist_ok=True)

    logging.info("Starting dataset download...")

    try:
        # Creating DatasetDownloader instance from dataset_loader module and download the dataset
        downloader = DatasetDownloader(credentials_file, api_url, destination_folder)
        downloader.download_dataset()
        logging.info("Dataset download and extraction completed.")
    except Exception as e:
        # logging.exception emits the same ERROR message and appends the
        # traceback, so the failing call can be located from the log alone.
        logging.exception("An error occurred during dataset download: %s", e)
        return

    # Checking if the dataset file exists before proceeding
    if not os.path.isfile(dataset_path):
        logging.error("Dataset file not found at %s", dataset_path)
        return

    logging.info("Dataset path set to: %s", dataset_path)

    # The model pipeline only runs when a model file is already present.
    # NOTE(review): the captured run log shows the pipeline itself training and
    # saving a model to this same path; confirm that requiring a pre-existing
    # file is intended, since a first run without model.pkl stops here.
    if not os.path.isfile(model_path):
        logging.error("Model file not found at %s", model_path)
        return

    # Initializing and using the ModelPipeline module
    try:
        model_pipeline = ModelPipeline(model_path=model_path, data_dir=destination_folder)
        logging.info("Found model file. Running model pipeline...")
        model_pipeline.run_pipeline(input_data_path=dataset_path)
        logging.info("Model pipeline execution completed.")
    except Exception as e:
        logging.exception("An error occurred during model pipeline execution: %s", e)
        return

    # The production pipeline needs its configuration file.
    if not os.path.isfile(config_path):
        logging.error("Configuration file not found at %s", config_path)
        return

    # Initialize and run the production pipeline module
    try:
        logging.info("Starting production pipeline...")
        production_pipeline = ProductionPipeline(config_path=config_path)

        # Start monitoring the input directory for new files
        production_pipeline.start_monitoring()
        logging.info("Production pipeline is now monitoring for new files.")
    except Exception as e:
        logging.exception("An error occurred while running the production pipeline: %s", e)

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


2024-07-31 02:05:52,751 - INFO - Logging setup complete.
2024-07-31 02:05:52,751 - INFO - Starting dataset download...
2024-07-31 02:06:00,832 - INFO - Dataset download and extraction completed.
2024-07-31 02:06:00,832 - INFO - Dataset path set to: C:/Users/mozhdeh/Desktop/programming 4\sensor.csv
2024-07-31 02:06:00,832 - INFO - Found model file. Running model pipeline...
2024-07-31 02:06:00,832 - INFO - Running pipeline
2024-07-31 02:06:00,832 - INFO - Loading data from C:/Users/mozhdeh/Desktop/programming 4\sensor.csv


Dataset downloaded and extracted.


2024-07-31 02:06:17,727 - INFO - Data loaded and saved: train_data.csv, val_data_july.csv, test_data_august.csv
2024-07-31 02:06:17,821 - INFO - Data split into training and validation sets.
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
2024-07-31 02:06:17,962 - INFO - Data transformation completed.
2024-07-31 02:07:23,813 - INFO - Model trained and saved to C:/Users/mozhdeh/Desktop/programming 4\model.pkl
2024-07-31 02:07:27,243 - INFO - Validation Accuracy: 0.93
2024-07-31 02:07:27,243 - INFO - Classification Report:
              precision    recall  f1-score   support

      BROKEN       0.00      0.00      0.00         1
      NORMAL       0.93      1.00      0.97     36700
  RECOVERING       0.00      0.00      0.00      2611

    accuracy                           0.93     39312
   macro avg       0.31      0.33      0.32     39312
weighted avg       0.87      0.93    

Validation Accuracy: 0.93
Classification Report:
               precision    recall  f1-score   support

      BROKEN       0.00      0.00      0.00         1
      NORMAL       0.93      1.00      0.97     36700
  RECOVERING       0.00      0.00      0.00      2611

    accuracy                           0.93     39312
   macro avg       0.31      0.33      0.32     39312
weighted avg       0.87      0.93      0.90     39312



2024-07-31 02:07:29,468 - INFO - Anomaly plot saved to C:/Users/mozhdeh/Desktop/programming 4\anomaly_plot_sensor_04.png
2024-07-31 02:07:31,987 - INFO - Anomaly plot saved to C:/Users/mozhdeh/Desktop/programming 4\anomaly_plot_sensor_51.png
2024-07-31 02:07:31,990 - INFO - Processing new data from C:/Users/mozhdeh/Desktop/programming 4\test_data_august.csv


Anomaly plot for sensor_04 saved to C:/Users/mozhdeh/Desktop/programming 4\anomaly_plot_sensor_04.png
Anomaly plot for sensor_51 saved to C:/Users/mozhdeh/Desktop/programming 4\anomaly_plot_sensor_51.png


2024-07-31 02:07:33,744 - INFO - Model pipeline execution completed.
2024-07-31 02:07:33,744 - INFO - Starting production pipeline...
2024-07-31 02:07:33,744 - INFO - Configuration loaded.
2024-07-31 02:07:33,744 - INFO - Logging setup complete.
2024-07-31 02:07:33,744 - INFO - Started listening for new files.


Predictions for new data:
 ['NORMAL' 'NORMAL' 'NORMAL' ... 'NORMAL' 'NORMAL' 'NORMAL']


In [1]:
# the output shows:
#The script has successfully downloaded and extracted the dataset from the specified source.
#The path to the dataset is confirmed, and it's ready for further processing.
#The script has found the model file and started the model pipeline. It loaded the data from the CSV file,
#processed it, and saved it into training, validation, and test sets. The data splitting step is complete,
#so the data is ready for model training.
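#The model has achieved a validation accuracy of 0.93. This is not perfect classification: the classification report shows
#precision and recall of 0.00 for the BROKEN and RECOVERING classes, so the model predicts NORMAL for essentially every sample.
#The high accuracy (0.93) therefore mostly reflects the class imbalance (36700 of 39312 validation samples are NORMAL) rather than good anomaly detection.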
#The system has successfully generated and saved anomaly plots for the sensors 'sensor_04' and 'sensor_51'.
#Each plot visualizes anomalies detected in the sensor data and is saved as an image file.
#The system has started processing new data from the file test_data_august.csv.
#This data will be evaluated by the trained model to make predictions.
#The entire model pipeline process (including data preparation, training, evaluation, and anomaly plotting) has been completed successfully.
#The production pipeline has been initiated. This phase is responsible for applying the trained model to new data in a real-time or batch processing manner.
#The production pipeline has successfully loaded its configuration settings, which might include parameters for monitoring directories, model paths, and other operational details.
#Logging has been set up for the production pipeline to record operations and any issues that occur during execution.
#The production pipeline is now actively monitoring the specified directory for new files. When new data files arrive, they will be processed using the trained model.
#The model has made predictions on the new data from test_data_august.csv. All predictions are classified as 'NORMAL', indicating that the model has identified all instances in the new data as belonging to the 'NORMAL' class.