In [67]:
import os
import requests
import json
from zipfile import ZipFile
import logging
import  numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler



In [68]:
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

def download_dataset(api_url, destination_folder):
    try:
        logging.info("Starting dataset download...")
        
        # Load Kaggle API credentials
        credentials_file = 'C:/Users/mozhdeh/Desktop/programming 4/kaggle.json'
        logging.debug(f"Reading credentials from: {credentials_file}")
        
        with open(credentials_file) as f:
            kaggle_creds = json.load(f)
        
        # Access credentials correctly
        username = kaggle_creds['username']
        api_key = kaggle_creds['key']
        headers = {'Authorization': f'Kaggle {username}:{api_key}'}
        logging.debug(f"Headers: {headers}")
        
        # Send a GET request to download the dataset
        logging.debug(f"Sending GET request to: {api_url}")
        response = requests.get(api_url, headers=headers, stream=True)
        response.raise_for_status()  # Check if the request was successful
        
        # Save the zip file
        zip_path = os.path.join(destination_folder, 'pump-sensor-data.zip')
        logging.debug(f"Saving zip file to: {zip_path}")
        
        with open(zip_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        
        # Extract the zip file
        logging.info("Extracting dataset...")
        
        with ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
        
        # Clean up by removing the zip file
        os.remove(zip_path)
        
        logging.info("Dataset download and extraction completed.")
        
    except Exception as e:
        logging.error(f"Error: {e}")

# Specify the destination folder
destination_folder = r"C:\Users\mozhdeh\Desktop\programming 4"
os.makedirs(destination_folder, exist_ok=True)

# Call the function to download and extract the dataset
download_dataset('https://www.kaggle.com/api/v1/datasets/download/nphantawee/pump-sensor-data', destination_folder)


2024-07-23 14:24:55,324 - INFO - Starting dataset download...
2024-07-23 14:24:55,324 - DEBUG - Reading credentials from: C:/Users/mozhdeh/Desktop/programming 4/kaggle.json
2024-07-23 14:24:55,332 - DEBUG - Headers: {'Authorization': 'Kaggle fatemeh62:ab5f9d2ab1026d86358e745ea6ac8cf5'}
2024-07-23 14:24:55,333 - DEBUG - Sending GET request to: https://www.kaggle.com/api/v1/datasets/download/nphantawee/pump-sensor-data
2024-07-23 14:24:55,338 - DEBUG - Starting new HTTPS connection (1): www.kaggle.com:443
2024-07-23 14:24:56,160 - DEBUG - https://www.kaggle.com:443 "GET /api/v1/datasets/download/nphantawee/pump-sensor-data HTTP/1.1" 302 0
2024-07-23 14:24:56,160 - DEBUG - Starting new HTTPS connection (1): storage.googleapis.com:443
2024-07-23 14:24:57,070 - DEBUG - https://storage.googleapis.com:443 "GET /kaggle-data-sets/131138/312855/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240723%2Fauto%2Fstorage

In [69]:
sensor_data = pd.read_csv("C:/Users/mozhdeh/Desktop/programming 4/sensor.csv")
sensor_data

Unnamed: 0.1,Unnamed: 0,timestamp,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,...,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,machine_status
0,0,2018-04-01 00:00:00,2.465394,47.09201,53.211800,46.310760,634.375000,76.45975,13.41146,16.13136,...,41.92708,39.641200,65.68287,50.92593,38.194440,157.9861,67.70834,243.0556,201.3889,NORMAL
1,1,2018-04-01 00:01:00,2.465394,47.09201,53.211800,46.310760,634.375000,76.45975,13.41146,16.13136,...,41.92708,39.641200,65.68287,50.92593,38.194440,157.9861,67.70834,243.0556,201.3889,NORMAL
2,2,2018-04-01 00:02:00,2.444734,47.35243,53.211800,46.397570,638.888900,73.54598,13.32465,16.03733,...,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,241.3194,203.7037,NORMAL
3,3,2018-04-01 00:03:00,2.460474,47.09201,53.168400,46.397568,628.125000,76.98898,13.31742,16.24711,...,40.88541,39.062500,64.81481,51.21528,38.194440,155.9606,66.84028,240.4514,203.1250,NORMAL
4,4,2018-04-01 00:04:00,2.445718,47.13541,53.211800,46.397568,636.458300,76.58897,13.35359,16.21094,...,41.40625,38.773150,65.10416,51.79398,38.773150,158.2755,66.55093,242.1875,201.3889,NORMAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220315,220315,2018-08-31 23:55:00,2.407350,47.69965,50.520830,43.142361,634.722229,64.59095,15.11863,16.65220,...,38.28125,68.287030,52.37268,48.32176,41.087960,212.3843,153.64580,,231.1921,NORMAL
220316,220316,2018-08-31 23:56:00,2.400463,47.69965,50.564240,43.142361,630.902771,65.83363,15.15480,16.70284,...,38.28125,66.840280,50.63657,48.03241,40.798610,213.8310,156.25000,,231.1921,NORMAL
220317,220317,2018-08-31 23:57:00,2.396528,47.69965,50.520830,43.142361,625.925903,67.29445,15.08970,16.70284,...,39.06250,65.393520,48.90046,48.03241,40.798610,217.3032,155.38190,,232.0602,NORMAL
220318,220318,2018-08-31 23:58:00,2.406366,47.69965,50.520832,43.142361,635.648100,65.09175,15.11863,16.56539,...,40.62500,64.236110,47.74306,48.32176,40.509258,222.5116,153.93520,,234.0856,NORMAL


In [70]:
sensor_data.shape

(220320, 55)

In [71]:
sensor_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220320 entries, 0 to 220319
Data columns (total 55 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unnamed: 0      220320 non-null  int64  
 1   timestamp       220320 non-null  object 
 2   sensor_00       210112 non-null  float64
 3   sensor_01       219951 non-null  float64
 4   sensor_02       220301 non-null  float64
 5   sensor_03       220301 non-null  float64
 6   sensor_04       220301 non-null  float64
 7   sensor_05       220301 non-null  float64
 8   sensor_06       215522 non-null  float64
 9   sensor_07       214869 non-null  float64
 10  sensor_08       215213 non-null  float64
 11  sensor_09       215725 non-null  float64
 12  sensor_10       220301 non-null  float64
 13  sensor_11       220301 non-null  float64
 14  sensor_12       220301 non-null  float64
 15  sensor_13       220301 non-null  float64
 16  sensor_14       220299 non-null  float64
 17  sensor_15 

In [72]:
sensor_data.index = pd.to_datetime(sensor_data.index)

# DataFrame is named sensor_data and currently has this index
# Convert the index to datetime
sensor_data.index = pd.to_datetime(sensor_data.index, unit='ns')

# Now,checking the first few rows to verify
print(sensor_data.head())


                               Unnamed: 0            timestamp  sensor_00  \
1970-01-01 00:00:00.000000000           0  2018-04-01 00:00:00   2.465394   
1970-01-01 00:00:00.000000001           1  2018-04-01 00:01:00   2.465394   
1970-01-01 00:00:00.000000002           2  2018-04-01 00:02:00   2.444734   
1970-01-01 00:00:00.000000003           3  2018-04-01 00:03:00   2.460474   
1970-01-01 00:00:00.000000004           4  2018-04-01 00:04:00   2.445718   

                               sensor_01  sensor_02  sensor_03  sensor_04  \
1970-01-01 00:00:00.000000000   47.09201    53.2118  46.310760   634.3750   
1970-01-01 00:00:00.000000001   47.09201    53.2118  46.310760   634.3750   
1970-01-01 00:00:00.000000002   47.35243    53.2118  46.397570   638.8889   
1970-01-01 00:00:00.000000003   47.09201    53.1684  46.397568   628.1250   
1970-01-01 00:00:00.000000004   47.13541    53.2118  46.397568   636.4583   

                               sensor_05  sensor_06  sensor_07  ...  \
197

In [73]:
# Inspect the first few rows of the 'timestamp' column
print(sensor_data['timestamp'].head())

# Check for any duplicated timestamps
duplicate_timestamps = sensor_data['timestamp'].duplicated().sum()
print(f"Number of duplicated timestamps: {duplicate_timestamps}")

# Show some duplicated timestamps if any
if duplicate_timestamps > 0:
    print(sensor_data[sensor_data['timestamp'].duplicated(keep=False)].head())


1970-01-01 00:00:00.000000000    2018-04-01 00:00:00
1970-01-01 00:00:00.000000001    2018-04-01 00:01:00
1970-01-01 00:00:00.000000002    2018-04-01 00:02:00
1970-01-01 00:00:00.000000003    2018-04-01 00:03:00
1970-01-01 00:00:00.000000004    2018-04-01 00:04:00
Name: timestamp, dtype: object
Number of duplicated timestamps: 0


In [74]:
# Remove duplicated columns
sensor_data = sensor_data.loc[:, ~sensor_data.columns.duplicated()]

# Verify the columns to ensure duplication is resolved
print(sensor_data.columns)


Index(['Unnamed: 0', 'timestamp', 'sensor_00', 'sensor_01', 'sensor_02',
       'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07',
       'sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12',
       'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16', 'sensor_17',
       'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21', 'sensor_22',
       'sensor_23', 'sensor_24', 'sensor_25', 'sensor_26', 'sensor_27',
       'sensor_28', 'sensor_29', 'sensor_30', 'sensor_31', 'sensor_32',
       'sensor_33', 'sensor_34', 'sensor_35', 'sensor_36', 'sensor_37',
       'sensor_38', 'sensor_39', 'sensor_40', 'sensor_41', 'sensor_42',
       'sensor_43', 'sensor_44', 'sensor_45', 'sensor_46', 'sensor_47',
       'sensor_48', 'sensor_49', 'sensor_50', 'sensor_51', 'machine_status'],
      dtype='object')


In [75]:
# Ensure 'timestamp' column is properly converted to datetime
sensor_data.loc[:, 'timestamp'] = pd.to_datetime(sensor_data['timestamp'], errors='coerce')

# Check for any NaT values (which are invalid timestamps)
print(f"Number of NaT values: {sensor_data['timestamp'].isna().sum()}")

# Drop rows with NaT values in 'timestamp'
sensor_data_cleaned = sensor_data.dropna(subset=['timestamp'])

# Set 'timestamp' as the index
sensor_data_cleaned.set_index('timestamp', inplace=True)

# Drop any remaining unnecessary columns
sensor_data_cleaned.drop(columns=['level_0', 'index'], errors='ignore', inplace=True)

# Verify the result
print("Original DataFrame shape:", sensor_data.shape)
print("Cleaned DataFrame shape:", sensor_data_cleaned.shape)
print(sensor_data_cleaned.head())
# Saving cleaned data to a CSV file
sensor_data_cleaned.to_csv('C:/Users/mozhdeh/Desktop/programming 4/cleaned_sensor_data.csv')


Number of NaT values: 0
Original DataFrame shape: (220320, 55)
Cleaned DataFrame shape: (220320, 54)
                     Unnamed: 0  sensor_00  sensor_01  sensor_02  sensor_03  \
timestamp                                                                     
2018-04-01 00:00:00           0   2.465394   47.09201    53.2118  46.310760   
2018-04-01 00:01:00           1   2.465394   47.09201    53.2118  46.310760   
2018-04-01 00:02:00           2   2.444734   47.35243    53.2118  46.397570   
2018-04-01 00:03:00           3   2.460474   47.09201    53.1684  46.397568   
2018-04-01 00:04:00           4   2.445718   47.13541    53.2118  46.397568   

                     sensor_04  sensor_05  sensor_06  sensor_07  sensor_08  \
timestamp                                                                    
2018-04-01 00:00:00   634.3750   76.45975   13.41146   16.13136   15.56713   
2018-04-01 00:01:00   634.3750   76.45975   13.41146   16.13136   15.56713   
2018-04-01 00:02:00   638.8889   

In [76]:


# Path to the CSV file
file_path = 'C:/Users/mozhdeh/Desktop/programming 4/cleaned_sensor_data.csv'

# Read the CSV file into a DataFrame
sensor_data_read = pd.read_csv(file_path, parse_dates=['timestamp'], index_col='timestamp')

# Print the first few rows of the DataFrame to verify
print(sensor_data_read.head())


                     Unnamed: 0  sensor_00  sensor_01  sensor_02  sensor_03  \
timestamp                                                                     
2018-04-01 00:00:00           0   2.465394   47.09201    53.2118  46.310760   
2018-04-01 00:01:00           1   2.465394   47.09201    53.2118  46.310760   
2018-04-01 00:02:00           2   2.444734   47.35243    53.2118  46.397570   
2018-04-01 00:03:00           3   2.460474   47.09201    53.1684  46.397568   
2018-04-01 00:04:00           4   2.445718   47.13541    53.2118  46.397568   

                     sensor_04  sensor_05  sensor_06  sensor_07  sensor_08  \
timestamp                                                                    
2018-04-01 00:00:00   634.3750   76.45975   13.41146   16.13136   15.56713   
2018-04-01 00:01:00   634.3750   76.45975   13.41146   16.13136   15.56713   
2018-04-01 00:02:00   638.8889   73.54598   13.32465   16.03733   15.61777   
2018-04-01 00:03:00   628.1250   76.98898   13.31742   1

In [79]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest


# Load the dataset
sensor_data = pd.read_csv('C:/Users/mozhdeh/Desktop/programming 4/cleaned_sensor_data.csv')

# Convert the 'timestamp' column to datetime and set it as the index
sensor_data['timestamp'] = pd.to_datetime(sensor_data['timestamp'])
sensor_data.set_index('timestamp', inplace=True)

# Split the dataset into three parts
train_data = sensor_data['2018-04-01':'2018-06-30']
test_data_july = sensor_data['2018-07-01':'2018-07-31']
test_data_august = sensor_data['2018-08-01':'2018-08-31']

# Handle missing or invalid values in the training data
X_train = train_data.drop(columns=['machine_status'])
y_train = train_data['machine_status']

# Check for NaNs and infinite values
print("Before cleaning:")
print(f"NaNs: {X_train.isnull().sum().sum()}")
print(f"Infinite values: {np.isinf(X_train.values).sum()}")

# Replace infinite values with NaNs
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaNs with the mean of the respective columns
X_train.fillna(X_train.mean(), inplace=True)

# Verify that there are no remaining NaNs or infinite values
print("After initial cleaning:")
print(f"NaNs: {X_train.isnull().sum().sum()}")
print(f"Infinite values: {np.isinf(X_train.values).sum()}")

# Drop columns with more than 50% NaNs
threshold = len(X_train) * 0.5
X_train.dropna(axis=1, thresh=threshold, inplace=True)

# Drop rows with any remaining NaNs
X_train.dropna(axis=0, inplace=True)

# Verify that there are no remaining NaNs or infinite values
print("After rigorous cleaning:")
print(f"NaNs: {X_train.isnull().sum().sum()}")
print(f"Infinite values: {np.isinf(X_train.values).sum()}")

# Ensure y_train matches X_train after dropping rows
y_train = y_train[X_train.index]

# Create a pipeline with a scaler and a classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])

# Train the model
pipeline.fit(X_train, y_train)





# Assuming you have a trained model
pipeline= IsolationForest(contamination=0.1)
# Persist the model to the local file system
joblib.dump(pipeline, 'C:/Users/mozhdeh/Desktop/programming 4/model.pkl')

# # Define the plot_sensor_anomalies function
# def plot_sensor_anomalies(sensor, name, df):
#     plt.figure(figsize=(15, 5))
#     plt.plot(df.index, df[sensor], label=sensor)
#     anomalies = df[df['machine_status'] == 'BROKEN']
#     plt.scatter(anomalies.index, anomalies[sensor], color='red', label='Anomaly')
#     plt.title(f'Sensor {name} Anomalies')
#     plt.xlabel('Time')
#     plt.ylabel(f'{name} Readings')
#     plt.legend()
#     plt.grid(True)
#     plt.tight_layout()
#     return plt

# # Example usage of the plot_sensor_anomalies function
# sensor = 'sensor_00'
# name = 'Sensor 00'
# plt = plot_sensor_anomalies(sensor, name, train_data)
# plt.show()



Before cleaning:
NaNs: 158763
Infinite values: 0
After initial cleaning:
NaNs: 131040
Infinite values: 0
After rigorous cleaning:
NaNs: 0
Infinite values: 0


['C:/Users/mozhdeh/Desktop/programming 4/model.pkl']

In [84]:
# test_import.py

try:
    from utils import plot_sensor_anomalies, save_plot
    print("Import successful")
except ImportError as e:
    print(f"ImportError: {e}")

# main.py

import json
import pandas as pd
from utils import plot_sensor_anomalies, save_plot
from listener import Listener
from predictor import Predictor

def main():
    # Load configuration
    with open('application.json', 'r') as file:
        config = json.load(file)
    
    # Initialize Listener
    listener = Listener(config)
    
    # Initialize Predictor
    predictor = Predictor()
    
    # Example usage of Predictor class
    # Load some data to predict on
    data = pd.read_csv("C:/Users/mozhdeh/Desktop/programming 4/test_data_august.csv")  
    
    # Transform data using the Predictor
    transformed_data = predictor.transform_data(data)
    
    # Make predictions using the Predictor
    predictions = predictor.predict(transformed_data)
    
    # Combine the predictions with the original data
    data_with_predictions = data.merge(predictions, on='timestamp', how='left')
    
    # Generate and save plots for each sensor in the configuration
    for sensor in config['sensor_names']:
        plt = plot_sensor_anomalies(sensor, f'Sensor {sensor}', data_with_predictions)
        save_plot(plt, f'{config["img_directory"]}/{sensor}_anomalies.png')
    
    # Start listening for new data files
    listener.listen()

if __name__ == "__main__":
    main()


ImportError: cannot import name 'save_plot' from 'utils' (C:\Users\mozhdeh\utils.py)


ImportError: cannot import name 'save_plot' from 'utils' (C:\Users\mozhdeh\utils.py)