In [83]:
import os
import requests
import json
from zipfile import ZipFile
import logging
import  numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler



In [None]:
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

def download_dataset(api_url, destination_folder):
    try:
        logging.info("Starting dataset download...")
        
        # Load Kaggle API credentials
        credentials_file = 'C:/Users/mozhdeh/Desktop/programming 4/kaggle.json'
        logging.debug(f"Reading credentials from: {credentials_file}")
        
        with open(credentials_file) as f:
            kaggle_creds = json.load(f)
        
        # Access credentials correctly
        username = kaggle_creds['username']
        api_key = kaggle_creds['key']
        headers = {'Authorization': f'Kaggle {username}:{api_key}'}
        logging.debug(f"Headers: {headers}")
        
        # Send a GET request to download the dataset
        logging.debug(f"Sending GET request to: {api_url}")
        response = requests.get(api_url, headers=headers, stream=True)
        response.raise_for_status()  # Check if the request was successful
        
        # Save the zip file
        zip_path = os.path.join(destination_folder, 'pump-sensor-data.zip')
        logging.debug(f"Saving zip file to: {zip_path}")
        
        with open(zip_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        
        # Extract the zip file
        logging.info("Extracting dataset...")
        
        with ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
        
        # Clean up by removing the zip file
        os.remove(zip_path)
        
        logging.info("Dataset download and extraction completed.")
        
    except Exception as e:
        logging.error(f"Error: {e}")

# Specify the destination folder
destination_folder = r"C:\Users\mozhdeh\Desktop\programming 4"
os.makedirs(destination_folder, exist_ok=True)

# Call the function to download and extract the dataset
download_dataset('https://www.kaggle.com/api/v1/datasets/download/nphantawee/pump-sensor-data', destination_folder)


In [11]:
sensor_data = pd.read_csv("C:/Users/mozhdeh/Desktop/programming 4/sensor.csv")
sensor_data

Unnamed: 0.1,Unnamed: 0,timestamp,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,...,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,machine_status
0,0,2018-04-01 00:00:00,2.465394,47.09201,53.211800,46.310760,634.375000,76.45975,13.41146,16.13136,...,41.92708,39.641200,65.68287,50.92593,38.194440,157.9861,67.70834,243.0556,201.3889,NORMAL
1,1,2018-04-01 00:01:00,2.465394,47.09201,53.211800,46.310760,634.375000,76.45975,13.41146,16.13136,...,41.92708,39.641200,65.68287,50.92593,38.194440,157.9861,67.70834,243.0556,201.3889,NORMAL
2,2,2018-04-01 00:02:00,2.444734,47.35243,53.211800,46.397570,638.888900,73.54598,13.32465,16.03733,...,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,241.3194,203.7037,NORMAL
3,3,2018-04-01 00:03:00,2.460474,47.09201,53.168400,46.397568,628.125000,76.98898,13.31742,16.24711,...,40.88541,39.062500,64.81481,51.21528,38.194440,155.9606,66.84028,240.4514,203.1250,NORMAL
4,4,2018-04-01 00:04:00,2.445718,47.13541,53.211800,46.397568,636.458300,76.58897,13.35359,16.21094,...,41.40625,38.773150,65.10416,51.79398,38.773150,158.2755,66.55093,242.1875,201.3889,NORMAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220315,220315,2018-08-31 23:55:00,2.407350,47.69965,50.520830,43.142361,634.722229,64.59095,15.11863,16.65220,...,38.28125,68.287030,52.37268,48.32176,41.087960,212.3843,153.64580,,231.1921,NORMAL
220316,220316,2018-08-31 23:56:00,2.400463,47.69965,50.564240,43.142361,630.902771,65.83363,15.15480,16.70284,...,38.28125,66.840280,50.63657,48.03241,40.798610,213.8310,156.25000,,231.1921,NORMAL
220317,220317,2018-08-31 23:57:00,2.396528,47.69965,50.520830,43.142361,625.925903,67.29445,15.08970,16.70284,...,39.06250,65.393520,48.90046,48.03241,40.798610,217.3032,155.38190,,232.0602,NORMAL
220318,220318,2018-08-31 23:58:00,2.406366,47.69965,50.520832,43.142361,635.648100,65.09175,15.11863,16.56539,...,40.62500,64.236110,47.74306,48.32176,40.509258,222.5116,153.93520,,234.0856,NORMAL


In [66]:
sensor_data.shape

(220320, 56)

In [67]:
sensor_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 220320 entries, 1970-01-01 00:00:00 to 1970-01-01 00:00:00.000220319
Data columns (total 56 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   level_0         220320 non-null  int64         
 1   timestamp       220320 non-null  datetime64[ns]
 2   index           220320 non-null  int64         
 3   timestamp       220320 non-null  datetime64[ns]
 4   sensor_00       210112 non-null  float64       
 5   sensor_01       219951 non-null  float64       
 6   sensor_02       220301 non-null  float64       
 7   sensor_03       220301 non-null  float64       
 8   sensor_04       220301 non-null  float64       
 9   sensor_05       220301 non-null  float64       
 10  sensor_06       215522 non-null  float64       
 11  sensor_07       214869 non-null  float64       
 12  sensor_08       215213 non-null  float64       
 13  sensor_09       215725 non-null  float64     

In [65]:
sensor_data.index = pd.to_datetime(sensor_data.index)

# DataFrame is named sensor_data and currently has this index
# Convert the index to datetime
sensor_data.index = pd.to_datetime(sensor_data.index, unit='ns')

# Now,checking the first few rows to verify
print(sensor_data.head())


                               level_0                     timestamp  index  \
1970-01-01 00:00:00.000000000        0 1970-01-01 00:00:00.000000000      0   
1970-01-01 00:00:00.000000001        1 1970-01-01 00:00:00.000000001      1   
1970-01-01 00:00:00.000000002        2 1970-01-01 00:00:00.000000002      2   
1970-01-01 00:00:00.000000003        3 1970-01-01 00:00:00.000000003      3   
1970-01-01 00:00:00.000000004        4 1970-01-01 00:00:00.000000004      4   

                                                  timestamp  sensor_00  \
1970-01-01 00:00:00.000000000 1970-01-01 00:00:00.000000000   2.465394   
1970-01-01 00:00:00.000000001 1970-01-01 00:00:00.000000001   2.465394   
1970-01-01 00:00:00.000000002 1970-01-01 00:00:00.000000002   2.444734   
1970-01-01 00:00:00.000000003 1970-01-01 00:00:00.000000003   2.460474   
1970-01-01 00:00:00.000000004 1970-01-01 00:00:00.000000004   2.445718   

                               sensor_01  sensor_02  sensor_03  sensor_04  \
197

In [78]:
# Inspect the first few rows of the 'timestamp' column
print(sensor_data['timestamp'].head())

# Check for any duplicated timestamps
duplicate_timestamps = sensor_data['timestamp'].duplicated().sum()
print(f"Number of duplicated timestamps: {duplicate_timestamps}")

# Show some duplicated timestamps if any
if duplicate_timestamps > 0:
    print(sensor_data[sensor_data['timestamp'].duplicated(keep=False)].head())


1970-01-01 00:00:00.000000000   1970-01-01 00:00:00.000000000
1970-01-01 00:00:00.000000001   1970-01-01 00:00:00.000000001
1970-01-01 00:00:00.000000002   1970-01-01 00:00:00.000000002
1970-01-01 00:00:00.000000003   1970-01-01 00:00:00.000000003
1970-01-01 00:00:00.000000004   1970-01-01 00:00:00.000000004
Name: timestamp, dtype: datetime64[ns]
Number of duplicated timestamps: 0


In [79]:
# Remove duplicated columns
sensor_data = sensor_data.loc[:, ~sensor_data.columns.duplicated()]

# Verify the columns to ensure duplication is resolved
print(sensor_data.columns)


Index(['level_0', 'timestamp', 'index', 'sensor_00', 'sensor_01', 'sensor_02',
       'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07',
       'sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12',
       'sensor_13', 'sensor_14', 'sensor_16', 'sensor_17', 'sensor_18',
       'sensor_19', 'sensor_20', 'sensor_21', 'sensor_22', 'sensor_23',
       'sensor_24', 'sensor_25', 'sensor_26', 'sensor_27', 'sensor_28',
       'sensor_29', 'sensor_30', 'sensor_31', 'sensor_32', 'sensor_33',
       'sensor_34', 'sensor_35', 'sensor_36', 'sensor_37', 'sensor_38',
       'sensor_39', 'sensor_40', 'sensor_41', 'sensor_42', 'sensor_43',
       'sensor_44', 'sensor_45', 'sensor_46', 'sensor_47', 'sensor_48',
       'sensor_49', 'sensor_50', 'sensor_51', 'machine_status'],
      dtype='object')


In [106]:
# Ensure 'timestamp' column is properly converted to datetime
sensor_data.loc[:, 'timestamp'] = pd.to_datetime(sensor_data['timestamp'], errors='coerce')

# Check for any NaT values (which are invalid timestamps)
print(f"Number of NaT values: {sensor_data['timestamp'].isna().sum()}")

# Drop rows with NaT values in 'timestamp'
sensor_data_cleaned = sensor_data.dropna(subset=['timestamp'])

# Set 'timestamp' as the index
sensor_data_cleaned.set_index('timestamp', inplace=True)

# Drop any remaining unnecessary columns
sensor_data_cleaned.drop(columns=['level_0', 'index'], errors='ignore', inplace=True)

# Verify the result
print("Original DataFrame shape:", sensor_data.shape)
print("Cleaned DataFrame shape:", sensor_data_cleaned.shape)
print(sensor_data_cleaned.head())
# Saving cleaned data to a CSV file
sensor_data_cleaned.to_csv('C:/Users/mozhdeh/Desktop/programming 4/cleaned_sensor_data.csv')


Number of NaT values: 0
Original DataFrame shape: (220320, 55)
Cleaned DataFrame shape: (220320, 52)
                               sensor_00  sensor_01  sensor_02  sensor_03  \
timestamp                                                                   
1970-01-01 00:00:00.000000000   2.465394   47.09201    53.2118  46.310760   
1970-01-01 00:00:00.000000001   2.465394   47.09201    53.2118  46.310760   
1970-01-01 00:00:00.000000002   2.444734   47.35243    53.2118  46.397570   
1970-01-01 00:00:00.000000003   2.460474   47.09201    53.1684  46.397568   
1970-01-01 00:00:00.000000004   2.445718   47.13541    53.2118  46.397568   

                               sensor_04  sensor_05  sensor_06  sensor_07  \
timestamp                                                                   
1970-01-01 00:00:00.000000000   634.3750   76.45975   13.41146   16.13136   
1970-01-01 00:00:00.000000001   634.3750   76.45975   13.41146   16.13136   
1970-01-01 00:00:00.000000002   638.8889   73.54598

In [107]:


# Path to the CSV file
file_path = 'C:/Users/mozhdeh/Desktop/programming 4/cleaned_sensor_data.csv'

# Read the CSV file into a DataFrame
sensor_data_read = pd.read_csv(file_path, parse_dates=['timestamp'], index_col='timestamp')

# Print the first few rows of the DataFrame to verify
print(sensor_data_read.head())


                               sensor_00  sensor_01  sensor_02  sensor_03  \
timestamp                                                                   
1970-01-01 00:00:00.000000000   2.465394   47.09201    53.2118  46.310760   
1970-01-01 00:00:00.000000001   2.465394   47.09201    53.2118  46.310760   
1970-01-01 00:00:00.000000002   2.444734   47.35243    53.2118  46.397570   
1970-01-01 00:00:00.000000003   2.460474   47.09201    53.1684  46.397568   
1970-01-01 00:00:00.000000004   2.445718   47.13541    53.2118  46.397568   

                               sensor_04  sensor_05  sensor_06  sensor_07  \
timestamp                                                                   
1970-01-01 00:00:00.000000000   634.3750   76.45975   13.41146   16.13136   
1970-01-01 00:00:00.000000001   634.3750   76.45975   13.41146   16.13136   
1970-01-01 00:00:00.000000002   638.8889   73.54598   13.32465   16.03733   
1970-01-01 00:00:00.000000003   628.1250   76.98898   13.31742   16.24711  

In [127]:
import numpy as np

# Define the time range and frequency
time_range = pd.date_range(start='1970-01-01', periods=220320, freq='U')

# Generate synthetic sensor data
synthetic_data = {
    'sensor_00': np.random.uniform(2.4, 2.5, size=len(time_range)),
    'sensor_01': np.random.uniform(46, 57, size=len(time_range)),
    # Add other sensors as needed
}

# Create a DataFrame
synthetic_df = pd.DataFrame(synthetic_data, index=time_range)

# Print the range of synthetic data
print(f"Synthetic data range: {synthetic_df.index.min()} to {synthetic_df.index.max()}")


Synthetic data range: 1970-01-01 00:00:00 to 1970-01-01 00:00:00.220319


In [124]:
import pandas as pd

# Assuming you have already loaded the dataset
sensor_data = pd.read_csv('C:/Users/mozhdeh/Desktop/programming 4/cleaned_sensor_data.csv')

# Convert 'timestamp' to datetime
sensor_data['timestamp'] = pd.to_datetime(sensor_data['timestamp'], errors='coerce')
sensor_data.set_index('timestamp', inplace=True)

# Resample to a 1-second frequency
sensor_data_resampled = sensor_data.resample('1S').mean()

# Check the new data range
print(f"Resampled data range: {sensor_data_resampled.index.min()} to {sensor_data_resampled.index.max()}")


Resampled data range: 1970-01-01 00:00:00 to 1970-01-01 00:00:00


In [132]:
# Example: Fill missing values with the mean of the column
sensor_data_resampled.fillna(sensor_data_resampled.mean(), inplace=True)

# Or drop columns with excessive missing values
sensor_data_resampled.dropna(axis=1, how='any', inplace=True)



In [133]:
sensor_data_resampled.to_csv('C:/Users/mozhdeh/Desktop/programming 4/cleaned_sensor_data_final.csv')


In [134]:
# Inspect the first few rows of the cleaned data
print(sensor_data_resampled.head())

# Check for any remaining missing values
print(sensor_data_resampled.isna().sum().sum())


            sensor_00  sensor_01  sensor_02  sensor_03   sensor_04  sensor_05  \
timestamp                                                                       
1970-01-01   2.372221  47.591611  50.867392  43.752481  590.673936  73.396414   

            sensor_06  sensor_07  sensor_08  sensor_09  ...  sensor_42  \
timestamp                                               ...              
1970-01-01  13.501537  15.843152  15.200721   14.79921  ...  35.453455   

            sensor_43  sensor_44  sensor_45  sensor_46  sensor_47   sensor_48  \
timestamp                                                                       
1970-01-01  43.879591  42.656877  43.094984  48.018585  44.340903  150.889044   

            sensor_49  sensor_50   sensor_51  
timestamp                                     
1970-01-01  57.119968  183.04926  202.699667  

[1 rows x 51 columns]
0


In [142]:

# Define date ranges
train_start = '2018-04-01'
train_end = '2018-06-30'
validation_start = '2018-07-01'
validation_end = '2018-07-31'
test_start = '2018-08-01'
test_end = '2018-08-31'

# Split the data
train_data = sensor_data_resampled.loc[train_start:train_end]
validation_data = sensor_data_resampled.loc[validation_start:validation_end]
test_data = sensor_data_resampled.loc[test_start:test_end]

# Save the splits
train_data.to_csv('train_data.csv')
validation_data.to_csv('validation_data.csv')
test_data.to_csv('test_data.csv')



In [147]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import joblib

# Load the training data
train_data = pd.read_csv('train_data.csv', index_col='timestamp', parse_dates=True)
# Check if the DataFrame is empty
print(train_data.head())  # Check the first few rows
print(train_data.shape)   # Check the shape of the DataFrame


print(sensor_data_resampled.index.min(), sensor_data_resampled.index.max())  # Check the date range





Empty DataFrame
Columns: [sensor_00, sensor_01, sensor_02, sensor_03, sensor_04, sensor_05, sensor_06, sensor_07, sensor_08, sensor_09, sensor_10, sensor_11, sensor_12, sensor_13, sensor_14, sensor_16, sensor_17, sensor_18, sensor_19, sensor_20, sensor_21, sensor_22, sensor_23, sensor_24, sensor_25, sensor_26, sensor_27, sensor_28, sensor_29, sensor_30, sensor_31, sensor_32, sensor_33, sensor_34, sensor_35, sensor_36, sensor_37, sensor_38, sensor_39, sensor_40, sensor_41, sensor_42, sensor_43, sensor_44, sensor_45, sensor_46, sensor_47, sensor_48, sensor_49, sensor_50, sensor_51]
Index: []

[0 rows x 51 columns]
(0, 51)
1970-01-01 00:00:00 1970-01-01 00:00:00


In [119]:
# # Load the scaler and model
# scaler = joblib.load('scaler.pkl')
# model = joblib.load('isolation_forest_model.pkl')

# # Load validation and test data
# validation_data = pd.read_csv('validation_data.csv', index_col='timestamp', parse_dates=True)
# test_data = pd.read_csv('test_data.csv', index_col='timestamp', parse_dates=True)

# # Transform the data
# X_validation_scaled = scaler.transform(validation_data)
# X_test_scaled = scaler.transform(test_data)

# # Predict anomalies
# validation_predictions = model.predict(X_validation_scaled)
# test_predictions = model.predict(X_test_scaled)

# # Save the predictions
# validation_data['anomaly'] = validation_predictions
# test_data['anomaly'] = test_predictions

# validation_data.to_csv('validation_results.csv')
# test_data.to_csv('test_results.csv')


Full date range of the cleaned dataset: 1970-01-01 00:00:00 to 1970-01-01 00:00:00.000220319


In [None]:
# import matplotlib.pyplot as plt

# def plot_sensor_anomalies(df, sensor, name):
#     """
#     Plots the sensor data with anomalies.

#     Parameters:
#     df (pd.DataFrame): The dataframe containing sensor data and anomalies.
#     sensor (str): The sensor column name.
#     name (str): The name to be used in the title and legend.

#     Returns:
#     plt.Figure: The matplotlib figure object with the plot.
#     """
#     fig, ax = plt.subplots(figsize=(12, 6))
#     ax.plot(df.index, df[sensor], label=f'{sensor} - {name}')
#     ax.set_xlabel('Time')
#     ax.set_ylabel('Value')
#     ax.set_title(f'{name} Anomalies')
#     ax.legend()
    
#     return fig


In [None]:
# import matplotlib.pyplot as plt

# def plot_sensor_anomalies(df, sensor, name):
#     """
#     Plots the sensor data with anomalies.

#     Parameters:
#     df (pd.DataFrame): The dataframe containing sensor data and anomalies.
#     sensor (str): The sensor column name.
#     name (str): The name to be used in the title and legend.

#     Returns:
#     plt.Figure: The matplotlib figure object with the plot.
#     """
#     fig, ax = plt.subplots(figsize=(12, 6))
#     ax.plot(df.index, df[sensor], label=f'{sensor} - {name}')
#     ax.set_xlabel('Time')
#     ax.set_ylabel('Value')
#     ax.set_title(f'{name} Anomalies')
#     ax.legend()
    
#     return fig


In [None]:
# # Assuming df is your dataframe with sensor data and anomalies
# fig = plot_sensor_anomalies(validation_data, 'sensor_00', 'Validation Data')
# fig.savefig('validation_sensor_00_plot.png')  # Save the plot if needed
# plt.show()
