## EXPLORATORY DATA ANALYSIS

The dataset originally comes from: https://www.kaggle.com/datasets/behrad3d/nasa-cmaps

In [1]:
# Importing necessary libraries.
import pandas as pd
import numpy as np
import os
from pathlib import Path


In [2]:
# Locate the raw-data folder: <parent of the current working directory>/data/raw
url = os.path.join(os.path.dirname(os.getcwd()), "data", "raw")
print(url)

# Column labels in file order: identifiers, operating settings, then the remaining measurement columns
column_names = (
    ['engine_id', 'cycle']
    + [f'op_setting_{k}' for k in range(1, 4)]
    + ['T2', 'T24', 'T30', 'T50']
    + ['P2', 'P15', 'P30']
    + ['Nf', 'Nc', 'epr', 'Ps30', 'phi']
    + ['NRf', 'NRc', 'BPR', 'farB', 'htBleed']
    + ['Nf_dmd', 'PCNfR_dmd', 'W31', 'W32']
)

# Raw text file names for the four sub-datasets (FD001-FD004)
train_data = [f"FD00{k}_train.txt" for k in range(1, 5)]
test_data = [f"FD00{k}_test.txt" for k in range(1, 5)]
RUL_data = [f"FD00{k}_RUL.txt" for k in range(1, 5)]

# Iterate through the train_data to label all of them and save them as csv.
def label_train_data(train_data, columns = column_names, train_df_dict=None):
    """Read the raw training text files, label their columns and save each as CSV.

    Parameters
    ----------
    train_data : iterable of str
        File names (not full paths) located under ``<url>/nasa_cmaps_txt``.
    columns : list of str
        Column labels applied to every file. Defaults to ``column_names``.
    train_df_dict : ignored
        Kept only so existing calls keep working; a fresh dict is always built.

    Returns
    -------
    dict
        Maps each file stem (e.g. ``"FD001_train"``) to its labelled DataFrame.
    """
    source_dir = os.path.join(url, "nasa_cmaps_txt")
    output_dir = os.path.join(url, "nasa_cmaps_csv")
    # Make sure the destination exists so to_csv does not fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    train_df_dict = {}
    for data in train_data:
        file_path = os.path.join(source_dir, data)
        # Path.stem drops the extension, e.g. "FD001_train.txt" -> "FD001_train".
        base_name = Path(file_path).stem
        # Files are whitespace-delimited with no header row; honour the caller's
        # `columns` instead of silently falling back to the global list.
        train_df_dict[base_name] = pd.read_csv(file_path, sep=r'\s+', names=columns)

    # Persist every labelled frame as <stem>.csv without the pandas index.
    for key, value in train_df_dict.items():
        file_path = os.path.join(output_dir, key + ".csv")
        print(file_path)
        value.to_csv(file_path, sep=',', index=False)
    return train_df_dict  # Keeping the dataframes for later use.
# Run the conversion; train_data is rebound from the list of file names to the dict returned by the function.
train_data = label_train_data(train_data)

# Iterate through test data to label all of them and save them as csv
def label_test_data(test_data, columns = column_names , test_df_dict=None):
    """Read the raw test text files, label their columns and save each as CSV.

    Parameters
    ----------
    test_data : iterable of str
        File names (not full paths) located under ``<url>/nasa_cmaps_txt``.
    columns : list of str
        Column labels applied to every file. Defaults to ``column_names``.
    test_df_dict : ignored
        Kept only so existing calls keep working; a fresh dict is always built.

    Returns
    -------
    dict
        Maps each file stem (e.g. ``"FD001_test"``) to its labelled DataFrame.
    """
    source_dir = os.path.join(url, "nasa_cmaps_txt")
    output_dir = os.path.join(url, "nasa_cmaps_csv")
    # Make sure the destination exists so to_csv does not fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    test_df_dict = {}
    for data in test_data:
        file_path = os.path.join(source_dir, data)
        # Path.stem drops the extension, e.g. "FD001_test.txt" -> "FD001_test".
        basename = Path(file_path).stem
        # Honour the caller's `columns` rather than the global list.
        test_df_dict[basename] = pd.read_csv(file_path, sep=r'\s+', names=columns)

    for key, value in test_df_dict.items():
        file_path = os.path.join(output_dir, key + ".csv")
        print(file_path)
        # index=False keeps the test CSVs consistent with the train CSVs;
        # without it an extra unnamed index column appears when the file is re-read.
        value.to_csv(file_path, sep=',', index=False)

    return test_df_dict
# Run the conversion; test_data is rebound from the list of file names to the dict returned by the function.
test_data = label_test_data(test_data)

# Iterate through RUL and label it
def label_rul_data(RUL_data , column="remaining_useful_life", rul_df_dict=None):
    """Read the raw RUL text files, label their column and save each as CSV.

    Parameters
    ----------
    RUL_data : iterable of str
        File names (not full paths) located under ``<url>/nasa_cmaps_txt``.
    column : str
        Label for the value column. Defaults to ``"remaining_useful_life"``.
    rul_df_dict : ignored
        Kept only so existing calls keep working; a fresh dict is always built.

    Returns
    -------
    dict
        Maps each file stem (e.g. ``"FD001_RUL"``) to its labelled DataFrame.
    """
    source_dir = os.path.join(url, "nasa_cmaps_txt")
    output_dir = os.path.join(url, "nasa_cmaps_csv")
    # Make sure the destination exists so to_csv does not fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    rul_df_dict = {}
    for data in RUL_data:
        file_path = os.path.join(source_dir, data)
        basename = Path(file_path).stem
        # Label the data with `column` only. Applying the 26 sensor column names
        # here would file the values under 'engine_id' and pad 25 all-NaN columns.
        # NOTE(review): assumes each RUL file holds a single value per row - confirm
        # against the dataset description.
        rul_df_dict[basename] = pd.read_csv(file_path, sep=r'\s+', names=[column])

    # Persist every labelled frame as <stem>.csv without the pandas index.
    for key, value in rul_df_dict.items():
        file_path = os.path.join(output_dir, key + ".csv")
        print(file_path)
        value.to_csv(file_path, sep=',', index=False)
    return rul_df_dict
# Run the conversion and keep the returned dict (file stem -> DataFrame) as rul_data.
rul_data = label_rul_data(RUL_data)


/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw
/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw/nasa_cmaps_csv/FD001_train.csv
/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw/nasa_cmaps_csv/FD002_train.csv
/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw/nasa_cmaps_csv/FD003_train.csv
/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw/nasa_cmaps_csv/FD004_train.csv
/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw/nasa_cmaps_csv/FD001_test.csv
/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw/nasa_cmaps_csv/FD002_test.csv
/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw/nasa_cmaps_csv/FD003_test.csv
/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw/nasa_cmaps_csv/FD004_test.csv
/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw/nasa_cmaps_csv/FD001_RUL.csv
/home/local-host/PycharmProjects/Turbofan-engine-rul/data/raw/nasa_cmaps_csv/FD002_RUL.csv
/home/local-host

## N/B: We will focus on FD001_train for EDA
- It is the simplest dataset, with only one type of degradation and 100 engines.
- We will not combine FD001-FD004, as they represent different physical realities, i.e. different degradation patterns and different sensor behaviours.

### DATA LOADING AND INITIAL INSPECTION

In [3]:
# Read the labelled FD001 training CSV into a DataFrame
URL = os.path.join(os.path.dirname(os.getcwd()), "data", "raw", "nasa_cmaps_csv")
file_path = os.path.join(URL, "FD001_train.csv")
df_FD1 = pd.read_csv(file_path)  # comma is pandas' default delimiter

# Preview the first 10 rows
df_FD1.head(10)

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,T2,T24,T30,T50,P2,...,phi,NRf,NRc,BPR,farB,htBleed,Nf_dmd,PCNfR_dmd,W31,W32
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044
5,1,6,-0.0043,-0.0001,100.0,518.67,642.1,1584.47,1398.37,14.62,...,521.68,2388.03,8132.85,8.4108,0.03,391,2388,100.0,38.98,23.3669
6,1,7,0.001,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,522.32,2388.03,8132.32,8.3974,0.03,392,2388,100.0,39.1,23.3774
7,1,8,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,...,522.47,2388.03,8131.07,8.4076,0.03,391,2388,100.0,38.97,23.3106
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.8,14.62,...,521.79,2388.05,8125.69,8.3728,0.03,392,2388,100.0,39.05,23.4066
9,1,10,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,...,521.79,2388.06,8129.38,8.4286,0.03,393,2388,100.0,38.95,23.4694


In [4]:
# Display the last 10 rows of df_FD1
df_FD1.tail(10)

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,T2,T24,T30,T50,P2,...,phi,NRf,NRc,BPR,farB,htBleed,Nf_dmd,PCNfR_dmd,W31,W32
20621,100,191,-0.0005,-0.0,100.0,518.67,643.69,1610.87,1427.19,14.62,...,519.8,2388.28,8143.56,8.5092,0.03,398,2388,100.0,38.39,23.1218
20622,100,192,-0.0009,0.0001,100.0,518.67,643.53,1601.23,1419.48,14.62,...,520.59,2388.21,8143.46,8.4892,0.03,397,2388,100.0,38.56,23.077
20623,100,193,-0.0001,0.0002,100.0,518.67,643.09,1599.81,1428.93,14.62,...,520.11,2388.19,8142.02,8.5424,0.03,397,2388,100.0,38.47,23.023
20624,100,194,-0.0011,0.0003,100.0,518.67,643.72,1597.29,1427.41,14.62,...,519.55,2388.22,8139.67,8.5215,0.03,394,2388,100.0,38.38,23.1324
20625,100,195,-0.0002,-0.0001,100.0,518.67,643.41,1600.04,1431.9,14.62,...,519.71,2388.28,8142.9,8.5519,0.03,394,2388,100.0,38.14,23.1923
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.6,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,...,519.68,2388.22,8136.5,8.5139,0.03,395,2388,100.0,38.3,23.1594
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.064
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,...,519.3,2388.26,8137.33,8.5036,0.03,396,2388,100.0,38.37,23.0522


In [5]:
# Get the shape of the data as (number of rows, number of columns)
df_FD1.shape

(20631, 26)

- The dataset has 20,631 rows and 26 columns, each column holding a different variable for EDA.
- N/B: The meaning of each column is documented in README.md.

In [6]:
# Data types verification: list the dtype pandas inferred for each column
df_FD1.dtypes

engine_id         int64
cycle             int64
op_setting_1    float64
op_setting_2    float64
op_setting_3    float64
T2              float64
T24             float64
T30             float64
T50             float64
P2              float64
P15             float64
P30             float64
Nf              float64
Nc              float64
epr             float64
Ps30            float64
phi             float64
NRf             float64
NRc             float64
BPR             float64
farB            float64
htBleed           int64
Nf_dmd            int64
PCNfR_dmd       float64
W31             float64
W32             float64
dtype: object

- Our dataset has 22 columns of type float64 and 4 columns of type int64 (engine_id, cycle, htBleed, Nf_dmd), which is as expected.

### DATA QUALITY ASSESSMENT

In [7]:
# Check for missing values: count the null entries in each column
df_FD1.isnull().sum()

engine_id       0
cycle           0
op_setting_1    0
op_setting_2    0
op_setting_3    0
T2              0
T24             0
T30             0
T50             0
P2              0
P15             0
P30             0
Nf              0
Nc              0
epr             0
Ps30            0
phi             0
NRf             0
NRc             0
BPR             0
farB            0
htBleed         0
Nf_dmd          0
PCNfR_dmd       0
W31             0
W32             0
dtype: int64

- There are no null values (missing values) in our dataset

In [8]:
# Divide the data to engines then do their min and max, mean, std, count, 25%, 75% and 50%
def cluster_by_engine_id(df_FD1, engine_summaries = None ):
    """Compute descriptive statistics for every engine.

    Parameters
    ----------
    df_FD1 : pandas.DataFrame
        Must contain an ``engine_id`` column; every other numeric column is summarised.
    engine_summaries : ignored
        Kept only so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per ``engine_id``. Columns form a two-level index of
        ``(original column, statistic)``, where the statistic is one of
        count, mean, std, min, 25%, 50%, 75%, max.
    """
    # groupby(...).describe() summarises every group in a single call, so no
    # per-engine loop is needed and the result is actually returned rather
    # than computed and thrown away.
    return df_FD1.groupby('engine_id').describe()


# Run the per-engine summary on the FD001 training data and keep the function's return value.
engine_statistics = cluster_by_engine_id(df_FD1)