In [1]:
###################################################################
#NASA Turbofan Engine Degradation reliability and failure analysis#
###################################################################
#
# Data cleaning on the NASA Turbofan engine degradation data
# The following questions will be investigated
# - Which engines show the fastest degradation?
# - Which sensors are strongest failure indicators?
# - How long do engines typically operate before failure?

In [None]:
# Create tables - engine_readings, engine_metadata, failure_events
# Data clean and prep - missing values, rename columns, convert timestamps/cycles,  rolling avgs, delts
# EDA - trends over time, time to failure, correlation between failure and sensors (graphs)
# Feature creation - RUL, degradation, failure frequency per engine type
# Run SQL queries - avg cycles to failure by engine type, rank engine by degradation spead, top 10 sensors with early failure,
#                   rolling avg using windows functions, failure rate by batch/custom (save sql files and show in notebook)
# Dashboard

In [2]:
import pandas as pd
from sqlalchemy import create_engine, text

In [3]:
# create connection
engine = create_engine("postgresql://postgres:Project1-NASA@localhost:5432/Project1-NASA")

engine.connect()

<sqlalchemy.engine.base.Connection at 0x12ec011b310>

In [4]:
# connection to pull the data from the database and assign it to a dataframe
with engine.connect() as connection:
    result = connection.execute(text('SELECT * FROM engine_readings LIMIT 10'))
    df = pd.DataFrame(result)

In [None]:
# information check
df.describe()

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,1.0,5.5,-0.00135,-1e-05,100.0,518.67,642.201,1587.712,1400.84,14.62,...,522.146,2388.044,8132.228,8.40839,0.03,391.8,2388.0,100.0,38.984,23.3896
std,0.0,3.02765,0.002382,0.000238,0.0,0.0,0.275941,4.030751,3.313461,0.0,...,0.400949,0.020111,3.333946,0.022654,7.314236e-18,0.918937,0.0,0.0,0.070111,0.044857
min,1.0,1.0,-0.0043,-0.0004,100.0,518.67,641.71,1582.79,1394.8,14.62,...,521.66,2388.02,8125.69,8.3682,0.03,390.0,2388.0,100.0,38.88,23.3106
25%,1.0,3.25,-0.003375,-0.000175,100.0,518.67,642.105,1583.3375,1398.8925,14.62,...,521.79,2388.03,8131.175,8.39995,0.03,391.25,2388.0,100.0,38.95,23.36865
50%,1.0,5.5,-0.0013,5e-05,100.0,518.67,642.25,1588.845,1400.785,14.62,...,522.235,2388.035,8132.585,8.4143,0.03,392.0,2388.0,100.0,38.975,23.3909
75%,1.0,7.75,0.000775,0.0001,100.0,518.67,642.365,1591.175,1402.8225,14.62,...,522.395,2388.0575,8133.6575,8.426325,0.03,392.0,2388.0,100.0,39.0375,23.4159
max,1.0,10.0,0.0019,0.0003,100.0,518.67,642.56,1592.32,1406.22,14.62,...,522.86,2388.08,8138.62,8.4318,0.03,393.0,2388.0,100.0,39.1,23.4694


In [None]:
# information check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 28 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   engine_id      10 non-null     int64  
 1   cycle          10 non-null     int64  
 2   op_setting_1   10 non-null     float64
 3   op_setting_2   10 non-null     float64
 4   op_setting_3   10 non-null     float64
 5   sensor_1       10 non-null     float64
 6   sensor_2       10 non-null     float64
 7   sensor_3       10 non-null     float64
 8   sensor_4       10 non-null     float64
 9   sensor_5       10 non-null     float64
 10  sensor_6       10 non-null     float64
 11  sensor_7       10 non-null     float64
 12  sensor_8       10 non-null     float64
 13  sensor_9       10 non-null     float64
 14  sensor_10      10 non-null     float64
 15  sensor_11      10 non-null     float64
 16  sensor_12      10 non-null     float64
 17  sensor_13      10 non-null     float64
 18  sensor_14    

In [11]:
# Check for missing values
df.isnull().sum()

engine_id        0
cycle            0
op_setting_1     0
op_setting_2     0
op_setting_3     0
sensor_1         0
sensor_2         0
sensor_3         0
sensor_4         0
sensor_5         0
sensor_6         0
sensor_7         0
sensor_8         0
sensor_9         0
sensor_10        0
sensor_11        0
sensor_12        0
sensor_13        0
sensor_14        0
sensor_15        0
sensor_16        0
sensor_17        0
sensor_18        0
sensor_19        0
sensor_20        0
sensor_21        0
dataset_id       0
dataset_split    0
dtype: int64

In [None]:
# check if there are duplicate values
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool

In [None]:
# Overall, the data was very clean with no duplicates and no missing information.
# Therefore, I will not do any further cleaning and will move on to exploratory data analysis.