In [1]:
###################################################################
#NASA Turbofan Engine Degradation reliability and failure analysis#
###################################################################
#
# Data cleaning on the NASA Turbofan engine degradation data
# The following questions will be investigated
# - Which engines show the fastest degradation?
# - Which sensors are strongest failure indicators?
# - How long do engines typically operate before failure?

In [None]:
# Create tables - engine_readings, engine_metadata, failure_events
# Data clean and prep - missing values, rename columns, convert timestamps/cycles, rolling avgs, deltas
# EDA - trends over time, time to failure, correlation between failure and sensors (graphs)
# Feature creation - RUL, degradation, failure frequency per engine type
# Run SQL queries - avg cycles to failure by engine type, rank engines by degradation speed, top 10 sensors with early failure,
#                   rolling avg using window functions, failure rate by batch/custom (save sql files and show in notebook)
# Dashboard

In [46]:
import os

import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

In [47]:
# create connection
# Credentials are read from the environment so the database password is never
# stored in the notebook or its version history. NASA_DB_PASSWORD must be set
# before the kernel starts (a missing value raises KeyError here); the other
# settings fall back to the local development defaults.
db_url = URL.create(
    drivername="postgresql",
    username=os.environ.get("NASA_DB_USER", "postgres"),
    password=os.environ["NASA_DB_PASSWORD"],
    host=os.environ.get("NASA_DB_HOST", "localhost"),
    port=int(os.environ.get("NASA_DB_PORT", "5432")),
    database=os.environ.get("NASA_DB_NAME", "Project1-NASA"),
)
engine = create_engine(db_url)

# Smoke-test the connection inside a context manager so it is handed back to
# the pool afterwards; a bare engine.connect() leaves it checked out.
with engine.connect() as connection:
    connection.execute(text("SELECT 1"))

<sqlalchemy.engine.base.Connection at 0x2b34bf85dd0>

In [48]:
# Pull every row of engine_readings from the database into a DataFrame.
with engine.connect() as connection:
    result = connection.execute(text("SELECT * FROM engine_readings"))
    # Take the column names from the result metadata instead of relying on
    # pandas to infer them from the row objects; this also keeps the schema
    # when the table is empty, so later column lookups do not fail.
    df = pd.DataFrame(result.fetchall(), columns=list(result.keys()))

In [49]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = df.describe()
summary_stats

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
count,265256.0,265256.0,265256.0,265256.0,265256.0,265256.0,265256.0,265256.0,265256.0,265256.0,...,265256.0,265256.0,265256.0,265256.0,265256.0,265256.0,265256.0,265256.0,265256.0,265256.0
mean,104.590965,112.288337,17.224563,0.410353,95.733933,485.821285,597.278898,1466.127181,1259.4869,9.892154,...,338.650301,2349.710886,8087.265017,9.051522,0.025127,360.457294,2273.788744,98.392759,25.94554,15.567569
std,72.288716,81.173106,16.528816,0.367993,12.346819,30.422878,42.459627,118.054562,136.086342,4.265541,...,164.160366,111.057719,79.995135,0.75033,0.004998,30.987659,142.396262,4.651664,11.695193,7.017239
min,1.0,1.0,-0.0087,-0.0006,60.0,445.0,535.41,1240.94,1023.77,3.91,...,128.26,2027.57,7845.78,8.1563,0.02,302.0,1915.0,84.93,10.16,6.0105
25%,44.0,48.0,0.0013,0.0002,100.0,449.44,549.89,1356.46,1125.63,5.48,...,164.79,2387.96,8069.62,8.4332,0.02,332.0,2212.0,100.0,14.34,8.6039
50%,89.0,99.0,19.9982,0.62,100.0,489.05,605.81,1491.81,1269.2,9.35,...,320.06,2388.06,8117.35,9.05925,0.03,367.0,2319.0,100.0,24.85,14.91645
75%,163.0,160.0,35.0015,0.84,100.0,518.67,642.28,1585.82,1400.96,14.62,...,521.45,2388.14,8138.19,9.3398,0.03,392.0,2388.0,100.0,38.85,23.3106
max,260.0,543.0,42.008,0.842,100.0,518.67,645.11,1616.91,1441.49,14.62,...,537.49,2390.49,8293.72,11.0669,0.03,400.0,2388.0,100.0,39.89,23.9505


In [50]:
# Structure check: print the index range plus each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265256 entries, 0 to 265255
Data columns (total 28 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   engine_id      265256 non-null  int64  
 1   cycle          265256 non-null  int64  
 2   op_setting_1   265256 non-null  float64
 3   op_setting_2   265256 non-null  float64
 4   op_setting_3   265256 non-null  float64
 5   sensor_1       265256 non-null  float64
 6   sensor_2       265256 non-null  float64
 7   sensor_3       265256 non-null  float64
 8   sensor_4       265256 non-null  float64
 9   sensor_5       265256 non-null  float64
 10  sensor_6       265256 non-null  float64
 11  sensor_7       265256 non-null  float64
 12  sensor_8       265256 non-null  float64
 13  sensor_9       265256 non-null  float64
 14  sensor_10      265256 non-null  float64
 15  sensor_11      265256 non-null  float64
 16  sensor_12      265256 non-null  float64
 17  sensor_13      265256 non-nul

In [35]:
# Count the missing (NaN/None) entries in each column
df.isna().sum()

engine_id        0
cycle            0
op_setting_1     0
op_setting_2     0
op_setting_3     0
sensor_1         0
sensor_2         0
sensor_3         0
sensor_4         0
sensor_5         0
sensor_6         0
sensor_7         0
sensor_8         0
sensor_9         0
sensor_10        0
sensor_11        0
sensor_12        0
sensor_13        0
sensor_14        0
sensor_15        0
sensor_16        0
sensor_17        0
sensor_18        0
sensor_19        0
sensor_20        0
sensor_21        0
dataset_id       0
dataset_split    0
dtype: int64

In [None]:
# Each reading should be unique for a given engine and cycle within one
# dataset and split, so count the rows that repeat an earlier row on that key.
key_columns = ["engine_id", "cycle", "dataset_id", "dataset_split"]
duplicates = df.duplicated(subset=key_columns).sum()
duplicates

0