In [1]:
###################################################################
#NASA Turbofan Engine Degradation reliability and failure analysis#
###################################################################
#
# Data cleaning on the NASA Turbofan engine degradation data
# The following questions will be investigated
# - Which engines show the fastest degradation?
# - Which sensors are strongest failure indicators?
# - How long do engines typically operate before failure?

In [None]:
# Create tables - engine_readings, engine_metadata, failure_events
# Data clean and prep - missing values, rename columns, convert timestamps/cycles, rolling avgs, deltas
# EDA - trends over time, time to failure, correlation between failure and sensors (graphs)
# Feature creation - RUL, degradation, failure frequency per engine type
# Run SQL queries - avg cycles to failure by engine type, rank engines by degradation speed, top 10 sensors with early failure,
#                   rolling avg using window functions, failure rate by batch/custom (save sql files and show in notebook)
# Dashboard

In [2]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, text

In [3]:
# create connection
# The database password is read from the PGPASSWORD environment variable so
# that it is never stored in (or committed with) the notebook.
db_password = os.environ.get("PGPASSWORD")
if db_password is None:
    raise RuntimeError(
        "Set the PGPASSWORD environment variable before running this notebook."
    )

# quote_plus escapes characters such as '@' or '/' that would otherwise break
# URL parsing if they appear in the password.
engine = create_engine(
    f"postgresql://postgres:{quote_plus(db_password)}@localhost:5432/Project1-NASA"
)

# Probe the database once inside a context manager so the connection is closed
# and returned to the pool instead of being left open.
with engine.connect() as connection:
    connection.execute(text("SELECT 1"))

<sqlalchemy.engine.base.Connection at 0x1b5530dc150>

In [4]:
# Pull every row of engine_readings from the database into a DataFrame.
# pd.read_sql_query takes the column names from the cursor metadata, so the
# frame keeps its columns even when the query returns no rows.
# The connection is closed when the with-block exits.
with engine.connect() as connection:
    df = pd.read_sql_query(text('SELECT * FROM engine_readings'), connection)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# engine_id and cycle are integer columns and therefore appear here too;
# NOTE(review): engine_id looks like an identifier, so its mean/std are presumably not meaningful.
df.describe()

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
count,285887.0,285887.0,285887.0,285887.0,285887.0,285887.0,285887.0,285887.0,285887.0,285887.0,...,285887.0,285887.0,285887.0,285887.0,285887.0,285887.0,285887.0,285887.0,285887.0,285887.0
mean,100.760136,112.037169,15.981554,0.38074,96.041793,488.191808,600.555331,1475.104199,1270.271716,10.233339,...,351.839383,2352.480954,8091.341445,9.007547,0.025479,362.820936,2282.030785,98.508745,26.874355,16.124836
std,71.406409,80.353969,16.533342,0.370029,11.944094,30.512431,42.553019,118.194571,136.690625,4.287012,...,165.046807,107.435575,78.596006,0.739819,0.004977,31.031327,140.309767,4.499937,11.747391,7.048526
min,1.0,1.0,-0.0087,-0.0006,60.0,445.0,535.41,1240.94,1023.77,3.91,...,128.26,2027.57,7845.78,8.1563,0.02,302.0,1915.0,84.93,10.16,6.0105
25%,42.0,48.0,0.0009,0.0001,100.0,449.44,550.13,1358.17,1127.0,5.48,...,165.09,2387.97,8071.72,8.4299,0.02,332.0,2212.0,100.0,14.41,8.6457
50%,84.0,99.0,10.0046,0.2506,100.0,491.19,607.15,1497.7,1303.73,10.52,...,371.39,2388.06,8123.62,8.6586,0.03,369.0,2324.0,100.0,28.53,17.1196
75%,157.0,159.0,34.9997,0.84,100.0,518.67,642.39,1587.18,1403.05,14.62,...,521.54,2388.14,8139.35,9.3312,0.03,392.0,2388.0,100.0,38.87,23.321
max,260.0,543.0,42.008,0.842,100.0,518.67,645.11,1616.91,1441.49,14.62,...,537.49,2390.49,8293.72,11.0669,0.03,400.0,2388.0,100.0,39.89,23.9505


In [6]:
# Structural overview of the frame: row count, column names,
# non-null count per column and each column's dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285887 entries, 0 to 285886
Data columns (total 28 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   engine_id      285887 non-null  int64  
 1   cycle          285887 non-null  int64  
 2   op_setting_1   285887 non-null  float64
 3   op_setting_2   285887 non-null  float64
 4   op_setting_3   285887 non-null  float64
 5   sensor_1       285887 non-null  float64
 6   sensor_2       285887 non-null  float64
 7   sensor_3       285887 non-null  float64
 8   sensor_4       285887 non-null  float64
 9   sensor_5       285887 non-null  float64
 10  sensor_6       285887 non-null  float64
 11  sensor_7       285887 non-null  float64
 12  sensor_8       285887 non-null  float64
 13  sensor_9       285887 non-null  float64
 14  sensor_10      285887 non-null  float64
 15  sensor_11      285887 non-null  float64
 16  sensor_12      285887 non-null  float64
 17  sensor_13      285887 non-nul

In [7]:
# Number of missing (NaN/None) entries in each column
df.isna().sum()

engine_id        0
cycle            0
op_setting_1     0
op_setting_2     0
op_setting_3     0
sensor_1         0
sensor_2         0
sensor_3         0
sensor_4         0
sensor_5         0
sensor_6         0
sensor_7         0
sensor_8         0
sensor_9         0
sensor_10        0
sensor_11        0
sensor_12        0
sensor_13        0
sensor_14        0
sensor_15        0
sensor_16        0
sensor_17        0
sensor_18        0
sensor_19        0
sensor_20        0
sensor_21        0
dataset_id       0
dataset_split    0
dtype: int64

In [16]:
# Flag every row that exactly repeats an earlier row (all columns equal),
# report how many rows are flagged, then preview the first few of them.
repeated_rows = df.duplicated()
print(repeated_rows.value_counts())
df.loc[repeated_rows].head()

False    265256
True      20631
Name: count, dtype: int64


Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,dataset_id,dataset_split
125528,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392.0,2388.0,100.0,39.06,23.419,FD001,train
125529,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392.0,2388.0,100.0,39.0,23.4236,FD001,train
125530,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390.0,2388.0,100.0,38.95,23.3442,FD001,train
125531,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392.0,2388.0,100.0,38.88,23.3739,FD001,train
125532,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393.0,2388.0,100.0,38.9,23.4044,FD001,train
