In [None]:
import sys
import os

# Make the project's src directory importable so the local modules used by the
# following cells (data_loader, utils) can be resolved. The path is relative to
# the notebook's working directory.
src_dir = os.path.abspath("../src")
# Guard against duplicates: re-running this cell would otherwise append the
# same entry to sys.path every time.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
from data_loader import load_turbofan_data, rename_turbofan_columns, check_data_integrity
from utils import identify_constant_sensors, drop_constant_sensors, plot_engine_cycle_distribution, plot_sensor_distributions

In [None]:
from data_loader import load_turbofan_data, rename_turbofan_columns, check_data_integrity

In [None]:
# Location of the FD001 training file, relative to the notebook's working
# directory. The load cell below passes this path to load_turbofan_data.
file_path = '../data/train_FD001.txt'

In [None]:
# Load the raw data file with the project loader from data_loader.
# Later cells index the result by column name (e.g. df['error']), so it is
# presumably a pandas DataFrame — confirm against load_turbofan_data.
df = load_turbofan_data(file_path)


In [None]:
# Rename the loaded columns via the project helper. Later cells refer to names
# such as 'unit_number', 'time_in_cycles' and 'sensor_measurement_N';
# presumably they are assigned here — verify in rename_turbofan_columns.
df = rename_turbofan_columns(df)

In [None]:
# Run the project's integrity check on the renamed data. The return value is
# not captured, so whatever the helper prints or returns is shown as the cell
# output. NOTE(review): which checks are performed is defined in data_loader.
check_data_integrity(df)

In [None]:
from utils import plot_engine_cycle_distribution, plot_sensor_distributions

In [None]:
# Plot the engine cycle distribution with the helper from utils. What exactly
# is drawn is defined in utils and not visible here.
plot_engine_cycle_distribution(df)

In [None]:
# Sensors 2-4 are the columns passed to the distribution plot helper.
sensors_to_plot = [f'sensor_measurement_{sensor_id}' for sensor_id in (2, 3, 4)]

plot_sensor_distributions(df, sensors_to_plot)

In [None]:
from utils import identify_constant_sensors

In [None]:
# Ask the utils helper which sensors it considers constant.
# NOTE(review): constant_sensors is stored but not referenced again in the
# cells shown; the sensor columns removed later are listed by hand — confirm
# the two stay in sync.
constant_sensors = identify_constant_sensors(df)

In [None]:
from utils import drop_constant_sensors

In [None]:
from utils import plot_operational_settings_distribution

In [None]:
# Plot the operational-setting distributions with the helper from utils.
# The next data cell removes 'operational_setting_3'; presumably this plot
# motivates that choice — confirm from the rendered output.
plot_operational_settings_distribution(df)

In [None]:
from utils import drop_columns

In [None]:
# Remove 'operational_setting_3' from the working frame.
# NOTE(review): df is rebound in place, so re-running this cell passes an
# already-reduced frame to drop_columns — check how the helper treats missing
# columns.
columns_to_remove = ['operational_setting_3']
df = drop_columns(df, columns_to_remove)

In [None]:
from utils import plot_correlation_heatmap

In [None]:
# Draw the correlation heatmap for the current columns using the utils helper.
plot_correlation_heatmap(df)

In [None]:
# Sensor columns to discard, identified by their measurement index.
sensor_ids_to_drop = (1, 5, 10, 16, 18, 19)
columns_to_remove = [f'sensor_measurement_{sensor_id}' for sensor_id in sensor_ids_to_drop]

df = drop_columns(df, columns_to_remove)

In [None]:
from utils import plot_sensor_boxplots

In [None]:
# Sensors shown in the boxplot view, identified by their measurement index.
sensors_to_plot = [f'sensor_measurement_{sensor_id}' for sensor_id in (2, 3, 4, 7, 11)]

plot_sensor_boxplots(df, sensors_to_plot)

In [None]:
from utils import compute_rul

In [None]:
# Add the Remaining Useful Life target via the utils helper. Later cells read
# an 'RUL' column (see the anomaly preview), which is presumably created
# here — confirm in compute_rul.
df = compute_rul(df)

In [None]:
from utils import train_random_forest_and_get_feature_importance

In [None]:
# Train a random forest through the utils helper and keep what it returns.
# NOTE(review): importances_df is reassigned by the next training cell, so this
# result is only available until then.
importances_df = train_random_forest_and_get_feature_importance(df)

In [None]:
from utils import train_rf_and_evaluate

In [None]:
# Train and evaluate a random forest on the current features (before rolling
# features are added a few cells below). Overwrites importances_df.
importances_df = train_rf_and_evaluate(df)

In [None]:
from utils import add_rolling_features

In [None]:
# Append rolling-window features using a window of 5. The report written later
# in this notebook describes them as 5-cycle rolling mean and std of sensor
# readings — confirm in add_rolling_features.
df = add_rolling_features(df, window=5)

In [None]:
# Repeat the random forest train/evaluate step, this time on the frame that
# includes the rolling features added in the previous cell.
importances_df = train_rf_and_evaluate(df)

In [None]:
from utils import train_lightgbm_and_evaluate

In [None]:
# Train and evaluate the alternative model (LightGBM, judging by the helper's
# name) on the same frame. Overwrites importances_df.
importances_df = train_lightgbm_and_evaluate(df)

In [None]:
from utils import train_xgboost_and_evaluate

In [None]:
# Train and evaluate the alternative model (XGBoost, judging by the helper's
# name) on the same frame. Overwrites importances_df.
importances_df = train_xgboost_and_evaluate(df)

In [None]:
from utils import train_rf_and_return_model

In [None]:
# Fit the final random forest and keep both the model and the returned frame.
# The anomaly cells below read 'predicted_RUL' and 'error' columns from df;
# presumably the helper adds them — confirm in train_rf_and_return_model.
trained_rf_model, df = train_rf_and_return_model(df)

In [None]:
from utils import compute_anomaly_threshold, plot_prediction_error_distribution

In [None]:
# Derive the anomaly cut-off and plot the error distribution against it.
threshold = compute_anomaly_threshold(df)
plot_prediction_error_distribution(df, threshold)

# Keep only the rows whose prediction error exceeds the cut-off.
exceeds_threshold = df['error'] > threshold
anomalies = df[exceeds_threshold]
print(f"Number of anomalies detected: {len(anomalies)}")

# Preview the first few flagged rows with the columns relevant for review.
preview_columns = ['unit_number', 'time_in_cycles', 'RUL', 'predicted_RUL', 'error']
display(anomalies[preview_columns].head())

In [None]:
# Write the flagged rows to a CSV file in the working directory, without the
# index column. The ZIP cell at the end of the notebook bundles this file.
anomalies.to_csv("anomalies_detected.csv", index=False)
print("Anomalies exported to anomalies_detected.csv")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Flag rows whose prediction error exceeds the anomaly threshold; the later
# plotting cells read this column.
df['is_anomaly'] = df['error'] > threshold

# Map the boolean flag to readable labels *before* plotting so that the legend
# text is tied to each point's actual class. Relabelling afterwards with
# plt.legend(labels=[...]) pairs labels with the axes' artists purely by order,
# which can attach "Normal"/"Anomaly" to the wrong colors.
cycle_plot_data = df.assign(
    anomaly_status=df['is_anomaly'].map({False: "Normal", True: "Anomaly"})
)

plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=cycle_plot_data,
    x='time_in_cycles',
    y='error',
    hue='anomaly_status',
    hue_order=["Normal", "Anomaly"],
    palette={"Normal": 'blue', "Anomaly": 'red'},
    legend='brief'
)
plt.title("Prediction Error over Cycles with Anomalies Highlighted")
plt.xlabel("Time in Cycles")
plt.ylabel("Prediction Error")
# Only set the title here; the entries come from the labelled hue levels.
plt.legend(title="Anomaly")
plt.show()


In [None]:
# Map the boolean flag to readable labels before plotting so that the legend
# text is tied to each point's actual class. Passing labels=[...] to
# plt.legend without handles pairs labels with the axes' artists by order and
# can attach "Normal"/"Anomaly" to the wrong colors.
unit_plot_data = df.assign(
    anomaly_status=df['is_anomaly'].map({False: "Normal", True: "Anomaly"})
)

plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=unit_plot_data,
    x='unit_number',
    y='error',
    hue='anomaly_status',
    hue_order=["Normal", "Anomaly"],
    palette={"Normal": 'blue', "Anomaly": 'red'},
    legend='brief'
)
plt.title("Prediction Error by Unit Number with Anomalies Highlighted")
plt.xlabel("Unit Number")
plt.ylabel("Prediction Error")
# Only set the title here; the entries come from the labelled hue levels.
plt.legend(title="Anomaly")
plt.show()

In [None]:
# Build the Markdown report. The anomaly threshold and anomaly count are taken
# from the values computed earlier in this notebook (`threshold`, `anomalies`)
# so the report cannot drift from the actual run.
# NOTE(review): the model metrics (R², MAE, RMSE) and the feature count are
# still a static snapshot; the evaluation helpers do not expose them as
# variables here — update them by hand or return them from utils.
report_content = f"""
# Turbofan Engine RUL Prediction & Anomaly Detection Report

## Project Objective
Develop a machine learning pipeline to:
- Predict Remaining Useful Life (RUL) of turbofan engines.
- Detect anomalies based on prediction errors.

---

## Data & Features
- Dataset: NASA CMAPSS Turbofan Engine Degradation Dataset (FD001)
- Rolling features: 5-cycle rolling mean and std of sensor readings
- Final feature count: 47 (after adding rolling features and dropping constants)

---

## Final Model
- Model: Random Forest Regressor
- R²: 0.7431
- MAE: 23.59
- RMSE: 34.26
- Best features: Rolling mean & std of key sensors (e.g., sensor_4, sensor_11, sensor_9)

---

## Anomaly Detection
- Method: IQR-based thresholding of prediction error
- Threshold: {float(threshold):.2f}
- Total anomalies detected: {len(anomalies)}

---

## Visual Insights
- Prediction Error over Cycles: See `cycle_anomaly_plot.png`
- Prediction Error by Unit Number: See `unit_anomaly_plot.png`

---

## Exported Files
- `anomalies_detected.csv`: List of detected anomalies with unit, cycle, RUL, prediction, error

---

## Next Steps
- Optional hyperparameter tuning
- Alternative anomaly thresholds (e.g. z-score)
- Deployment as a script or microservice
"""

# Write the report next to the notebook; the PDF and ZIP cells read this file.
with open("README.md", "w", encoding="utf-8") as file:
    file.write(report_content)

print("README.md file has been created successfully!")


In [None]:
def _save_anomaly_scatter(data, x_col, x_label, title, out_path):
    """Save a scatter plot of prediction error against ``x_col`` to ``out_path``.

    Points are colored by anomaly status. The boolean 'is_anomaly' column is
    mapped to "Normal"/"Anomaly" before plotting so that the legend text is
    tied to each point's class; relabelling afterwards with
    plt.legend(labels=[...]) pairs labels with artists by order and can
    mislabel the colors.

    Parameters:
        data: frame with 'error', 'is_anomaly' and ``x_col`` columns.
        x_col: name of the column plotted on the x axis.
        x_label: x-axis label text.
        title: figure title.
        out_path: file path the figure is written to.
    """
    plot_data = data.assign(
        anomaly_status=data['is_anomaly'].map({False: "Normal", True: "Anomaly"})
    )
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.scatterplot(
        data=plot_data,
        x=x_col,
        y='error',
        hue='anomaly_status',
        hue_order=["Normal", "Anomaly"],
        palette={"Normal": 'blue', "Anomaly": 'red'},
        legend='brief',
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Prediction Error")
    # Only set the title; the entries come from the labelled hue levels.
    ax.legend(title="Anomaly")
    fig.savefig(out_path)
    # Close the figure so it is not also rendered inline and its memory is freed.
    plt.close(fig)


# Cycle anomaly plot
_save_anomaly_scatter(
    df,
    x_col='time_in_cycles',
    x_label="Time in Cycles",
    title="Prediction Error over Cycles with Anomalies Highlighted",
    out_path="cycle_anomaly_plot.png",
)

# Unit anomaly plot
_save_anomaly_scatter(
    df,
    x_col='unit_number',
    x_label="Unit Number",
    title="Prediction Error by Unit Number with Anomalies Highlighted",
    out_path="unit_anomaly_plot.png",
)

print("Plots saved as cycle_anomaly_plot.png and unit_anomaly_plot.png")


In [None]:
# Template for a standalone script that loads data, trains a model, detects
# anomalies and saves the results. The text below is written verbatim to
# pipeline.py; the data file name inside it is a placeholder to be edited.
pipeline_code = '''
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv("YOUR_DATA_FILE.csv")  # <--- DATA DOSYANIN ADINI GÜNCELLE

# Feature engineering (örneğin rolling mean/std varsa buraya ekle)
# ...

# Model
feature_cols = [col for col in df.columns if col not in ['unit_number', 'time_in_cycles', 'RUL']]
X = df[feature_cols]
y = df['RUL']

model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X, y)
df['predicted_RUL'] = model.predict(X)
df['error'] = np.abs(df['RUL'] - df['predicted_RUL'])

# Threshold
Q1 = df['error'].quantile(0.25)
Q3 = df['error'].quantile(0.75)
IQR = Q3 - Q1
threshold = Q3 + 1.5 * IQR

df['is_anomaly'] = df['error'] > threshold

# Export anomalies
df[df['is_anomaly']].to_csv("anomalies_detected.csv", index=False)

# Plot & save
plt.figure(figsize=(12, 6))
sns.scatterplot(data=df, x='time_in_cycles', y='error', hue='is_anomaly', palette={True: 'red', False: 'blue'}, legend='brief')
plt.title("Prediction Error over Cycles with Anomalies Highlighted")
plt.savefig("cycle_anomaly_plot.png")
plt.close()

plt.figure(figsize=(12, 6))
sns.scatterplot(data=df, x='unit_number', y='error', hue='is_anomaly', palette={True: 'red', False: 'blue'}, legend='brief')
plt.title("Prediction Error by Unit Number with Anomalies Highlighted")
plt.savefig("unit_anomaly_plot.png")
plt.close()

print("Pipeline completed. Files saved.")
'''

# Write the template to pipeline.py in the working directory.
with open("pipeline.py", mode="w", encoding="utf-8") as script_file:
    script_file.write(pipeline_code)

print("Pipeline.py file created. Update YOUR_DATA_FILE.csv path in the script!")

In [None]:
from fpdf import FPDF

# Render the Markdown report line by line into a single-font PDF.
# The output path is kept in one variable so that the success message always
# names the file that was actually written.
pdf_path = "Turbofan_Anomaly_Detection.pdf"

pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)

# Each line of README.md becomes one wrapped text cell (full page width,
# height 10). The Markdown markup is written as-is, not interpreted.
with open("README.md", "r", encoding="utf-8") as file:
    for line in file:
        pdf.multi_cell(0, 10, line)

pdf.output(pdf_path)
print(f"PDF created: {pdf_path}")


In [None]:
import zipfile
import os

# Name of the archive that bundles the notebook's deliverables.
final_zip = "Turbofan_Anomaly_Detection.zip"

# Artifacts produced by the earlier cells, in the order they are archived.
files_to_include = [
    "README.md",
    "Turbofan_Anomaly_Detection.pdf",
    "pipeline.py",
    "anomalies_detected.csv",
    "cycle_anomaly_plot.png",
    "unit_anomaly_plot.png"
]

# Add each artifact that exists; report the ones that are missing and move on.
with zipfile.ZipFile(final_zip, mode='w') as archive:
    for artifact in files_to_include:
        if not os.path.exists(artifact):
            print(f"Warning: {artifact} not found, skipping.")
            continue
        archive.write(artifact)
        print(f"Added: {artifact}")

print(f"ZIP file created: {final_zip}")
