# Data Preprocessing and Exploratory Data Analysis (EDA)

This notebook covers the initial data loading, exploratory data analysis, and preprocessing steps for the AI4I 2020 Predictive Maintenance Dataset.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Apply seaborn's white-grid theme to every figure drawn below
sns.set_theme(style="whitegrid")

## Load the Dataset

In [None]:
# Read the raw CSV (path is relative to this notebook) and preview the first rows
raw_csv_path = '../data/ai4i2020.csv'
data = pd.read_csv(raw_csv_path)
data.head()

## Exploratory Data Analysis (EDA)

In [None]:
# Display basic information about the dataset:
# column names, dtypes, non-null counts and memory usage
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max).
# For a frame with mixed dtypes, describe() summarises only the numeric columns by default.
data.describe()

In [None]:
# Count the missing entries in every column (all zeros means nothing to impute)
data.isna().sum()

## Visualize the Data

In [None]:
# Correlation heatmap
# Pearson correlation is only defined for numeric columns, so select them explicitly:
# on pandas >= 2.0, DataFrame.corr() no longer drops non-numeric columns silently and
# raises if the frame contains any text column.
numeric_data = data.select_dtypes(include='number')

# Explicit figure/axes so the heatmap and its title are attached to the same plot
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap of Numeric Features')
plt.show()

In [None]:
# Distribution of the target variable
# Resolve the target header case-insensitively; fall back to the literal so that a
# genuinely missing column still raises the usual error.
# NOTE(review): the published AI4I 2020 CSV appears to spell the header
# 'Machine failure' — confirm against ../data/ai4i2020.csv.
target_name = 'machine failure'
target_col = target_name if target_name in data.columns else next(
    (col for col in data.columns if str(col).strip().lower() == target_name),
    target_name,
)

# Pass the column by keyword: newer seaborn releases bind the first positional
# argument to `data`, not `x`, so a positional Series is not counted per class.
fig, ax = plt.subplots()
sns.countplot(x=target_col, data=data, ax=ax)
ax.set_title('Distribution of Machine Failure')
plt.show()

## Data Preprocessing

In [None]:
# Define features and target variable
# Resolve the target header case-insensitively; fall back to the literal so that a
# genuinely missing column still raises the usual KeyError from drop().
# NOTE(review): the published AI4I 2020 CSV appears to spell the header
# 'Machine failure' — confirm against ../data/ai4i2020.csv.
target_name = 'machine failure'
target_col = target_name if target_name in data.columns else next(
    (col for col in data.columns if str(col).strip().lower() == target_name),
    target_name,
)

# Columns that must not be used as features: row/product identifiers carry no signal,
# and the per-mode failure flags would leak the label into the inputs.
# NOTE(review): names taken from the public dataset description — confirm against the CSV.
# errors='ignore' makes this a no-op for any column that is absent.
non_feature_cols = ['UDI', 'Product ID', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
features = data.drop(columns=[target_col]).drop(columns=non_feature_cols, errors='ignore')

# StandardScaler only accepts numeric input, so one-hot encode any remaining text
# columns; numeric columns pass through unchanged.
# NOTE(review): presumably only a low-cardinality column (product type) remains — verify,
# since a high-cardinality text column would explode the feature count here.
X = pd.get_dummies(features, dtype=float)
y = data[target_col]

# Split the data into training and testing sets.
# stratify=y keeps the failure rate the same in both splits, which matters when the
# target classes are imbalanced (see the count plot of the target).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features: fit on the training split only, then apply the same
# transform to the test split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Save the Preprocessed Data

In [None]:
# Bundle the scaled feature matrices and the label vectors into one .npz archive;
# each keyword becomes the key under which the array is stored.
preprocessed_arrays = {
    'X_train': X_train_scaled,
    'X_test': X_test_scaled,
    'y_train': y_train,
    'y_test': y_test,
}
np.savez('../data/preprocessed_data.npz', **preprocessed_arrays)

## Conclusion

In this notebook, we loaded the dataset, explored and visualized it, and preprocessed it for machine learning by splitting it into training and test sets and standardizing the features. The preprocessed arrays are saved to `../data/preprocessed_data.npz` and are ready for model training.