In [3]:
# ------------------------------------------------------------
# Project Title: Manufacturing Equipment Output Prediction
# Author: Your Name
# ------------------------------------------------------------

# 1. Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# ------------------------------------------------------------
# 2. Load the dataset
data = pd.read_csv("manufacturing_dataset_1000_samples project1.csv")

print("\n✅ Dataset Loaded Successfully!")
print("\nFirst 5 rows:")
print(data.head())

print("\nDataset Info:")
print(data.info())

# ------------------------------------------------------------
# 3. Handle Missing Values
# Numerical columns -> fill with mean
# Categorical columns -> fill with mode
num_cols = ['Material_Viscosity', 'Ambient_Temperature', 'Operator_Experience']
for col in num_cols:
    data[col].fillna(data[col].mean(), inplace=True)

print("\n✅ Missing values handled successfully!")

# ------------------------------------------------------------
# 4. Encode Categorical Columns
cat_cols = ['Shift', 'Machine_Type', 'Material_Grade', 'Day_of_Week']

le = LabelEncoder()
for col in cat_cols:
    data[col] = le.fit_transform(data[col])

print("\n✅ Categorical columns encoded successfully!")

# ------------------------------------------------------------
# 5. Define Features (X) and Target (y)
X = data.drop('Parts_Per_Hour', axis=1)
y = data['Parts_Per_Hour']

# ------------------------------------------------------------
# 6. Feature Scaling (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\n✅ Feature scaling done successfully!")

# ------------------------------------------------------------
# 7. Split Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

print("\n✅ Data split into training and testing sets!")

# ------------------------------------------------------------
# 8. Train the Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

print("\n✅ Linear Regression model trained successfully!")

# ------------------------------------------------------------
# 9. Model Evaluation
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\n✅ Model Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# ------------------------------------------------------------
# 10. Visualization
plt.figure(figsize=(7,5))
sns.scatterplot(x=y_test, y=y_pred)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted Parts per Hour")
plt.show()

# ------------------------------------------------------------
# 11. Feature Importance (Model Coefficients)
coeff_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_
}).sort_values(by='Coefficient', ascending=False)

print("\n✅ Feature Importance:\n", coeff_df)

# ------------------------------------------------------------
# 12. Final Summary
print("\n🎯 Summary:")
print("✔ Model successfully built using Linear Regression.")
print("✔ Predictions can help optimize machine settings and detect underperformance.")


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values