In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset using the correct relative path
df = pd.read_csv('../Dataset/garments_worker_productivity.csv')

# Display the first 5 rows
df.head()


Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


In [3]:
## 3. Visualizing and Analyzing The Data (EDA)


In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the correlation matrix
correlation_matrix = df.corr()

# Set up the matplotlib figure
plt.figure(figsize=(15, 12))

# Draw the heatmap
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")

plt.title('Correlation Matrix of Employee Features')
plt.show()


ValueError: could not convert string to float: '1/1/2015'

In [None]:
# Select a few important columns to visualize against productivity
features_to_plot = ['targeted_productivity', 'over_time', 'incentive', 'no_of_workers']

# Create scatterplots
for feature in features_to_plot:
    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=df[feature], y=df['actual_productivity'])
    plt.title(f'Productivity vs. {feature}')
    plt.show()

# Create boxplots for categorical-like features
categorical_features = ['quarter', 'department']

for feature in categorical_features:
    plt.figure(figsize=(8, 5))
    sns.boxplot(x=df[feature], y=df['actual_productivity'])
    plt.title(f'Productivity across different {feature}s')
    plt.show()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
# Convert the 'date' column to a datetime object
df['date'] = pd.to_datetime(df['date'])

# Extract features like day, month, and year
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Drop the original 'date' column as we've replaced it
df = df.drop('date', axis=1)

In [None]:
# Fill missing 'wip' values with the mean of the column
# inplace=True modifies the DataFrame directly
df['wip'].fillna(df['wip'].mean(), inplace=True)

# Confirm that there are no more missing values
print(df.isnull().sum())

In [None]:
# This is the recommended way to fill missing values
df['wip'] = df['wip'].fillna(df['wip'].mean())

In [None]:
# Convert 'department' into numerical columns
df = pd.get_dummies(df, columns=['department'], drop_first=True)

In [None]:
# See the new columns and confirm no missing values
df.info()

# View the first few rows of your fully numerical dataset
df.head()

In [None]:
# The 'target' is what we want to predict
y = df['actual_productivity']

# The 'features' are all columns except the target
X = df.drop('actual_productivity', axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into 80% for training and 20% for testing
# random_state=42 ensures you get the same split every time you run the code
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Create an instance of the model
lr_model = LinearRegression()

# Train the model on your training data
lr_model.fit(X_train, y_train)

In [None]:
# This will print the names of any columns that still contain text
print(df.select_dtypes(include=['object']).columns)


In [None]:
# This will convert both 'quarter' and 'department' into numerical columns
df = pd.get_dummies(df, columns=['quarter', 'department'], drop_first=True)

In [None]:
print(df.columns)

In [None]:
# Only include the 'quarter' column, which we know is a text column
df = pd.get_dummies(df, columns=['quarter'], drop_first=True)

In [None]:
y = df['actual_productivity']
X = df.drop('actual_productivity', axis=1)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

# Create and train the model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

print("Model trained successfully!")

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Make predictions on the test data
predictions = lr_model.predict(X_test)

# Check the performance
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Create and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Make predictions with the Random Forest model
rf_predictions = rf_model.predict(X_test)

# Check the performance
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print("--- Random Forest Performance ---")
print(f"Mean Absolute Error: {rf_mae}")
print(f"R-squared: {rf_r2}")

In [None]:
from xgboost import XGBRegressor

# Create and train the XGBoost model
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

In [None]:
# Make predictions with the XGBoost model
xgb_predictions = xgb_model.predict(X_test)

# Check the performance
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)

print("--- XGBoost Performance ---")
print(f"Mean Absolute Error: {xgb_mae}")
print(f"R-squared: {xgb_r2}")

In [None]:
import pickle

# The 'rf_model' is the variable holding your trained Random Forest
# 'wb' means 'write binary' mode
with open('model_rf.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

print("Random Forest model saved successfully as model_rf.pkl")

In [None]:
print(X_train.columns.tolist())

In [None]:
print(X_train.columns.tolist())

In [None]:
import pickle

# Save the list of column names to a file
with open('model_columns.pkl', 'wb') as file:
    pickle.dump(X_train.columns, file)

print("Model columns saved successfully to model_columns.pkl!")
