### Data Cleaning
---

In [2]:
import pandas as pd
import numpy as np

# Load the raw dataset. read_csv already returns a DataFrame, so no
# pd.DataFrame(...) wrapper is needed.
data = pd.read_csv('garments_worker_productivity.csv')

# Work on an explicit copy so `data` keeps the values exactly as loaded.
df = data.copy()

# Jupyter only renders the last expression of a cell, so the preview goes here.
df.head()

In [3]:
# Summary statistics for the numeric columns. In the output below, the 'wip'
# count (691) is lower than the 1197 reported for every other column, which
# indicates that 'wip' contains missing values.
df.describe()

Unnamed: 0,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
count,1197.0,1197.0,1197.0,691.0,1197.0,1197.0,1197.0,1197.0,1197.0,1197.0,1197.0
mean,6.426901,0.729632,15.062172,1190.465991,4567.460317,38.210526,0.730159,0.369256,0.150376,34.609858,0.735091
std,3.463963,0.097891,10.943219,1837.455001,3348.823563,160.182643,12.709757,3.268987,0.427848,22.197687,0.174488
min,1.0,0.07,2.9,7.0,0.0,0.0,0.0,0.0,0.0,2.0,0.233705
25%,3.0,0.7,3.94,774.5,1440.0,0.0,0.0,0.0,0.0,9.0,0.650307
50%,6.0,0.75,15.26,1039.0,3960.0,0.0,0.0,0.0,0.0,34.0,0.773333
75%,9.0,0.8,24.26,1252.5,6960.0,50.0,0.0,0.0,0.0,57.0,0.850253
max,12.0,0.8,54.56,23122.0,25920.0,3600.0,300.0,45.0,2.0,89.0,1.120437


In [4]:
# Flag, for each column, whether it contains at least one NaN value.
print(df.isna().any(axis=0))

date                     False
quarter                  False
department               False
day                      False
team                     False
targeted_productivity    False
smv                      False
wip                       True
over_time                False
incentive                False
idle_time                False
idle_men                 False
no_of_style_change       False
no_of_workers            False
actual_productivity      False
dtype: bool


Ignoring specific tuples (rows) or removing an entire row or column may lead to the loss of valuable data and may introduce bias into the analysis of the data and of the relationships between features. Hence, these methods were not applied.

In [5]:
# Candidate fill values for the missing 'wip' entries: mean, median and mode.
# pandas skips NaN by default, so each statistic is computed from the
# observed values only.

# Mean: the arithmetic average. It is pulled towards extreme values, so
# outliers can make it unrepresentative of a typical row.
mean = np.round(df['wip'].mean(), 1)
print("Mean of 'wip': ", mean)

# Median: the middle value of the sorted data. It is robust to outliers, but
# filling many gaps with it piles the column up on a single value.
median = df['wip'].median()
print("Median of 'wip': ", median)

# Mode: the most frequent value (the first one is taken if there are ties).
# It is not influenced by outliers, but like the median it concentrates the
# filled entries on one value and hides variation in the data.
mode = df['wip'].mode().iat[0]
print("Mode of 'wip': ", mode)

Mean of 'wip':  1190.5
Median of 'wip':  1039.0
Mode of 'wip':  1039.0


In [6]:
# Preview 'wip' with missing entries replaced by the rounded mean.
# The filled Series is only printed; df itself is not modified.
print(df['wip'].fillna(value=mean))

0       1108.0
1       1190.5
2        968.0
3        968.0
4       1170.0
         ...  
1192    1190.5
1193    1190.5
1194    1190.5
1195    1190.5
1196    1190.5
Name: wip, Length: 1197, dtype: float64


In [7]:
# Preview 'wip' with missing entries replaced by the median.
# The filled Series is only printed; df itself is not modified.
print(df['wip'].fillna(value=median))

0       1108.0
1       1039.0
2        968.0
3        968.0
4       1170.0
         ...  
1192    1039.0
1193    1039.0
1194    1039.0
1195    1039.0
1196    1039.0
Name: wip, Length: 1197, dtype: float64


In [8]:
# Preview 'wip' with missing entries replaced by the mode.
# The filled Series is only printed; df itself is not modified.
print(df['wip'].fillna(value=mode))

0       1108.0
1       1039.0
2        968.0
3        968.0
4       1170.0
         ...  
1192    1039.0
1193    1039.0
1194    1039.0
1195    1039.0
1196    1039.0
Name: wip, Length: 1197, dtype: float64


In [10]:
# Global-constant imputation: every missing 'wip' value is replaced by one
# fixed number. This keeps all rows and the dataset's structure, but it places
# all filled entries on a single value, which can distort the column's
# distribution and bias later statistics.
#
# The constant is set in code rather than read with input() so the cell runs
# unattended (Restart Kernel -> Run All) and always reproduces the same result.
# 700 is the value that was entered when the output below was produced.
global_constant = 700.0
print(df['wip'].fillna(global_constant))

0       1108.0
1        700.0
2        968.0
3        968.0
4       1170.0
         ...  
1192     700.0
1193     700.0
1194     700.0
1195     700.0
1196     700.0
Name: wip, Length: 1197, dtype: float64
