In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
df = pd.read_csv("/Users/erningxu/Desktop/data assignment/Final_Report/data/data_after_EDA.csv")
df = df.sort_values(by='date')
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_month'] = df['date'].dt.day
df['week_of_year'] = df['date'].dt.isocalendar().week
df['is_weekend'] = df['date'].dt.dayofweek.isin([5, 6]).astype(int)

In [2]:
X = df.drop(columns=['actual_productivity'])
y = df['actual_productivity']
imputer = SimpleImputer(strategy="median")
X['wip'] = imputer.fit_transform(X[['wip']])
tscv = TimeSeriesSplit(n_splits=3)
splits = list(tscv.split(X, y))
train_index = splits[0][0]  
val_index = splits[1][1]    
test_index = splits[2][1]
X_train = X.iloc[train_index]
y_train = y.iloc[train_index]
X_val = X.iloc[val_index]
y_val = y.iloc[val_index]
X_test = X.iloc[test_index]
y_test = y.iloc[test_index]
train_date_range = (df.iloc[train_index]['date'].min(), df.iloc[train_index]['date'].max())
val_date_range = (df.iloc[val_index]['date'].min(), df.iloc[val_index]['date'].max())
test_date_range = (df.iloc[test_index]['date'].min(), df.iloc[test_index]['date'].max())
print("Training set date range:", train_date_range)
print("Validation set date range:", val_date_range)
print("Test set date range:", test_date_range)



Training set date range: (Timestamp('2015-01-01 00:00:00'), Timestamp('2015-01-18 00:00:00'))
Validation set date range: (Timestamp('2015-02-04 00:00:00'), Timestamp('2015-02-23 00:00:00'))
Test set date range: (Timestamp('2015-02-23 00:00:00'), Timestamp('2015-03-11 00:00:00'))


In [3]:
X_train = X_train.drop(columns=['date'])
X_val = X_val.drop(columns=['date'])
X_test = X_test.drop(columns=['date'])
onehot_ftrs = ['department', 'team']
ordinal_ftrs = ['day', 'quarter']
std_ftrs = ['targeted_productivity', 'smv', 'wip', 'over_time', 'incentive',
            'idle_time', 'idle_men', 'no_of_style_change', 'no_of_workers',
            'day_of_month', 'week_of_year', 'year', 'month']

ordinal_encoder = OrdinalEncoder(categories=[
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4']
])

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse_output=False), onehot_ftrs),
        ('ordinal', ordinal_encoder, ordinal_ftrs),
        ('std', StandardScaler(), std_ftrs)
    ],
    remainder='passthrough'
)

clf = Pipeline(steps=[('preprocessor', preprocessor)])

X_train_prep = clf.fit_transform(X_train)
X_val_prep = clf.transform(X_val)
X_test_prep = clf.transform(X_test)

print("Original training shape:", X_train.shape)
print("Transformed training shape:", X_train_prep.shape)
print("Sample transformed training data:\n", X_train_prep[:5])

Original training shape: (300, 18)
Transformed training shape: (300, 30)
Sample transformed training data:
 [[ 0.          1.          0.          0.          0.          0.
   0.          0.          0.          1.          0.          0.
   0.          0.          3.          0.          0.47108153  1.01503979
   0.63397946  0.37210131  2.57881198  0.          0.          0.
   1.11789555 -1.7399449  -1.57461534  0.          0.          0.        ]
 [ 1.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   1.          0.          3.          0.         -0.57576631 -0.92543078
   0.25872569 -1.04408831 -0.95380717  0.          0.          0.
  -0.95160189 -1.7399449  -1.57461534  0.          0.          0.        ]
 [ 0.          1.          0.          0.          0.          1.
   0.          0.          0.          0.          0.          0.
   0.          0.          3.          0.         -1.09919024  0.7