In [5]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

df = pd.read_csv("/Users/erningxu/Desktop/data assignment/Final_Report/data/data_after_EDA.csv")
df = df.sort_values(by='date')
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_month'] = df['date'].dt.day
df['week_of_year'] = df['date'].dt.isocalendar().week
df['is_weekend'] = df['date'].dt.dayofweek.isin([5, 6]).astype(int)
X = df.drop(columns=['actual_productivity', 'date'])
y = df['actual_productivity']

tscv = TimeSeriesSplit(n_splits=3)
splits = list(tscv.split(X, y))
train_index = splits[0][0]
val_index = splits[1][1]
test_index = splits[2][1]

X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_val, y_val = X.iloc[val_index], y.iloc[val_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

onehot_ftrs = ['department', 'team']
ordinal_ftrs = ['day', 'quarter']
std_ftrs = ['targeted_productivity', 'smv', 'wip', 'over_time', 'incentive',
            'idle_time', 'idle_men', 'no_of_style_change', 'no_of_workers',
            'day_of_month', 'week_of_year', 'year', 'month']

ordinal_encoder = OrdinalEncoder(categories=[
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4']
])

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse_output=False), onehot_ftrs),
        ('ordinal', ordinal_encoder, ordinal_ftrs),
        ('std', StandardScaler(), std_ftrs)
    ],
    remainder='passthrough'
)

clf = Pipeline(steps=[('preprocessor', preprocessor)])

X_train_prep = clf.fit_transform(X_train)
X_val_prep = clf.transform(X_val)
X_test_prep = clf.transform(X_test)

def get_feature_names_from_column_transformer(preprocessor, original_feature_names):
    feature_names = []
    for name, transformer, cols in preprocessor.transformers_:
        if name == "remainder" and transformer == "passthrough":
            feature_names.extend(cols)
        elif hasattr(transformer, 'get_feature_names_out'):
            transformed_names = transformer.get_feature_names_out()
            feature_names.extend(transformed_names)
        else:
            feature_names.extend(cols)
    return feature_names

original_feature_names = X_train.columns.tolist()
feature_names = get_feature_names_from_column_transformer(clf.named_steps["preprocessor"], original_feature_names)

def evaluate_xgb_with_missing_values(X_train_prep, y_train, X_val_prep, y_val, seeds):
    scores = []
    for seed in seeds:
        model = XGBRegressor(random_state=seed, eval_metric='rmse')
        model.fit(X_train_prep, y_train)
        predictions = model.predict(X_val_prep)
        mse = mean_squared_error(y_val, predictions)
        scores.append(mse)
    return np.mean(scores)

def evaluate_with_multivariate_imputation(X_train, y_train, X_val, y_val, seeds, n_datasets=5):
    imputer = IterativeImputer(random_state=42, max_iter=10)
    imputed_datasets = [imputer.fit_transform(X_train) for _ in range(n_datasets)]
    scores = []
    for X_train_imputed in imputed_datasets:
        for seed in seeds:
            model = XGBRegressor(random_state=seed, eval_metric='rmse')
            model.fit(X_train_imputed, y_train)
            predictions = model.predict(X_val)
            mse = mean_squared_error(y_val, predictions)
            scores.append(mse)
    return np.mean(scores)

def evaluate_reduced_features_model(X_train, y_train, X_val, y_val, seeds):
    scores = []
    X_train_transformed = clf.named_steps['preprocessor'].transform(X_train)
    X_val_transformed = clf.named_steps['preprocessor'].transform(X_val)

    feature_names = get_feature_names_from_column_transformer(clf.named_steps["preprocessor"], X_train.columns)
    if "wip" in feature_names:
        wip_index = feature_names.index("wip")
    else:
        raise ValueError("The 'wip' feature is not found in the transformed feature names.")

    X_train_reduced = np.delete(X_train_transformed, wip_index, axis=1)
    X_val_reduced = np.delete(X_val_transformed, wip_index, axis=1)

    for seed in seeds:
        model = XGBRegressor(random_state=seed, eval_metric='rmse')
        model.fit(X_train_reduced, y_train)
        predictions = model.predict(X_val_reduced)
        mse = mean_squared_error(y_val, predictions)
        scores.append(mse)

    return np.mean(scores)


seeds = [42, 100, 200, 300, 400]

xgb_score = evaluate_xgb_with_missing_values(X_train_prep, y_train, X_val_prep, y_val, seeds)
imputation_score = evaluate_with_multivariate_imputation(X_train_prep, y_train, X_val_prep, y_val, seeds)
reduced_features_score = evaluate_reduced_features_model(X_train, y_train, X_val, y_val, seeds)

results = {
    "XGB Direct Handling": xgb_score,
    "Multivariate Imputation": imputation_score,
    "Reduced Features": reduced_features_score
}
ranked_methods = sorted(results.items(), key=lambda x: x[1])

print("Scores:")
for method, score in results.items():
    print(f"{method}: {score:.4f}")
print("\nRanked Methods (Best to Worst):")
for method, score in ranked_methods:
    print(f"{method}: {score:.4f}")



Scores:
XGB Direct Handling: 0.0472
Multivariate Imputation: 0.0429
Reduced Features: 0.0413

Ranked Methods (Best to Worst):
Reduced Features: 0.0413
Multivariate Imputation: 0.0429
XGB Direct Handling: 0.0472
