In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer, KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
df = pd.read_csv("/Users/erningxu/Desktop/data assignment/Final_Report/data/data_after_EDA.csv")
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date')
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_month'] = df['date'].dt.day
df['week_of_year'] = df['date'].dt.isocalendar().week
df['is_weekend'] = (df['date'].dt.dayofweek.isin([5, 6])).astype(int)
X = df.drop(columns=['actual_productivity'])
y = df['actual_productivity']
tscv = TimeSeriesSplit(n_splits=3)
splits = list(tscv.split(X, y))
train_index = splits[0][0]  
val_index = splits[1][1]    
test_index = splits[2][1]
X_train = X.iloc[train_index]
y_train = y.iloc[train_index]
X_val = X.iloc[val_index]
y_val = y.iloc[val_index]
X_test = X.iloc[test_index]
y_test = y.iloc[test_index]
X_train = X_train.drop(columns=['date'])
X_val = X_val.drop(columns=['date'])
X_test = X_test.drop(columns=['date'])
imputers = {
    "mean": SimpleImputer(strategy="mean"),
    "median": SimpleImputer(strategy="median"),
    "knn": KNNImputer(n_neighbors=5)
}
def evaluate_imputation_with_custom_split(X_train, y_train, X_val, y_val, imputers, seeds):
    results = {}

    for name, imputer in imputers.items():
        print(f"Evaluating imputer: {name}")
        X_train_imputed = X_train.copy()
        X_val_imputed = X_val.copy()
        X_train_imputed["wip"] = imputer.fit_transform(X_train[["wip"]])
        X_val_imputed["wip"] = imputer.transform(X_val[["wip"]])
        X_train_imputed = pd.get_dummies(X_train_imputed, drop_first=True)
        X_val_imputed = pd.get_dummies(X_val_imputed, drop_first=True)
        X_val_imputed = X_val_imputed.reindex(columns=X_train_imputed.columns, fill_value=0)
        scores = []
        for seed in seeds:
            model = XGBRegressor(random_state=seed, eval_metric='rmse')
            model.fit(X_train_imputed, y_train)
            predictions = model.predict(X_val_imputed)
            mse = mean_squared_error(y_val, predictions)
            scores.append(mse)

        results[name] = np.mean(scores)

    return results

def evaluate_reduced_features_with_custom_split(X_train, y_train, X_val, y_val, seeds):
    print("Evaluating reduced features model")
    scores = []
    X_train_reduced = X_train.drop(columns=["wip"])
    X_val_reduced = X_val.drop(columns=["wip"])
    X_train_reduced = pd.get_dummies(X_train_reduced, drop_first=True)
    X_val_reduced = pd.get_dummies(X_val_reduced, drop_first=True)
    X_val_reduced = X_val_reduced.reindex(columns=X_train_reduced.columns, fill_value=0)
    for seed in seeds:
        model = XGBRegressor(random_state=seed, eval_metric='rmse')
        model.fit(X_train_reduced, y_train)
        predictions = model.predict(X_val_reduced)
        mse = mean_squared_error(y_val, predictions)
        scores.append(mse)

    return np.mean(scores)
seeds = [42, 100, 200, 300, 400]

imputation_results = evaluate_imputation_with_custom_split(X_train, y_train, X_val, y_val, imputers, seeds)

reduced_features_score = evaluate_reduced_features_with_custom_split(X_train, y_train, X_val, y_val, seeds)

imputation_results["reduced_features"] = reduced_features_score
ranked_methods = sorted(imputation_results.items(), key=lambda x: x[1])

print("Ranked methods (best to worst):", ranked_methods)


Evaluating imputer: mean
Evaluating imputer: median
Evaluating imputer: knn
Evaluating reduced features model
Ranked methods (best to worst): [('median', 0.0443210035449742), ('reduced_features', 0.04670680859808863), ('mean', 0.04672902872546966), ('knn', 0.04672902872546966)]
