In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow import keras
from tensorflow.keras import layers, regularizers

In [None]:
# Location of the raw project/task export, kept in one named constant so it
# is easy to find and change.
# NOTE(review): this is an absolute, user-specific path; the notebook will
# not run on another machine until it is pointed at a local copy of the CSV.
DATA_PATH = r"C:\Users\fyl\OneDrive - UWE Bristol\Data Analysis Projects\Python\NLNG Projects.csv"

df = pd.read_csv(DATA_PATH)
df

In [None]:
# The regression target: task duration measured in hours.
target_col = 'Task Duration (hours)'

# Columns excluded from the feature matrix: the target itself plus three
# others. Presumably 'Task Duration (days)' is the same quantity in another
# unit (it would leak the target) and 'Project ID' is just an identifier --
# confirm against the data dictionary.
non_feature_cols = [
    target_col,
    'Project ID',
    'Task Duration (days)',
    'Start Date',
]

x = df.drop(columns=non_feature_cols)  # feature frame
y = df[target_col]                     # target series (hours)

In [None]:
# Features that are one-hot encoded further down.
categorical_cols = [
    "Project Name",
    "Project Type",
    "Task Name",
    "Contractor Type",
    "Location",
    "Complexity Level",
    "season",
]

# Every remaining feature column, in its original order, is treated as numeric.
# NOTE(review): nothing here checks the dtype -- a leftover text or date
# column in `x` would end up in this list and break the scaler.
is_numeric = ~x.columns.isin(categorical_cols)
numeric_cols = x.columns[is_numeric].tolist()

In [None]:
# One-hot encode categorical features
# handle_unknown='ignore' makes the fitted encoder emit an all-zero group for
# a category it did not see during fit instead of raising, so the same encoder
# can be reused on new rows at prediction time. The encoded training matrix is
# unaffected by this option.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Dense array with one column per (feature, category) pair.
x_cat = encoder.fit_transform(x[categorical_cols])

In [None]:
# Scale numeric features
# Learn the mean/std from the training rows only. Fitting on every row lets
# the test rows influence the preprocessing and leaks test-set statistics into
# training.
#
# train_test_split chooses rows from the number of samples and the seed alone,
# so calling it here on plain row positions with the same test_size and
# random_state as the train/test split cell selects exactly the same training
# rows. Keep these two arguments in sync with that cell.
train_rows, _ = train_test_split(np.arange(len(x)), test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(x[numeric_cols].iloc[train_rows])

# Transform all rows with the training statistics; the split cell slices the
# combined matrix into train and test afterwards.
x_num = scaler.transform(x[numeric_cols])

In [None]:
# Combine processed features
# Column-wise join: scaled numeric columns first, one-hot columns after.
# Anything transformed for prediction later must use this same order.
x_processed = np.concatenate((x_num, x_cat), axis=1)

In [None]:
# Train/test split
# 80 % of the rows go to training and 20 % are held out; the fixed
# random_state makes the partition repeatable across runs.
split_parts = train_test_split(
    x_processed,
    y,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts


In [None]:
# Define the model
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling repeat from run to run.
# NOTE(review): bit-exact repeatability on GPU may additionally need
# TensorFlow's op-determinism switch -- confirm if that matters here.
keras.utils.set_random_seed(42)

# Small fully connected regressor: two 128-unit ReLU layers, each with a light
# L2 weight penalty and followed by 20 % dropout, on top of a batch-norm
# layer applied to the raw input features.
model = keras.Sequential([
    layers.Input(shape=(x_train.shape[1],)),  # one input per processed feature column
    layers.BatchNormalization(),
    layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(1e-4)),
    layers.Dropout(0.2),
    layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(1e-4)),
    layers.Dropout(0.2),
    layers.Dense(1)  # Predict continuous duration in hours (the target is 'Task Duration (hours)')
])

In [None]:
# Optimise mean absolute error with Adam at a 1e-3 learning rate. MAE is also
# tracked as a metric so it is reported separately from the loss.
optimizer = keras.optimizers.Adam(1e-3)
model.compile(optimizer=optimizer, loss='mae', metrics=['mae'])

# Stop once the monitored quantity has not improved for 50 epochs and roll
# the model back to its best weights.
early_stopping = keras.callbacks.EarlyStopping(patience=50, restore_best_weights=True)

# Halve the learning rate after 20 epochs without improvement.
reduce_lr = keras.callbacks.ReduceLROnPlateau(patience=20, factor=0.5)

# Training callbacks, meant to be handed to model.fit.
cb = [early_stopping, reduce_lr]

In [None]:
# Train the model
# `callbacks=cb` wires in the EarlyStopping / ReduceLROnPlateau callbacks
# prepared above; without it they are never invoked and training always runs
# for the full 1000 epochs.
# NOTE(review): the callbacks monitor the validation loss, which is computed
# here on the test set. Choosing the stopping epoch on those rows makes the
# reported test error optimistic; consider holding out part of the training
# data for validation (e.g. `validation_split`) instead.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=1000,  # upper bound; early stopping normally ends training sooner
    batch_size=32,
    callbacks=cb,
)


In [None]:
# Evaluate the model
# evaluate() returns the loss followed by the compiled metric (MAE); both are
# in the target's unit, hours.
loss, mae_hours = model.evaluate(x_test, y_test, verbose=0)

# Report the error in days (24 hours per day).
mae = mae_hours / 24
print(f"Mean Absolute Error on test data: {mae:.2f} days")

In [None]:
# Feature values for one hypothetical task to score with the trained model.
# NOTE(review): the key "Total Cost (£" has no closing parenthesis. If the
# training column is spelled differently, this value is not used and the real
# column is back-filled with a default in step 4 -- confirm against the CSV
# header. The check after step 3 prints a warning when that happens.
user_data = {
    "Project Name": ["NLNG Plant Expansion"],
    "Project Type": ["Industrial"],
    "Task Name": ["Instrumentation"],
    "Contractor Type": ["General"],
    "Labor Count": [20],
    "Total Cost (£": [521840],  # Can include or drop depending on leakage concerns
    "Location": ["Calabar"],
    "Complexity Level": ["Medium"],
    "season": ["Raining season"],
}

# 1) Use an encoder that tolerates new/unseen categories.
#    It is re-fit on the same frame as the encoder used for training, so it
#    learns the same categories in the same order and produces the column
#    layout the model was trained on; handle_unknown='ignore' only changes
#    what happens for values that were never seen (all-zero group, no error).
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
x_cat = encoder.fit_transform(x[categorical_cols])

# 2) The numeric scaler is reused exactly as fitted earlier ('scaler').

# 3) Create a DataFrame for the new row
user_df = pd.DataFrame(user_data)

# Input columns that match no training feature are never selected below, so
# they would be dropped silently. Report them so that a misspelt key is
# noticed instead of quietly turning into a default value.
unexpected_cols = [col for col in user_df.columns if col not in x.columns]
if unexpected_cols:
    print(f"Warning: ignoring input columns not seen in training: {unexpected_cols}")

# 4) Make sure user_df has ALL columns expected by the preprocessing
#    (every entry of categorical_cols and numeric_cols).
#    A missing numeric column is filled with the mean the scaler learned for
#    it, which scales to 0 and is therefore neutral for the model. A raw 0
#    would usually lie far outside the range seen in training.
for idx, col in enumerate(numeric_cols):
    if col not in user_df.columns:
        user_df[col] = scaler.mean_[idx]  # scaler.mean_ follows the order of numeric_cols

for col in categorical_cols:
    if col not in user_df.columns:
        user_df[col] = ""  # empty string -> becomes all-zeros in one-hot

# 5) Apply the SAME transforms
user_x_num = scaler.transform(user_df[numeric_cols])
user_x_cat = encoder.transform(user_df[categorical_cols])

# 6) Combine in the SAME order as training (numeric first, then one-hot)
user_x = np.hstack([user_x_num, user_x_cat])

# 7) Predict
#    The model outputs hours. Convert to days with true division: truncating
#    with int() threw away the fractional day that the ".1f" format is meant
#    to display.
pred_hours = model.predict(user_x)[0, 0]
pred_days = pred_hours / 24
print(f"Predicted Task Duration: {pred_hours:.1f} hours")
print(f"Predicted Task Duration: {pred_days:.1f} days")