In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras import Model

from sklearn.metrics import accuracy_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Directory where the trained model is meant to be written.
# It must already exist on the local machine.
savemodelpath = "datasets/"

# Location of the CSV file that holds the dataset.
filepath = "datasets/data_globant_clean.csv"

# Load the whole CSV into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer=filepath)

In [None]:
# Quick look at the first five rows of the loaded frame.
dataset.head(n=5)

In [None]:
# Names and edges of the three engagement bands. With include_lowest=True the
# intervals are [0.0, 1.5], (1.5, 3.5] and (3.5, 5.0].
labels_personalizados = ["Bajo", "Medio", "Alto"]
bins_personalizados = [0.0, 1.5, 3.5, 5.0]

dataset = dataset.assign(
    # One integer code per distinct value of "Name".
    EmployeeID=dataset["Name"].astype("category").cat.codes,
    # Engagement score mapped onto the bands above; scores that fall outside
    # the bin edges (or are missing) come out as NaN.
    Engagement_D=pd.cut(
        dataset["Engagement"],
        bins=bins_personalizados,
        labels=labels_personalizados,
        include_lowest=True,
    ),
)
dataset.head(10)

In [None]:
# Discard the raw engagement score and the name column, which were replaced by
# "Engagement_D" and "EmployeeID" respectively, together with "Email Leader".
dataset = dataset.drop(["Engagement", "Name", "Email Leader"], axis="columns")
dataset.head()

In [None]:
# Column the model is trained to predict (the binned engagement level).
target = "Engagement_D"

# Model inputs, in the order they will appear along the last axis of every
# window built from the dataset.
features = [
    # Employee code plus the columns that are min-max scaled in this notebook.
    "EmployeeID", "Seniority", "Month", "Day",
    # Columns that are label-encoded in this notebook.
    "Position", "Location", "Studio",
    "Client Tag", "Project Tag", "Team Name",
]

In [None]:
# Columns that get replaced by integer codes.
categorical_cols = [
    "Position", "Location", "Studio",
    "Client Tag", "Project Tag", "Team Name",
]

# One independent encoder per column, kept in a dict keyed by column name so
# the fitted label <-> code mapping of each column stays available.
encoders = {column: LabelEncoder() for column in categorical_cols}

for column, column_encoder in encoders.items():
    # Fit on the column and overwrite it with the resulting integer codes.
    dataset[column] = column_encoder.fit_transform(dataset[column])

In [None]:
# Turn the engagement band into integer class ids. LabelEncoder numbers the
# classes in sorted label order, so with the three bands present the mapping
# is Alto=0, Bajo=1, Medio=2.
# NOTE(review): astype(str) presumably turns any missing band into the literal
# string "nan", which would then be encoded as an extra class - confirm the
# column has no missing values.
encoder_target = LabelEncoder()
engagement_text = dataset["Engagement_D"].astype(str)
dataset["Engagement_D"] = encoder_target.fit_transform(engagement_text)

In [None]:
# Parse "Date" (day, abbreviated month name, two-digit year, with no
# separators) and order the rows chronologically within each employee.
dataset = (
    dataset
    .assign(Date=pd.to_datetime(dataset["Date"], format="%d%b%y"))
    .sort_values(by=["EmployeeID", "Date"])
)
dataset.head()

In [None]:
# Numeric columns rescaled to the [0, 1] range (MinMaxScaler's default).
num_cols = ["Seniority", "Month", "Day"]

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/validation/test split made later in the notebook, so the value ranges
# of the held-out rows influence the scaling.
scaler = MinMaxScaler()
scaler.fit(dataset[num_cols])
dataset[num_cols] = scaler.transform(dataset[num_cols])

In [None]:
# Show the dtype of every model input column.
print(dataset.loc[:, features].dtypes)


In [None]:
def create_windows(df, features, target, window=14, group_col="EmployeeID"):
    """Build fixed-length sliding windows per group for next-step prediction.

    For every set of rows sharing the same ``group_col`` value, each run of
    ``window`` consecutive rows (in the order they appear in ``df``) becomes
    one sample, and the ``target`` value of the row immediately after that run
    becomes its label. Groups with ``window`` rows or fewer yield no samples.

    The function does not sort anything: ``df`` must already be ordered
    chronologically within each group.

    Args:
        df: DataFrame containing ``group_col``, every column in ``features``
            and the ``target`` column.
        features: List of column names used as model inputs.
        target: Name of the column holding the label.
        window: Number of consecutive rows per sample.
        group_col: Column identifying the entity (employee) a row belongs to.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n_samples, window, len(features))`` and ``y`` has shape
        ``(n_samples,)``. When no group is long enough, ``X`` is an empty
        array of shape ``(0, window, len(features))`` and ``y`` is empty.
    """
    X, y = [], []

    # A single groupby pass replaces re-scanning the whole frame with a boolean
    # mask once per employee. sort=False keeps groups in order of first
    # appearance, i.e. the same order .unique() would give.
    for _, emp_data in df.groupby(group_col, sort=False):
        n_rows = len(emp_data)
        if n_rows <= window:
            continue  # not enough rows to form one window plus its label

        f_vals = emp_data[features].to_numpy()
        t_vals = emp_data[target].to_numpy()

        for start in range(n_rows - window):
            X.append(f_vals[start:start + window])
            # Label is the target of the row right after the window.
            y.append(t_vals[start + window])

    if not X:
        # Keep the 3-D layout even when nothing qualifies, so downstream
        # lookups such as X.shape[1] / X.shape[2] still work.
        return np.empty((0, window, len(features))), np.empty((0,))

    return np.array(X), np.array(y)

# Build the samples: 14 consecutive rows per window, labelled with the
# engagement class of the row that follows.
X, y = create_windows(df=dataset, features=features, target=target, window=14)

# Sanity check of the resulting array shapes.
print(X.shape, y.shape)

In [None]:
# Fractions of the samples held out for testing and validation.
test_size = 0.10
val_size = 0.10

n = len(X)

# Positional cut points: [0, train_end) is train, [train_end, val_end) is
# validation and [val_end, n) is test. Nothing is shuffled.
# NOTE(review): the windows are generated employee by employee, so these
# partitions presumably hold different employees rather than later periods.
train_end = int(n * (1 - test_size - val_size))  # 80%
val_end = int(n * (1 - test_size))               # 90%

# Features and labels are cut at the same indices so they stay aligned.
X_train, X_val, X_test = np.split(X, [train_end, val_end])
y_train, y_val, y_test = np.split(y, [train_end, val_end])



In [None]:
# Plot the label sequence of the train and test partitions, one figure each.
for _labels, _title in ((y_train, "Conjunto de train"), (y_test, "Conjunto de test")):
    plt.plot(_labels)
    plt.xlabel("Tiempo")
    plt.ylabel("Engagement")
    plt.title(_title)
    plt.show()

In [None]:
# Reshape every window from (window, features) to
# (subsequences, timesteps, features): the window is cut into `subsequences`
# equal chunks that the TimeDistributed convolutional layers process one by one.
# This cell overwrites X_train / X_val / X_test, so it is not re-runnable on
# already reshaped arrays; re-run the split cell first.
subsequences = 2

window_len = X_train.shape[1]
if window_len % subsequences != 0:
    raise ValueError(
        f"Window length {window_len} is not divisible by subsequences={subsequences}"
    )

timesteps = window_len // subsequences
features_n = X_train.shape[2]

# Each partition is reshaped with its OWN sample count. The validation set was
# previously reshaped with the test set's count, which fails whenever the two
# partitions differ in length.
X_train = X_train.reshape((X_train.shape[0], subsequences, timesteps, features_n))
X_test  = X_test.reshape((X_test.shape[0], subsequences, timesteps, features_n))
X_val   = X_val.reshape((X_val.shape[0], subsequences, timesteps, features_n))

print(X_train.shape, X_test.shape)


In [None]:
# Import from tensorflow.keras, like the rest of the notebook and the
# optimizer below. Mixing the standalone `keras` package with `tf.keras`
# objects can fail depending on the installed versions.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import TimeDistributed, Conv1D, MaxPooling1D, Flatten, LSTM, Dense

# CNN-LSTM classifier: the same small 1-D CNN is applied to every subsequence
# (via TimeDistributed), and an LSTM then reads the per-subsequence feature
# vectors in order.
model = Sequential()

# Input per sample: (subsequences, timesteps, features_n).
model.add(TimeDistributed(
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    input_shape=(subsequences, timesteps, features_n)
))

model.add(TimeDistributed(
    Conv1D(filters=64, kernel_size=2, activation='relu')
))

# Halve the remaining time axis, then flatten each subsequence to one vector.
model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
model.add(TimeDistributed(Flatten()))

model.add(LSTM(50, activation='relu'))
# Three output units, one per engagement band (Bajo / Medio / Alto).
model.add(Dense(3, activation='softmax'))

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.005),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [None]:
# Train for 50 epochs in mini-batches of 32, scoring the validation partition
# after every epoch. The returned History object is kept for the plots.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    verbose=1,
    validation_data=(X_val, y_val),
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Score the trained model on the held-out test partition. evaluate() yields
# the loss followed by the single 'accuracy' metric the model was compiled with.
test_metrics = model.evaluate(X_test, y_test)
test_loss, test_acc = test_metrics
print("Test accuracy:", test_acc)