In [20]:
import numpy as np
import pandas as pd
from keras import Input, Model
from keras.layers import Activation, Concatenate, Embedding, Reshape
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import GRU, Dense

# Read the raw event log and keep only the columns used by the pipeline
selected_cols = ['user_id_hashed', 'dt', 'selected_track_id', 'selected_project_id', 'step_id', 'action']
data = pd.read_csv('internship_assignment.csv')
data = data[selected_cols]

# Distinct step ids in ascending order, and how many of them there are
ids_step = sorted(data['step_id'].unique())
num_unique_ids = data['step_id'].nunique()

# Re-index the step ids to consecutive integers 0 .. len(ids_step)-1
id_step_map = dict(zip(ids_step, range(len(ids_step))))

# Overwrite 'step_id' in place with its consecutive index
data['step_id'] = data['step_id'].map(id_step_map)

# Turn every action label into an integer class id; the fitted encoder is
# kept so class ids can be converted back to labels later
label_encoder = LabelEncoder().fit(data['action'])
data['action'] = label_encoder.transform(data['action'])

# Expand the integer class ids into one indicator column per class
# ('action_<class id>')
data = pd.get_dummies(data, columns=['action'])

# Window geometry shared by training and evaluation: the model reads
# INPUT_STEPS consecutive events of one user and predicts the actions of the
# OUTPUT_STEPS events that follow; windows start every WINDOW_STRIDE rows.
INPUT_STEPS = 10
OUTPUT_STEPS = 5
WINDOW_STRIDE = 10

# Columns that identify or timestamp a row and are never fed to the model
NON_FEATURE_COLS = ['user_id_hashed', 'dt', 'selected_track_id', 'selected_project_id']


def build_sequences(frame, action_cols, input_steps=INPUT_STEPS,
                    output_steps=OUTPUT_STEPS, stride=WINDOW_STRIDE):
    """Slice every user's rows into (step inputs, action inputs, targets) windows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows containing 'user_id_hashed', the one-hot action columns and the
        remaining feature columns. Rows of a user are used in frame order.
    action_cols : list of str
        Names of the one-hot encoded action columns.
    input_steps, output_steps : int
        Number of rows in the input part and in the target part of a window.
    stride : int
        Distance in rows between the starts of consecutive windows.

    Returns
    -------
    list of tuple
        One ``(inputs_step, inputs_action, outputs)`` triple of float32 arrays
        per window, holding ``input_steps``, ``input_steps`` and
        ``output_steps`` rows respectively. Users with fewer than
        ``input_steps + output_steps`` rows contribute no window.
    """
    excluded = set(NON_FEATURE_COLS) | set(action_cols)
    input_cols = [col for col in frame.columns if col not in excluded]

    sequences = []
    for _, user_data in frame.groupby('user_id_hashed'):
        step_values = user_data[input_cols].to_numpy(dtype='float32')
        action_values = user_data[action_cols].to_numpy(dtype='float32')

        # Last start index at which a full input + target window still fits
        last_start = len(user_data) - (input_steps + output_steps)
        for start in range(0, last_start + 1, stride):
            split = start + input_steps
            sequences.append((
                step_values[start:split],
                action_values[start:split],
                action_values[split:split + output_steps],
            ))
    return sequences


# Split data into training and testing sets (chronological, no shuffling)
train_data, test_data = train_test_split(data, test_size=0.3, shuffle=False)

# Get a list of action columns
action_cols = [col for col in train_data.columns if col.startswith('action_')]
num_actions = len(action_cols)

# Create input/output sequences for training data
train_sequences = build_sequences(train_data, action_cols)

# Train the model; `model` stays None when there is nothing to train on so
# that the evaluation below can report it instead of raising NameError
model = None
if train_sequences:
    # NOTE(review): X_train has a trailing feature axis of size 1 while the
    # step input is declared as (INPUT_STEPS,) - presumably Keras squeezes it;
    # confirm on the installed version.
    X_train = np.stack([seq[0] for seq in train_sequences])
    X_action_train = np.stack([seq[1] for seq in train_sequences])
    y_train = np.stack([seq[2] for seq in train_sequences])

    # Create GRU model
    input_step = Input(shape=(INPUT_STEPS,))
    step_embedding = Embedding(input_dim=num_unique_ids, output_dim=128, input_length=INPUT_STEPS)(input_step)
    input_action = Input(shape=(INPUT_STEPS, num_actions))
    action_dense = Dense(64, activation='relu')(input_action)
    concatenated = Concatenate()([step_embedding, action_dense])
    gru_layer = GRU(64)(concatenated)
    output_layer = Dense(OUTPUT_STEPS * num_actions)(gru_layer)
    reshaped_output = Reshape((OUTPUT_STEPS, num_actions))(output_layer)
    # Softmax is applied after the reshape so that each of the OUTPUT_STEPS
    # rows is its own probability distribution over the actions
    step_probabilities = Activation('softmax')(reshaped_output)

    model = Model(inputs=[input_step, input_action], outputs=step_probabilities)

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    # Train the model
    model.fit([X_train, X_action_train], y_train, epochs=15, shuffle=False)
else:
    print("No training data")

# Create input/output sequences for testing data
test_sequences = build_sequences(test_data, action_cols)

# Evaluate the model on the test data
if not test_sequences:
    print("Error: no test data found")
elif model is None:
    print("Error: no trained model to evaluate")
else:
    X_test = np.stack([seq[0] for seq in test_sequences])
    X_action_test = np.stack([seq[1] for seq in test_sequences])
    y_test = np.stack([seq[2] for seq in test_sequences])

    # A single batched call: all windows have the same shape, so the reported
    # loss/accuracy are the averages over the test windows
    num_samples = len(test_sequences)
    average_loss, average_accuracy = model.evaluate([X_test, X_action_test], y_test)
    print(f'Average loss: {average_loss:.4f}, Average accuracy: {average_accuracy:.4f}')

1.2.2


KeyboardInterrupt: 

In [16]:
import csv

# Number of most recent events fed to the model per user; must equal the
# input window length the model was built with
HISTORY_STEPS = 10

# Feature columns = everything that is neither metadata nor a one-hot action
feature_cols = data.columns.drop(
    ['user_id_hashed', 'selected_project_id', 'selected_track_id', 'dt'] + action_cols
)

# Create input sequences for all users. `user_ids` records the exact order in
# which the sequences are built so predictions are paired with the right user;
# sort=False keeps users in order of first appearance in `data`.
user_ids = []
all_sequences = []
for user_id, user_data in data.groupby('user_id_hashed', sort=False):
    inputs = user_data[feature_cols].values.astype('float32')[-HISTORY_STEPS:]
    inputs_action = user_data[action_cols].values.astype('float32')[-HISTORY_STEPS:]

    # Users with a short history would produce ragged arrays that cannot be
    # batched; left-pad them by repeating their earliest available event.
    # NOTE(review): padded windows never occur in the training windows -
    # confirm this heuristic is acceptable for the submission.
    missing = HISTORY_STEPS - len(inputs)
    if missing > 0:
        inputs = np.pad(inputs, ((missing, 0), (0, 0)), mode='edge')
        inputs_action = np.pad(inputs_action, ((missing, 0), (0, 0)), mode='edge')

    user_ids.append(user_id)
    all_sequences.append((inputs, inputs_action))

input_step = np.stack([seq[0] for seq in all_sequences])
input_action = np.stack([seq[1] for seq in all_sequences])

# Make predictions using the trained model
all_predictions = model.predict([input_step, input_action])

# Decode the predicted actions using label encoding: the most probable class
# of every predicted step is mapped back to its original action label.
# Assumes action_cols are ordered by class id (action_0, action_1, ...) -
# TODO confirm.
decoded_predictions = [
    label_encoder.inverse_transform(np.argmax(user_probabilities, axis=1))
    for user_probabilities in all_predictions
]

# Extract the 5 next predicted actions for each user
next_actions = dict(zip(user_ids, decoded_predictions))

# Open a new CSV file for writing
with open('submission.csv', 'w', newline='') as csvfile:

    # Create a CSV writer object
    csvwriter = csv.writer(csvfile)

    # Write the header row
    csvwriter.writerow(['user_id_hashed', 'actions'])

    # Write the predicted next actions for each user to the CSV file;
    # str() keeps the join valid even when the labels are not strings
    for user_id, user_next_actions in next_actions.items():
        user_next_actions_str = ' '.join(map(str, user_next_actions))
        csvwriter.writerow([user_id, user_next_actions_str])

