In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from epic_5_utils import *

In [3]:
# Open the database connection. connect_db is not defined in this notebook
# (presumably supplied by the star import from epic_5_utils); local=False
# presumably selects the non-local database -- TODO confirm in epic_5_utils.
conn = connect_db(local=False)
# Load the working DataFrame through the get_final_df helper, using that connection.
df = get_final_df(conn)

(68699, 5)

In [5]:
# Remove the 'fullDate' column first, then drop rows that are exact duplicates
# on the remaining columns.
# The chained, non-mutating form with errors='ignore' keeps this cell safe to
# re-run: the in-place version raised KeyError on a second execution because
# 'fullDate' had already been removed from df.
df = df.drop(columns=['fullDate'], errors='ignore').drop_duplicates()

In [6]:
# Preview the first rows of the cleaned frame (rich display as the cell output).
df.head()

Unnamed: 0,contactID,accountID,keyphrases,marketing_pressure
0,0000FEA0-AB47-E411-9EE6-005056B06EC4,169138A9-BF68-E111-B43A-00505680000A,"aanbod, andennes, architect, auto, automatisch...",-3
1,00029915-2042-E611-80D6-005056B06EC4,01029915-2042-E611-80D6-005056B06EC4,"gent, sleidinge, zaakvoerder",-3
2,0002DD39-E66E-E111-B43A-00505680000A,33A82B0F-C968-E111-B43A-00505680000A,"bestuurder, gedelegeerd, leiestreekmeetjesland...",-3
3,0003747A-026D-EB11-811C-001DD8B72B62,023B965B-C268-E111-B43A-00505680000A,"aanwez, afsprak, arn, band, bedrijf, besliss, ...",2
4,0006FCCF-6F1A-EA11-8109-001DD8B72B61,0106FCCF-6F1A-EA11-8109-001DD8B72B61,"sintniklaas, waasland, zaakvoerder",-4


In [7]:
# Show (rows, columns) of df as the cell output, as a quick sanity check.
df.shape

(68699, 4)

In [None]:
# Train a next-keyphrase model: given the keyphrase tokens of a row (all but
# the last one), predict a probability distribution over the vocabulary for
# the token that follows.
#
# NOTE(review): the previous version fitted the padded sequences against
# themselves, i.e. (batch, 100) targets for a (batch, vocab) output, which
# fails inside the loss with a rank mismatch. The next-token objective below
# is the smallest change that gives the model a well-defined target while
# keeping its "one probability per vocabulary word" output.
keyphrases = df['keyphrases'].tolist()

# Tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(keyphrases)

# The Tokenizer numbers words from 1 and pad_sequences fills with 0, so valid
# token ids are 0..len(word_index): one more than the number of words.
vocab_size = len(tokenizer.word_index) + 1

# Convert the text to sequences
sequences = tokenizer.texts_to_sequences(keyphrases)

# Pad the sequences
max_sequence_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Build (context, next token) training pairs. pad_sequences pads and truncates
# at the front by default, so in a window of max_sequence_length + 1 ids the
# last column holds the final token and the columns before it hold its context.
training_windows = pad_sequences(sequences, maxlen=max_sequence_length + 1)
# Keep only rows with at least two tokens (a non-padding id in the
# second-to-last column); shorter rows have no context to learn from.
has_context = training_windows[:, -2] != 0
context_sequences = training_windows[has_context, :-1]
next_tokens = training_windows[has_context, -1]

# Create the embedding layer
embedding_dim = 128
embedding_layer = Embedding(vocab_size, embedding_dim)

# Create the LSTM layer
lstm_units = 128
lstm_layer = LSTM(lstm_units)

# Create the dense layer
dense_units = 64
dense_layer = Dense(dense_units, activation='relu')

# Create the output layer: one softmax probability per token id
output_layer = Dense(vocab_size, activation='softmax')

# Compile the model
# The output layer already applies softmax, so the loss must consume
# probabilities; a *_with_logits loss would apply softmax a second time.
model = tf.keras.Sequential([embedding_layer, lstm_layer, dense_layer, output_layer])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(context_sequences, next_tokens, epochs=10)

# Save the model
model.save('recommendation_model.h5')

In [None]:
# Score every vocabulary word for a new item and print the ten most likely ones.
new_item_keyphrases = ["networking", "ceo", "medewerker", "nieuwjaar", "borrel"]

# Convert the keyphrases to sequences
# A list of tokens is taken by the tokenizer as one pre-split text, hence the extra [].
new_item_sequences = tokenizer.texts_to_sequences([new_item_keyphrases])

# Pad the sequences (same length as used when padding the training data)
new_item_padded_sequences = pad_sequences(new_item_sequences, maxlen=100)

# Predict the recommended items
# predict returns one row per input sequence; only one sequence was passed in,
# so the per-word probabilities are in row 0.
recommendations = model.predict(new_item_padded_sequences)
keyphrase_probabilities = recommendations[0]

# Pair each output unit with its word. Output unit i corresponds to token id i;
# ids without a word (0 is the padding id) are skipped.
scored_keyphrases = [
    (tokenizer.index_word[token_id], float(probability))
    for token_id, probability in enumerate(keyphrase_probabilities)
    if token_id in tokenizer.index_word
]

# Sort the recommendations by their probability
sorted_recommendations = sorted(scored_keyphrases, key=lambda pair: pair[1], reverse=True)

# Print the recommended items
for item, probability in sorted_recommendations[:10]:
    print(f"{item}: {probability:.3f}")
