#### Task 1
 Set up and Data preparation
 

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


#Load a part of the data to make things faster
df = pd.read_csv("../data/transactions_train.csv").head(500000)

#Preprocessing the IDS into continuous integer codes that the embedding layer can use.

#creating user and item encoders
user_encoder = LabelEncoder()
article_encoder = LabelEncoder()

# Fit and transform the IDs to integer codes
df['user_code'] = user_encoder.fit_transform(df['customer_id'])
df['article_code'] = article_encoder.fit_transform(df['article_id'])

# Get the number of unique users and articles
n_users = df['user_code'].nunique()
n_articles = df['article_code'].nunique()

print(f"Number of unique users: {n_users}")
print(f"Number of unique articles: {n_articles}")


#Creating Training Data
# Positive examples (the ones we have)
X = df[['user_code', 'article_code']].values #to create numpy array of the user and the item beside
y = np.ones(len(df)) # All are purchases, so target is 1,an array of 1's as the target

# In a full project, we would add "negative samples" here (target=0) so that we can allow the model know or understand when the user didn't buy 
# For today, we will skip this step to focus on the model architecture.







Number of unique users: 119904
Number of unique articles: 24942


#### Task 2
Building the Model Architecture

In [10]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Define embedding size
embedding_dim = 32

# --- User Input Pathway ---
user_input = layers.Input(shape=(1,), name='user_input')
user_embedding = layers.Embedding(n_users, embedding_dim, name='user_embedding')(user_input)
user_vec = layers.Flatten(name='flatten_user')(user_embedding)

# --- Article Input Pathway ---
article_input = layers.Input(shape=(1,), name='article_input')
article_embedding = layers.Embedding(n_articles, embedding_dim, name='article_embedding')(article_input)
article_vec = layers.Flatten(name='flatten_article')(article_embedding)

# --- Combine & Create Model ---
concatenated = layers.Concatenate()([user_vec, article_vec])
dense_1 = layers.Dense(128, activation='relu')(concatenated)
dense_2 = layers.Dense(64, activation='relu')(dense_1)
output = layers.Dense(1, activation='sigmoid')(dense_2) # Sigmoid for 0-1 probability

model = keras.Model(inputs=[user_input, article_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


Generating Negative samples

In [11]:
# Ensure you've run the code from yesterday to create df, user_encoder, etc.

# Create a set of all unique article codes for fast lookup
all_article_codes = set(df['article_code'].unique())

# Create a dictionary of items purchased by each user
user_purchases = df.groupby('user_code')['article_code'].apply(set)

neg_samples = []
for user_code in df['user_code'].unique():
    # Get the set of articles this user has purchased
    purchased_articles = user_purchases[user_code]

    # Find articles the user has NOT purchased
    unpurchased_articles = all_article_codes - purchased_articles

    # Randomly sample N negative items, where N is the number of positive items
    num_positives = len(purchased_articles)
    neg_items = np.random.choice(list(unpurchased_articles), size=num_positives, replace=False)

    for item_code in neg_items:
        neg_samples.append([user_code, item_code, 0]) # user, item, target=0

# Create the negative samples DataFrame
neg_df = pd.DataFrame(neg_samples, columns=['user_code', 'article_code', 'target'])

# Get positive samples from the original dataframe
pos_df = df[['user_code', 'article_code']]
pos_df['target'] = 1

# Concatenate positive and negative DataFrames
final_df = pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

print(f"Total training samples: {len(final_df)}")
display(final_df.head())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_df['target'] = 1


Total training samples: 947349


Unnamed: 0,user_code,article_code,target
0,43742,6709,1
1,51310,8760,1
2,111787,596,0
3,82086,8823,0
4,79752,1603,0


Training the model

In [None]:
X_train = [final_df['user_code'], final_df['article_code']]
y_train = final_df['target']

# (Re-run the cell where you defined and compiled your model architecture)

print("Starting model training on balanced data...")
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=5,       # Keep epochs low for today's exercise
    batch_size=256,
    validation_split=0.1, # Use 10% of the data for validation
    verbose=1
)
print("Training finished.")


### Strategy for Generating Recommendations

To get the top 10 recommendations for a specific user (`user_X`):

1.  **Identify Candidate Items**: Get a list of all unique article codes that the user has *not* previously purchased.
2.  **Create User-Item Pairs**: Create a DataFrame with two columns: one repeating `user_X`'s code, and the other containing all the candidate article codes.
3.  **Predict Probabilities**: Use `model.predict()` on these pairs. The model will output a "purchase probability" for each candidate item.
4.  **Rank and Select**: Sort the articles based on their predicted probability in descending order.
5.  **Return Top-K**: The top 10 articles from this sorted list are the final recommendations.
