In [3]:
import pandas as pd

#load the transaction data

transactions_df = pd.read_csv('../data/transactions_train.csv')


#We only need the article_id column for now
transactions_df['article_id'] = transactions_df['article_id'].astype(str)


# Load article data to get item names
articles_df = pd.read_csv('../data/articles.csv')
articles_df['article_id'] = articles_df['article_id'].astype(str)


# Count the occurrences of each article_id
popularity_counts = transactions_df['article_id'].value_counts()


# Get the top 10 most popular article IDs
top_10_popular_ids = popularity_counts.head(10).index.tolist()

print("Top 10 most purchased article IDs:")
print(top_10_popular_ids)

# Filter the articles DataFrame to get details of the top 10 items
top_10_articles_details = articles_df[articles_df['article_id'].isin(top_10_popular_ids)]

print("\nDetails of the top 10 most popular articles:")

display(top_10_articles_details[['article_id', 'prod_name', 'product_type_name', 'product_group_name']])


def get_top_n_popular_articles(transactions_data, articles_data, n=10):
    """
    Finds the top N most popular articles based on transaction counts.

    Args:
        transactions_data (pd.DataFrame): DataFrame with transaction history.
        articles_data (pd.DataFrame): DataFrame with article details.
        n (int): The number of top articles to return.

    Returns:
     pd.DataFrame: A DataFrame with the details of the top N articles.
    """
    # Count purchases for each article
    popularity = transactions_data['article_id'].value_counts()

    # Get the top N article IDs
    top_n_ids = popularity.head(n).index.tolist()

    # Get the details from the articles dataframe
    top_n_details = articles_data[articles_data['article_id'].isin(top_n_ids)]

    return top_n_details

# Test your function
print("--- Testing the function for the top 5 items ---")
display(get_top_n_popular_articles(transactions_df, articles_df, n=5))





Top 10 most purchased article IDs:
['706016001', '706016002', '372860001', '610776002', '759871002', '464297007', '372860002', '610776001', '399223001', '706016003']

Details of the top 10 most popular articles:


Unnamed: 0,article_id,prod_name,product_type_name,product_group_name
1713,372860001,7p Basic Shaftless,Socks,Socks & Tights
1714,372860002,7p Basic Shaftless,Socks,Socks & Tights
2236,399223001,Curvy Jeggings HW Ankle,Trousers,Garment Lower body
3711,464297007,Greta Thong Mynta Low 3p,Underwear bottom,Underwear
24836,610776001,Tilly (1),T-shirt,Garment Upper body
24837,610776002,Tilly (1),T-shirt,Garment Upper body
53892,706016001,Jade HW Skinny Denim TRS,Trousers,Garment Lower body
53893,706016002,Jade HW Skinny Denim TRS,Trousers,Garment Lower body
53894,706016003,Jade HW Skinny Denim TRS,Trousers,Garment Lower body
70221,759871002,Tilda tank,Vest top,Garment Upper body


--- Testing the function for the top 5 items ---


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
1713,372860001,372860,7p Basic Shaftless,302,Socks,Socks & Tights,1010016,Solid,9,Black,...,Shopbasket Socks,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Fine-knit trainer socks in a soft cotton blend.
24837,610776002,610776,Tilly (1),255,T-shirt,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,T-shirt in lightweight jersey with a rounded h...
53892,706016001,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,9,Black,...,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...
53893,706016002,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,71,Light Blue,...,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...
70221,759871002,759871,Tilda tank,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,EQ Divided Basics,D,Divided,2,Divided,80,Divided Complements Other,1002,Jersey Basic,"Cropped, fitted top in cotton jersey with narr..."


In [6]:
#
# Step 2: Merge product_type info into transactions
merged_df = transactions_df.merge(articles_df[['article_id', 'product_type_name']], on='article_id', how='left')

# Step 3: Count popularity within each product type
popularity_by_type = merged_df.groupby(['product_type_name', 'article_id']).size().reset_index(name='count')

# Step 4: Pick top article in each product type
idx = popularity_by_type.groupby('product_type_name')['count'].idxmax()
top_articles_by_type = popularity_by_type.loc[idx]

# Step 5: Add full article details
final_result = top_articles_by_type.merge(articles_df, on='article_id', how='left')

# Display results
display(final_result[['article_id', 'product_type_name_x', 'prod_name', 'count']])

# display(final_result)

Unnamed: 0,article_id,product_type_name_x,prod_name,count
0,858306003,Accessories set,Esther set 2pcs,14
1,563993001,Alice band,HW Devil aliceband,343
2,757884002,Baby Bib,DIDIER BASIC 6PACK,74
3,682238003,Backpack,NAV MINI BACK PACK,1068
4,639448001,Bag,Day tote,3982
...,...,...,...,...
125,808698004,Wedge,Ninia espadrille,1732
126,753475001,Weekend/Gym bag,MINDY SHOPPER,895
127,916926003,Wireless earphone case,Air pod cases,175
128,850239001,Wood balls,TC - cederwood rings,35


POPULARITY-BASED PROBABILISTIC MODEL

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

subset_df = articles_df.head(5000).copy()
# Handle potential missing values in the product name
subset_df['prod_name'] = subset_df['prod_name'].fillna("")

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the product names to a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(subset_df['prod_name'])

# Check the shape of the matrix
print(f"Shape of TF-IDF matrix: {tfidf_matrix.shape}")


# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

print(f"Shape of Cosine Similarity matrix: {cosine_sim_matrix.shape}")









Shape of TF-IDF matrix: (5000, 1381)
Shape of Cosine Similarity matrix: (5000, 5000)


In [None]:
def get_content_based_recommendations(article_id, articles_data, cosine_sim, n=10):
    """
    Gets top N similar articles for a given article_id.

    Args:
        article_id (str): The ID of the article to find recommendations for.
        articles_data (pd.DataFrame): DataFrame with all article details.
        cosine_sim (np.array): The cosine similarity matrix.
        n (int): The number of recommendations to return.

    Returns:
        pd.DataFrame: A DataFrame with the top N most similar articles.
    """
    # Create a mapping from article_id to DataFrame index
    indices = pd.Series(articles_data.index, index=articles_data['article_id'])

    # Get the index of the article that matches the ID
    idx = indices[article_id]

    # Get the pairwise similarity scores of all articles with that article
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the articles based on the similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the N most similar articles (excluding the item itself)
    sim_scores = sim_scores[1:n+1]

    # Get the article indices
    article_indices = [i[0] for i in sim_scores]

    # Return the top N most similar articles
    return articles_data.iloc[article_indices]

# --- Test your function ---
# Pick a random article ID to test
test_article_id = '0706016001' 
print(f"--- Recommendations for article {test_article_id} ---")
recommendations = get_content_based_recommendations(test_article_id, articles_df, cosine_sim_matrix)
display(recommendations[['article_id', 'prod_name', 'product_type_name']])