POPULARITY BASED MODEL

In [1]:
import pandas as pd

#load the transaction data

transactions_df = pd.read_csv('../data/transactions_train.csv')


#We only need the article_id column for now
transactions_df['article_id'] = transactions_df['article_id'].astype(str)


# Load article data to get item names
articles_df = pd.read_csv('../data/articles.csv')
articles_df['article_id'] = articles_df['article_id'].astype(str)


# Count the occurrences of each article_id
popularity_counts = transactions_df['article_id'].value_counts()


# Get the top 10 most popular article IDs
top_10_popular_ids = popularity_counts.head(10).index.tolist()

print("Top 10 most purchased article IDs:")
print(top_10_popular_ids)

# Filter the articles DataFrame to get details of the top 10 items
top_10_articles_details = articles_df[articles_df['article_id'].isin(top_10_popular_ids)]

print("\nDetails of the top 10 most popular articles:")

display(top_10_articles_details[['article_id', 'prod_name', 'product_type_name', 'product_group_name']])


def get_top_n_popular_articles(transactions_data, articles_data, n=10):
    """
    Finds the top N most popular articles based on transaction counts.

    Args:
        transactions_data (pd.DataFrame): DataFrame with transaction history.
        articles_data (pd.DataFrame): DataFrame with article details.
        n (int): The number of top articles to return.

    Returns:
     pd.DataFrame: A DataFrame with the details of the top N articles.
    """
    # Count purchases for each article
    popularity = transactions_data['article_id'].value_counts()

    # Get the top N article IDs
    top_n_ids = popularity.head(n).index.tolist()

    # Get the details from the articles dataframe
    top_n_details = articles_data[articles_data['article_id'].isin(top_n_ids)]

    return top_n_details

# Test your function
print("--- Testing the function for the top 5 items ---")
display(get_top_n_popular_articles(transactions_df, articles_df, n=5))





0           000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...
1           000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...
2           00007d2de826758b65a93dd24ce629ed66842531df6699...
3           00007d2de826758b65a93dd24ce629ed66842531df6699...
4           00007d2de826758b65a93dd24ce629ed66842531df6699...
                                  ...                        
31788319    fff2282977442e327b45d8c89afde25617d00124d0f999...
31788320    fff2282977442e327b45d8c89afde25617d00124d0f999...
31788321    fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...
31788322    fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...
31788323    fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...
Name: customer_id, Length: 31788324, dtype: object

Top 10 most purchased article IDs:
['706016001', '706016002', '372860001', '610776002', '759871002', '464297007', '372860002', '610776001', '399223001', '706016003']

Details of the top 10 most popular articles:


Unnamed: 0,article_id,prod_name,product_type_name,product_group_name
1713,372860001,7p Basic Shaftless,Socks,Socks & Tights
1714,372860002,7p Basic Shaftless,Socks,Socks & Tights
2236,399223001,Curvy Jeggings HW Ankle,Trousers,Garment Lower body
3711,464297007,Greta Thong Mynta Low 3p,Underwear bottom,Underwear
24836,610776001,Tilly (1),T-shirt,Garment Upper body
24837,610776002,Tilly (1),T-shirt,Garment Upper body
53892,706016001,Jade HW Skinny Denim TRS,Trousers,Garment Lower body
53893,706016002,Jade HW Skinny Denim TRS,Trousers,Garment Lower body
53894,706016003,Jade HW Skinny Denim TRS,Trousers,Garment Lower body
70221,759871002,Tilda tank,Vest top,Garment Upper body


--- Testing the function for the top 5 items ---


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
1713,372860001,372860,7p Basic Shaftless,302,Socks,Socks & Tights,1010016,Solid,9,Black,...,Shopbasket Socks,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Fine-knit trainer socks in a soft cotton blend.
24837,610776002,610776,Tilly (1),255,T-shirt,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,T-shirt in lightweight jersey with a rounded h...
53892,706016001,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,9,Black,...,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...
53893,706016002,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,71,Light Blue,...,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...
70221,759871002,759871,Tilda tank,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,EQ Divided Basics,D,Divided,2,Divided,80,Divided Complements Other,1002,Jersey Basic,"Cropped, fitted top in cotton jersey with narr..."


FUNCTION FOR POPULARITY-BASED PROBABILISTIC MODEL ABOVE

In [20]:

def get_top_n_popular_articles_with_probability(transactions_df,n):
    #to count purchases per article
    article_counts = transactions_df['article_id'].value_counts().reset_index()
    # display(article_counts)
    #
    # article_counts.columns = ['article_id','count']
    
    #Total number of purchases
    total_purchases =article_counts['count'].sum()

    #Calculate probability for each article being chosen by customers
    article_counts['probability'] = article_counts['count']/total_purchases
    return article_counts.head(n)
display(get_top_n_popular_articles(transactions_df,n=10))

Unnamed: 0,article_id,count,probability
0,706016001,50287,0.001582
1,706016002,35043,0.001102
2,372860001,31718,0.000998
3,610776002,30199,0.00095
4,759871002,26329,0.000828
5,464297007,25025,0.000787
6,372860002,24458,0.000769
7,610776001,22451,0.000706
8,399223001,22236,0.0007
9,706016003,21241,0.000668


VECTORIZATION AND COSINE SIMILARITY

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

subset_df = articles_df.head(5000).copy()
# Handle potential missing values in the product name
subset_df['prod_name'] = subset_df['prod_name'].fillna("")

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the product names to a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(subset_df['prod_name'])

# Check the shape of the matrix
print(f"Shape of TF-IDF matrix: {tfidf_matrix.shape}")


# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

print(f"Shape of Cosine Similarity matrix: {cosine_sim_matrix.shape}")









Shape of TF-IDF matrix: (5000, 1381)
Shape of Cosine Similarity matrix: (5000, 5000)


CONTENT BASED RECOMMENDATION MODEL

In [23]:

def get_content_based_recommendations(article_id, articles_data, cosine_sim, n=10):
    """
    Gets top N similar articles for a given article_id.

    Args:
        article_id (str): The ID of the article to find recommendations for.
        articles_data (pd.DataFrame): DataFrame with all article details.
        cosine_sim (np.array): The cosine similarity matrix.
        n (int): The number of recommendations to return.

    Returns:
        pd.DataFrame: A DataFrame with the top N most similar articles.
    """
    # Create a mapping from article_id to DataFrame index
    indices = pd.Series(articles_data.index, index=articles_data['article_id'])

    # Get the index of the article that matches the ID
    idx = indices[article_id]
    
    # Get the pairwise similarity scores of all articles with that article
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the articles based on the similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the N most similar articles (excluding the item itself)
    sim_scores = sim_scores[1:n+1]
    

    # Get the article indices
    article_indices = [i[0] for i in sim_scores]

    # Return the top N most similar articles
    return articles_data.iloc[article_indices]

# --- Test your function ---
# Pick a random article ID to test
test_article_id = '372860001' 
print(f"--- Recommendations for article {test_article_id} ---")
recommendations = get_content_based_recommendations(test_article_id, subset_df, cosine_sim_matrix)
display(recommendations[['article_id', 'prod_name', 'product_type_name']])

--- Recommendations for article 372860001 ---


Unnamed: 0,article_id,prod_name,product_type_name
1714,372860002,7p Basic Shaftless,Socks
1715,372860024,Basic 7p Shaftless,Socks
1716,372860041,Basic 7p Shaftless,Socks
1717,372860043,Basic 7p Shaftless,Socks
1718,372860051,Basic 7p Shaftless,Socks
1719,372860052,Basic 7p Shaftless,Socks
1720,372860062,Basic 7p Shaftless,Socks
1721,372860063,Basic 7p Shaftless,Socks
1722,372860068,Basic 7p Shaftless,Socks
1723,372860069,Basic 7p Shaftless,Socks


FUNCTION FOR HYBRID RECOMMENDATION MODEL CONTAINING POPULARITY BASED RECOMMENDATION MODEL AND CONTENT BASED RECOMMENDATION MODEL

In [26]:
def get_hybrid_recommendations(article_id, transactions_data, articles_data, cosine_sim, n=10):
    """
    Generates a hybrid list of recommendations.

    Args:
        article_id (str): The ID of the article the user is viewing.
        transactions_data (pd.DataFrame): DataFrame with transaction history.
        articles_data (pd.DataFrame): DataFrame with article details.
        cosine_sim (np.array): The pre-computed cosine similarity matrix.
        n (int): The total number of recommendations to return.

    Returns:
        pd.DataFrame: A DataFrame with the hybrid recommendations.
    """
    # 1. Get content-based recommendations
    content_recs = get_content_based_recommendations(article_id, articles_data, cosine_sim, n=n)
    
    # 2. Get popularity-based recommendations
    # We'll get more than N here to have enough items to fill in
    popular_recs = get_top_n_popular_articles(transactions_data, articles_data, n*2)
    
    # 3. Combine the recommendations
    # Start with the personalized content recommendations
    combined_recs_ids = list(content_recs['article_id'])
    
    # Add unique popular recommendations to fill the list up to N items
    for rec_id in popular_recs['article_id']:
        #using this condition below to avoid duplicates and len so that we get exactly 10 that we requested since it would start from 0.
        if rec_id not in combined_recs_ids and len(combined_recs_ids) < n:
            combined_recs_ids.append(rec_id)
            
    # Get the full details for the recommended articles
    #To filter out those rows containing  what is in that list out from the articles dataframe based on the true values there when it checks
    final_recommendations = articles_data[articles_data['article_id'].isin(combined_recs_ids)].drop_duplicates(subset=['article_id'])
    
    # Preserve the ranked order
    final_recommendations = final_recommendations.set_index('article_id').loc[combined_recs_ids].reset_index()

    return final_recommendations

# --- Test your hybrid function ---
# Use the same test article as yesterday
test_article_id = '372860001' 
print(f"--- Hybrid Recommendations for article {test_article_id} ---")

hybrid_recs = get_hybrid_recommendations(test_article_id, transactions_df, articles_df, cosine_sim_matrix)
display(hybrid_recs[['article_id', 'prod_name', 'product_type_name']])


--- Hybrid Recommendations for article 372860001 ---


Unnamed: 0,article_id,prod_name,product_type_name
0,372860002,7p Basic Shaftless,Socks
1,372860024,Basic 7p Shaftless,Socks
2,372860041,Basic 7p Shaftless,Socks
3,372860043,Basic 7p Shaftless,Socks
4,372860051,Basic 7p Shaftless,Socks
5,372860052,Basic 7p Shaftless,Socks
6,372860062,Basic 7p Shaftless,Socks
7,372860063,Basic 7p Shaftless,Socks
8,372860068,Basic 7p Shaftless,Socks
9,372860069,Basic 7p Shaftless,Socks


### Evaluation Strategy Outline

1. **Create Training and Test Sets**  
   Split `transactions_train.csv` by date into:
   - Training set: purchases before a specific date.
   - Test set: purchases on or after that date.

2. **Identify Test Users**  
   Find users who appear in both sets.

3. **Loop Through Test Users**  
   For each test user:
   a. Get the last item they purchased in the training set.  
   b. Get the list of items they purchased in the test set (ground truth).  
   c. Use a recommendation model (e.g., hybrid or content-based) to generate top-10 recommendations.  
   d. Compare recommendations to ground truth:  
      - Count how many recommended items appear in the test purchases.  
      - Compute Precision@10 and Recall@10.

4. **Aggregate Results**  
   Average the Precision@10 and Recall@10 scores across all users to evaluate overall model performance.


In [37]:
import joblib  # Needed again when we want to reload the objects

# Save the trained TF-IDF vectorizer (so we can transform new product names later without retraining)
joblib.dump(tfidf_vectorizer, "../artifacts/tfidf_vectorizer.pkl")  

# Save the TF-IDF feature matrix (so we can quickly reuse it for similarity/recommendations without recalculating)
joblib.dump(tfidf_matrix, "../artifacts/tfidf_matrix.pkl")  

# Save the cosine similarity matrix (so we can quickly compute content-based recommendations)
joblib.dump(cosine_sim_matrix, "../artifacts/cosine_sim_matrix.pkl")


['../artifacts/cosine_sim_matrix.pkl']