In [1]:
!mkdir -p /kaggle/working/models/
!cd /kaggle/working/models/ && git lfs install && git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2


Git LFS initialized.
Cloning into 'all-MiniLM-L6-v2'...
remote: Enumerating objects: 112, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 112 (delta 20), reused 0 (delta 0), pack-reused 61 (from 1)[K
Receiving objects: 100% (112/112), 363.13 KiB | 5.04 MiB/s, done.
Resolving deltas: 100% (42/42), done.
Filtering content: 100% (15/15), 930.41 MiB | 166.68 MiB/s, done.


In [2]:
# ============================
#   MIND News Recommender (BERT-based) with Logs
# ============================

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.neighbors import NearestNeighbors
import time

# ----------------------------
# 1. Load Dataset
# ----------------------------

print("\n" + "="*60)
print("   MIND NEWS RECOMMENDER SYSTEM")
print("="*60)

start_time = time.time()
print("\n[1/7] Loading news dataset...")

news = pd.read_csv(
    "/kaggle/input/mind-news-dataset/MINDsmall_train/news.tsv",
    sep='\t', header=None,
    names=[
        "news_id", "category", "subcategory", "title",
        "abstract", "url", "title_entities", "abstract_entities"
    ]
)

print(f"      ✓ Loaded {len(news):,} news articles")
print(f"      ✓ Categories: {news['category'].nunique()}")
print(f"      ✓ Subcategories: {news['subcategory'].nunique()}")

print("\n      Loading user behavior data...")
behavior = pd.read_csv(
    "/kaggle/input/mind-news-dataset/MINDsmall_train/behaviors.tsv",
    sep='\t', header=None,
    names=["impression_id", "user_id", "time", "history", "impressions"]
)

print(f"      ✓ Loaded {len(behavior):,} user behavior logs")
print(f"      ✓ Unique users: {behavior['user_id'].nunique():,}")

load_time = time.time() - start_time
print(f"\n      Data loading completed in {load_time:.2f} seconds\n")

# ----------------------------
# 2. Parse Impressions (Clicked / Not Clicked)
# ----------------------------

print("[2/7] Parsing user impressions...")
parse_start = time.time()

def parse_impressions(row):
    pairs = row.split()
    parsed = []
    for pair in pairs:
        nid, click = pair.split('-')
        parsed.append((nid, int(click)))
    return parsed

behavior['parsed_impressions'] = behavior['impressions'].apply(parse_impressions)

parse_time = time.time() - parse_start
print(f"      ✓ Parsed impressions in {parse_time:.2f} seconds\n")

# ----------------------------
# 3. Choose One User (Example)
# ----------------------------

print("[3/7] Analyzing user behavior...")
user_id = behavior['user_id'].iloc[0]   # change to any user_id you want
print(f"      Selected user: {user_id}")

user_data = behavior[behavior['user_id'] == user_id]
print(f"      User has {len(user_data)} impression sessions")

pos_articles, neg_articles = [], []
for _, row in user_data.iterrows():
    for nid, click in row['parsed_impressions']:
        if click == 1:
            pos_articles.append(nid)
        else:
            neg_articles.append(nid)

print(f"      ✓ Positive interactions: {len(pos_articles)}")
print(f"      ✓ Negative interactions: {len(neg_articles)}")
print(f"      ✓ Click-through rate: {len(pos_articles)/(len(pos_articles)+len(neg_articles))*100:.2f}%\n")

# ----------------------------
# 4. Create Combined Text for Articles
# ----------------------------

print("[4/7] Preparing article text for embedding...")
text_start = time.time()

news['text'] = (
    news['category'].fillna('') + " " +
    news['subcategory'].fillna('') + " " +
    news['title'].fillna('') + " " +
    news['abstract'].fillna('')
)

avg_text_length = news['text'].str.len().mean()
print(f"      ✓ Combined text created (avg length: {avg_text_length:.0f} chars)")
print(f"      ✓ Time taken: {time.time() - text_start:.2f} seconds\n")

# ----------------------------
# 5. Generate BERT Embeddings
# ----------------------------

print("[5/7] Generating BERT embeddings...")
print("      Model: all-MiniLM-L6-v2")
print("      This may take a few minutes...")
embed_start = time.time()

model = SentenceTransformer('/kaggle/working/models/all-MiniLM-L6-v2')
print(f"      ✓ Model loaded")

embeddings = model.encode(news['text'].tolist(), show_progress_bar=True, batch_size=128)
embeddings = np.array(embeddings)

embed_time = time.time() - embed_start
print(f"\n      ✓ Generated embeddings: shape {embeddings.shape}")
print(f"      ✓ Embedding dimension: {embeddings.shape[1]}")
print(f"      ✓ Time taken: {embed_time:.2f} seconds ({embed_time/len(news)*1000:.2f} ms/article)\n")

# Create ID → index mapping
id_to_index = {nid: i for i, nid in enumerate(news['news_id'])}
print(f"      ✓ Created news ID to index mapping\n")

# ----------------------------
# 6. Extract +ve / -ve Article Vectors for This User
# ----------------------------

print("[6/7] Extracting user preference vectors...")

pos_idx = [id_to_index[nid] for nid in pos_articles if nid in id_to_index]
neg_idx = [id_to_index[nid] for nid in neg_articles if nid in id_to_index]

print(f"      ✓ Positive vectors found: {len(pos_idx)}/{len(pos_articles)}")
print(f"      ✓ Negative vectors found: {len(neg_idx)}/{len(neg_articles)}")

if len(pos_idx) == 0 or len(neg_idx) == 0:
    print("\n      ✗ ERROR: Insufficient click data for selected user")
    raise ValueError("Selected user has insufficient click data (no +ve or -ve examples).")

pos_vecs = embeddings[pos_idx]
neg_vecs = embeddings[neg_idx]

print(f"      ✓ Positive matrix shape: {pos_vecs.shape}")
print(f"      ✓ Negative matrix shape: {neg_vecs.shape}\n")

# ----------------------------
# 7. Compute Distances & Recommend
# ----------------------------

print("[7/7] Computing recommendations...")
rec_start = time.time()

# cluster positive articles (structure among liked items)
n_neighbors = min(5, len(pos_vecs))
print(f"      Building KNN index with k={n_neighbors}...")
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
knn.fit(pos_vecs)
print(f"      ✓ KNN index built")

# compute centroid of negatives (represents disliked region)
neg_centroid = neg_vecs.mean(axis=0).reshape(1, -1)
print(f"      ✓ Computed negative centroid")

# compute distance of each positive vector from negative centroid
distances = cosine_distances(pos_vecs, neg_centroid).flatten()
print(f"      ✓ Computed cosine distances")
print(f"         - Min distance: {distances.min():.4f}")
print(f"         - Max distance: {distances.max():.4f}")
print(f"         - Mean distance: {distances.mean():.4f}")

# rank by farthest distance from negatives
rank_idx = np.argsort(-distances)  # descending
recommended_indices = [pos_idx[i] for i in rank_idx[:5]]

recommendations = news.iloc[recommended_indices][['news_id', 'title', 'category', 'subcategory']]

rec_time = time.time() - rec_start
print(f"\n      ✓ Recommendations computed in {rec_time:.2f} seconds")

total_time = time.time() - start_time
print(f"\n{'='*60}")
print(f"   TOTAL EXECUTION TIME: {total_time:.2f} seconds")
print(f"{'='*60}")

print("\n" + "="*60)
print(f"   TOP 5 RECOMMENDATIONS FOR USER: {user_id}")
print("="*60 + "\n")

for idx, (i, row) in enumerate(recommendations.iterrows(), 1):
    print(f"{idx}. [{row['category']:15s} / {row['subcategory']:20s}]")
    print(f"   {row['title']}")
    print(f"   (ID: {row['news_id']})\n")

print("="*60)

2025-10-26 17:55:23.500296: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761501323.704134      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761501323.766470      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



   MIND NEWS RECOMMENDER SYSTEM

[1/7] Loading news dataset...
      ✓ Loaded 51,282 news articles
      ✓ Categories: 17
      ✓ Subcategories: 264

      Loading user behavior data...
      ✓ Loaded 156,965 user behavior logs
      ✓ Unique users: 50,000

      Data loading completed in 4.17 seconds

[2/7] Parsing user impressions...
      ✓ Parsed impressions in 2.98 seconds

[3/7] Analyzing user behavior...
      Selected user: U13740
      User has 3 impression sessions
      ✓ Positive interactions: 3
      ✓ Negative interactions: 313
      ✓ Click-through rate: 0.95%

[4/7] Preparing article text for embedding...
      ✓ Combined text created (avg length: 291 chars)
      ✓ Time taken: 0.08 seconds

[5/7] Generating BERT embeddings...
      Model: all-MiniLM-L6-v2
      This may take a few minutes...




      ✓ Model loaded


Batches:   0%|          | 0/401 [00:00<?, ?it/s]


      ✓ Generated embeddings: shape (51282, 384)
      ✓ Embedding dimension: 384
      ✓ Time taken: 45.29 seconds (0.88 ms/article)

      ✓ Created news ID to index mapping

[6/7] Extracting user preference vectors...
      ✓ Positive vectors found: 3/3
      ✓ Negative vectors found: 313/313
      ✓ Positive matrix shape: (3, 384)
      ✓ Negative matrix shape: (313, 384)

[7/7] Computing recommendations...
      Building KNN index with k=3...
      ✓ KNN index built
      ✓ Computed negative centroid
      ✓ Computed cosine distances
         - Min distance: 0.8048
         - Max distance: 0.9969
         - Mean distance: 0.8787

      ✓ Recommendations computed in 0.01 seconds

   TOTAL EXECUTION TIME: 52.56 seconds

   TOP 5 RECOMMENDATIONS FOR USER: U13740

1. [travel          / traveltips          ]
   What Happens If Your Oxygen Mask Doesn't Inflate on a Flight?
   (ID: N28910)

2. [sports          / football_nfl        ]
   Charles Rogers, former Michigan State football, De