# Baseline

The baseline recommendation model recommends the most popular items. Every user receives the same list of most popular articles, regardless of who they are; popularity is derived from the history and impressions recorded in the behaviors data.

### Imports

In [3]:
import os
import pandas as pd
from tempfile import TemporaryDirectory
from recommenders.datasets.mind import download_mind
from recommenders.datasets.download_utils import unzip_file
from models.most_popular import MostPopularRecommender
from utils.evaluation import evaluate_model

### Import MIND dataset and set up tempdir

In [4]:
# Everything downloaded in this notebook lives in a temporary directory that is
# removed explicitly in the last cell.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

# Fetch the "small" MIND archives (train + validation) into the temporary directory
train_zip, valid_zip = download_mind(size="small", dest_path=data_path)

# Unpack each archive into its own sub-folder: <data_path>/train and <data_path>/valid.
# clean_zip_file=False is passed through unchanged (presumably keeps the zip on disk;
# confirm against the recommenders documentation).
for split_name, zip_path in (("train", train_zip), ("valid", valid_zip)):
    unzip_file(zip_path, os.path.join(data_path, split_name), clean_zip_file=False)

# Location of the training-split behaviors file
train_behaviors_path = os.path.join(data_path, "train", "behaviors.tsv")

100%|██████████| 51.8k/51.8k [00:05<00:00, 10.3kKB/s]
100%|██████████| 30.2k/30.2k [00:03<00:00, 8.85kKB/s]


### Load into dataframe

In [None]:
# behaviors.tsv is read without a header row, so the column names are supplied explicitly
columns = ["id", "user_id", "timestamp", "history", "impressions"]

# Load the training-split behaviors; the frame is named after the split it actually holds
train_behaviors_df = pd.read_csv(train_behaviors_path, sep="\t", header=None, names=columns)

# Backward-compatible alias: this frame used to be called `test_behaviors_df` even though
# it is read from the train split. Later cells still reference that name.
test_behaviors_df = train_behaviors_df

# Preview dataset
train_behaviors_df.head()

Unnamed: 0,id,user_id,timestamp,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


### Train the model

In [6]:
# Build the popularity baseline from the behaviors frame, then fit it.
# NOTE: `test_behaviors_df` was read from the *train* behaviors file, so despite its
# name the recommender is fit on the training split.
most_popular = MostPopularRecommender(test_behaviors_df)
most_popular.train()

### Generate recommendations from the trained model

In [7]:
# Ask the fitted baseline for 5 recommendations. The call takes no user argument,
# so the same list is produced regardless of who is asking.
top_articles = most_popular.recommend(N=5)
# Show the recommended article ids
print("Top 5 Recommended Articles:", top_articles)

Top 5 Recommended Articles: ['N306', 'N42620', 'N47020', 'N31801', 'N45794']


### Load validation set

In [8]:
# The validation behaviors file is read with the same headerless, tab-separated
# settings and column names as the training one.
columns = ["id", "user_id", "timestamp", "history", "impressions"]
valid_behaviors_path = os.path.join(data_path, "valid", "behaviors.tsv")

valid_behaviors_df = pd.read_csv(
    valid_behaviors_path,
    sep="\t",
    header=None,
    names=columns,
)

# Preview the validation split
valid_behaviors_df.head()

Unnamed: 0,id,user_id,timestamp,history,impressions
0,1,U80234,11/15/2019 12:37:50 PM,N55189 N46039 N51741 N53234 N11276 N264 N40716...,N28682-0 N48740-0 N31958-1 N34130-0 N6916-0 N5...
1,2,U60458,11/15/2019 7:11:50 AM,N58715 N32109 N51180 N33438 N54827 N28488 N611...,N20036-0 N23513-1 N32536-0 N46976-0 N35216-0 N...
2,3,U44190,11/15/2019 9:55:12 AM,N56253 N1150 N55189 N16233 N61704 N51706 N5303...,N36779-0 N62365-0 N58098-0 N5472-0 N13408-0 N5...
3,4,U87380,11/15/2019 3:12:46 PM,N63554 N49153 N28678 N23232 N43369 N58518 N444...,N6950-0 N60215-0 N6074-0 N11930-0 N6916-0 N248...
4,5,U9444,11/15/2019 8:25:46 AM,N51692 N18285 N26015 N22679 N55556,N5940-1 N23513-0 N49285-0 N23355-0 N19990-0 N3...


### Evaluate the model

In [9]:
# Score the fitted baseline against the validation behaviors; evaluate_model returns
# three values, unpacked here as NDCG, AUC and MRR.
# NOTE(review): evaluate_model is project-local. The third positional argument (5) is
# assumed to be the top-k cutoff, matching the "@5" in the printed labels; confirm
# against utils.evaluation.
ndcg, auc, mrr = evaluate_model(most_popular, valid_behaviors_df, 5)
# Report the three metrics rounded to four decimal places
print(f"General Model - NDCG@5: {ndcg:.4f}, AUC@5: {auc:.4f}, MRR@5: {mrr:.4f}")

General Model - NDCG@5: 0.0007, AUC@5: 0.0009, MRR@5: 0.0006


### Clean up the tempdir

In [10]:
# Remove the temporary directory and everything downloaded or unzipped into it.
# Paths under `data_path` are no longer valid after this cell runs.
tmpdir.cleanup()