In [3]:
import pandas as pd 
import numpy as np 
import sys
# NOTE(review): hard-coded absolute path to a local checkout; none of the cells
# visible in this notebook import a project module from it. Consider a
# configurable or relative path so the notebook runs on other machines.
sys.path.append('/home/alexabades/recsys')

In [9]:
# Training interactions: tab-separated file, column names supplied explicitly
dt = pd.read_csv(
    "../data/processed/ml-1m/ml-1m.train.rating",
    sep="\t",
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)

In [59]:
# Drop rows whose Rating is 0, count the remaining ratings per MovieID,
# and keep the ten MovieIDs with the highest counts (most-rated first)
popular_items = list(
    dt[dt["Rating"] != 0]
    .groupby("MovieID")["Rating"]
    .count()
    .sort_values(ascending=False)
    .head(10)
    .index
)
popular_items

[104, 124, 44, 64, 113, 48, 97, 132, 22, 128]

In [27]:
# Test interactions: same tab-separated layout and column names as the training file
dt_test = pd.read_csv(
    "../data/processed/ml-1m/ml-1m.test.rating",
    sep="\t",
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)

In [38]:
# Preview the first rows of the test split
dt_test.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,0,25,5,978824351
1,1,133,3,978300174
2,2,207,4,978298504
3,3,208,4,978294282
4,4,222,2,978246585


In [34]:
# Inspect the popular-items collection derived from the training data
popular_items

Index([104, 124, 44, 64, 113, 48, 97, 132, 22, 128], dtype='int64', name='MovieID')

In [56]:
# One flag per test row: 1 when the row's MovieID is among the popular items, else 0
hts = [
    1 if record["MovieID"] in popular_items else 0
    for _, record in dt_test.iterrows()
]
  

In [60]:
# Hit ratio: share of test rows flagged 1 in hts
sum(hts)/len(hts)

0.022019867549668875

In [64]:
# Same hit ratio via an element-wise map over a one-column frame
(
    dt_test[["MovieID"]]
    .map(lambda movie_id: int(movie_id in list(popular_items)))
    .loc[:, "MovieID"]
    .mean()
)

MovieID    0.02202
dtype: float64

In [65]:
# Hit ratio computed directly on the MovieID Series: mean of the 0/1 membership flags
average_ht = (
    dt_test["MovieID"]
    .apply(lambda movie_id: int(movie_id in popular_items))
    .mean()
)


In [66]:
# Show the hit ratio computed from the MovieID Series
average_ht

0.022019867549668875

In [67]:
def getHitRatio(ranklist, gtItem):
    """Return 1 if any entry of ``ranklist`` equals ``gtItem``, otherwise 0.

    Args:
        ranklist: Iterable of recommended item IDs.
        gtItem: The ground-truth item ID to look for.

    Returns:
        int: 1 on a hit, 0 on a miss (including an empty ``ranklist``).
    """
    return 1 if any(candidate == gtItem for candidate in ranklist) else 0

In [69]:
# Per-row hit flags: dt_test is the test DataFrame, popular_items the top-K item IDs
hts = [
    getHitRatio(popular_items, record["MovieID"])
    for _, record in dt_test.iterrows()
]

In [87]:
import math 

def getNDCG(ranklist, gtItem):
    """Score the position of ``gtItem`` in ``ranklist`` with a log2 discount.

    Args:
        ranklist: Indexable sequence of recommended item IDs, best first.
        gtItem: The ground-truth item ID.

    Returns:
        ``log(2) / log(i + 2)`` for the first 0-based index ``i`` whose entry
        equals ``gtItem`` (1.0 at the top position), or 0 if there is no match.
    """
    hit_position = next(
        (pos for pos in range(len(ranklist)) if ranklist[pos] == gtItem),
        None,
    )
    if hit_position is None:
        return 0
    return math.log(2) / math.log(hit_position + 2)

In [88]:
# Per-row NDCG: dt_test is the test DataFrame, popular_items the top-K item IDs
ncdgs = [
    getNDCG(popular_items, record["MovieID"])
    for _, record in dt_test.iterrows()
]

In [90]:
# Mean of the per-row NDCG scores collected in ncdgs
ncdg = sum(ncdgs)/len(ncdgs)

0.010542016489001741

In [91]:
import math

def getDCG(ranklist, gtItem):
    """Return the discounted gain for the first occurrence of ``gtItem`` in ``ranklist``.

    The gain at 0-based position ``i`` is ``log(2) / log(i + 2)``, i.e.
    ``1 / log2(i + 2)``; 0 is returned when ``gtItem`` is not in the list.
    """
    discounted_gains = (
        math.log(2) / math.log(position + 2)  # equivalent to 1 / log2(position + 2)
        for position, candidate in enumerate(ranklist)
        if candidate == gtItem
    )
    return next(discounted_gains, 0)

def getIDCG(K):
    """Return the ideal DCG for a top-K list under binary relevance.

    With a single relevant item, the ideal ranking places it first, so the
    value is the first-position gain ``log(2) / log(2)`` whenever ``K`` is
    positive, and 0 otherwise.
    """
    if not K > 0:
        return 0
    # Gain of the first position: the best achievable outcome
    return math.log(2) / math.log(2)

# popular_items is assumed to be ordered from most to least popular
K = 10  # cut-off used for the NDCG evaluation
idcg = getIDCG(K)  # identical for every row, so it is computed once

# Normalised DCG per test row; rows score 0 when the ideal DCG is not positive
ncdgs = [
    (getDCG(popular_items[:K], record["MovieID"]) / idcg) if idcg > 0 else 0
    for _, record in dt_test.iterrows()
]

average_ncdg = sum(ncdgs) / len(ncdgs)


In [93]:
# Sanity check: the first-position gain evaluates to 1.0
math.log(2) / math.log(2)

1.0

In [80]:
# Path prefix of the ml-1m rating files (no file extension included here)
filepath = "../data/processed/ml-1m/ml-1m"

In [94]:
import pandas as pd


class PopularItem:
    """Most-popular-items baseline evaluated on a train/test rating split.

    The same top-K list (the K MovieIDs with the most non-zero ratings in the
    training data) is scored against every row of the test data.

    Attributes:
        path: Path prefix; ``.train.rating`` / ``.test.rating`` are appended to it.
        dt_train: Training ratings with columns UserID, MovieID, Rating, Timestamp.
        dt_test: Test ratings with the same columns.
        popular_items: Top-K list from the most recent ``calculate_hit_ratio`` call.
    """

    def __init__(self, path) -> None:
        self.path = path
        # Load data as soon as an instance is created
        self.dt_train, self.dt_test = self._load_data()

    def _load_data(self):
        """Read the tab-separated train and test rating files found at ``self.path``.

        Returns:
            tuple: ``(dt_train, dt_test)`` DataFrames.
        """
        columns = ["UserID", "MovieID", "Rating", "Timestamp"]
        dt_train = pd.read_csv(f"{self.path}.train.rating", sep="\t", names=columns)
        dt_test = pd.read_csv(f"{self.path}.test.rating", sep="\t", names=columns)
        return dt_train, dt_test

    def _top_k_items(self, K):
        """Return the K MovieIDs with the most non-zero training ratings, most-rated first."""
        rated = self.dt_train[self.dt_train["Rating"] != 0]
        rating_counts = rated.groupby("MovieID")["Rating"].count()
        return list(rating_counts.sort_values(ascending=False).head(K).index)

    def calculate_hit_ratio(self, K):
        """Return the fraction of test rows whose MovieID is in the top-K popular items.

        Also stores the top-K list on ``self.popular_items``.

        Args:
            K: Number of popular items to recommend.

        Returns:
            float: Hit ratio in [0, 1] (NaN when the test set is empty).
        """
        self.popular_items = self._top_k_items(K)
        # Vectorised membership test instead of a per-row Python lambda
        return self.dt_test["MovieID"].isin(self.popular_items).mean()

    @staticmethod
    def getDCG(ranklist, gtItem):
        """Calculate DCG based on the position of the ground truth item in the ranked list.

        Declared as a static method so it can be called both as
        ``PopularItem.getDCG(...)`` and through an instance.

        Returns:
            ``log(2) / log(i + 2)`` for the first 0-based position ``i`` holding
            ``gtItem``, or 0 when the item is not in ``ranklist``.
        """
        for i, item in enumerate(ranklist):
            if item == gtItem:
                return math.log(2) / math.log(i + 2)  # Using log base 2
        return 0

    def calculate_ndcg_ratio_binary(self, K):
        """Return the mean NDCG@K of the popular-items list over the test rows.

        Uses only this instance's train/test data (no notebook globals), so it
        works on its own and does not require ``calculate_hit_ratio`` to have
        been called first. Relevance is binary with one relevant item per row,
        so the ideal DCG is 1 and NDCG equals DCG.

        Args:
            K: Number of popular items to recommend.

        Returns:
            float: Mean NDCG@K (NaN when the test set is empty).
        """
        top_k = self._top_k_items(K)
        gains = [self.getDCG(top_k, movie_id) for movie_id in self.dt_test["MovieID"]]
        if not gains:
            # Mirror the hit-ratio behaviour on an empty test set instead of dividing by zero
            return float("nan")
        return sum(gains) / len(gains)

In [95]:
# Build the baseline; the constructor reads the train and test rating files
model = PopularItem(path=filepath)

In [96]:
# Hit ratio of the 10 most-rated training items on the test split
model.calculate_hit_ratio(10)

0.022019867549668875

In [97]:
# Mean binary-relevance NDCG with K=10
model.calculate_ndcg_ratio_binary(10)

0.010542016489001741