In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

### Ring buffer implementation

For behaviors: Gå gjennom en sortert liste med data (radene skal være sortert på tid). Gå så gjennom impressions-listen. Der en oppføring slutter på "-1", legg til artikkelens ID i ringbufferet (dette er en artikkel som brukeren har trykket på). 

Så ser vi på neste bruker/rad, som nå er ukjent, siden den er utenfor tidsvinduet. Vi ser tilbake n plasser i bufferet, og gir disse artiklene som anbefalinger til denne brukeren. Deretter sjekker vi om noen av artiklene denne brukeren faktisk hadde en 1-er på i sin impressions-liste var blant artiklene vi anbefalte. 

Et problem: vi vet ikke hvilken nettavis brukerne tilhører, og vi vet ikke hvordan impressions-listen de faktisk fikk ble generert. To brukere kan komme etter hverandre på listen av brukere når vi sorterer på tid, og lese artikler på to helt forskjellige nettaviser. Dette vil i så fall gi lav score på evalueringen.

In [None]:
# Directory holding the MINDsmall training split; every file below is read from it.
training_dataset = '../MINDsmall_train'

# NOTE(review): this is the same directory as the training split — confirm
# whether a separate validation split was intended here.
validation_dataset = '../MINDsmall_train'

# The .vec files are read with header=None so that their first line is kept as
# data instead of being consumed as column names (assumes the files carry no
# header line, as in the published MIND format — TODO confirm).
entity_embedding = pd.read_csv(
    f"{training_dataset}/entity_embedding.vec",
    sep='\t',
    header=None,
)
relation_embedding = pd.read_csv(
    f"{training_dataset}/relation_embedding.vec",
    sep='\t',
    header=None,
)

# news.tsv is read with explicit column names.
news_data = pd.read_csv(
    f"{training_dataset}/news.tsv",
    sep='\t',
    names=["newsId", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]
)

# behaviors.tsv is read with explicit column names; the timestamp column is
# parsed into datetimes while loading.
behaviors_data = pd.read_csv(
    f"{training_dataset}/behaviors.tsv",
    sep='\t',
    names=["impressionId", "userId", "timestamp", "click_history", "impressions"],
    parse_dates=['timestamp']
)

### Preprocess data

In [None]:
# Tokenise the whitespace-separated text columns once and reuse the results.
history_tokens = behaviors_data['click_history'].str.split()
impression_tokens = behaviors_data['impressions'].str.split()

# Make sure the timestamp column holds datetimes.
behaviors_data['timestamp'] = pd.to_datetime(
    behaviors_data['timestamp'],
    format='%Y-%m-%d %H:%M:%S',
)

# Derived columns: number of history entries, the impression tokens and their count.
behaviors_data['clicks'] = history_tokens.str.len()
behaviors_data['impressions_list'] = impression_tokens
behaviors_data['impressions_count'] = impression_tokens.str.len()

# Keep only the news rows in which every column is populated.
news_data = news_data.dropna()
news_data = news_data.reset_index(drop=True)


### Manglende verdier for clicks og click_history

In [None]:
# Rows for which both the click count and the click history are missing.
missing_clicks = behaviors_data['clicks'].isnull()
missing_history = behaviors_data['click_history'].isnull()
null_clicks_rows = behaviors_data[missing_clicks & missing_history]

# Discard every row that contains at least one null, then renumber the index.
behaviors_data = behaviors_data.dropna()
behaviors_data = behaviors_data.reset_index(drop=True)

In [None]:
# Preview the first rows of the news table after rows with nulls were removed.
news_data.head()

In [None]:
# Preview the first rows of the behaviors table after rows with nulls were removed.
behaviors_data.head()

In [None]:
# Put the behavior log in chronological order (earliest timestamp first).
behaviors_data = behaviors_data.sort_values('timestamp', ascending=True)

In [None]:
# Preview the behaviors table again, now ordered by timestamp.
behaviors_data.head()

### Ring buffer

In [None]:
class RingBuffer:
    """Fixed-capacity buffer that keeps only the most recently added items.

    Items are appended until ``buffer_size`` of them are stored; from then on
    every new item overwrites the oldest one.

    Attributes:
        buffer_size: Maximum number of items kept.
        impressions: Internal storage list (not necessarily in insertion
            order once the buffer has wrapped around; use ``get()``).
        pointer: Index in ``impressions`` of the oldest item, i.e. the slot
            the next ``add`` overwrites once the buffer is full.
    """

    def __init__(self, buffer_size):
        """Create an empty buffer.

        Args:
            buffer_size: Capacity of the buffer; must be at least 1.

        Raises:
            ValueError: If ``buffer_size`` is smaller than 1. Such a buffer
                could never become full and would grow without bound.
        """
        if buffer_size < 1:
            raise ValueError(f"buffer_size must be at least 1, got {buffer_size}")
        self.buffer_size = buffer_size
        self.impressions = []
        self.pointer = 0

    def add(self, x):
        """Store ``x``, overwriting the oldest item when the buffer is full."""
        if len(self.impressions) < self.buffer_size:
            # Still filling up: the list itself is in insertion order.
            self.impressions.append(x)
        else:
            self.impressions[self.pointer] = x
            self.pointer = (self.pointer + 1) % self.buffer_size

    def get(self):
        """Return a new list of the stored items, ordered oldest to newest."""
        # While the buffer is filling, pointer is 0 and this is a plain copy.
        return self.impressions[self.pointer:] + self.impressions[:self.pointer]


### Add a user's impressions to the buffer

We go through the dataframe, which is sorted by timestamp. The ring buffer is instantiated, and we don't start recommending articles until we have gone through a few rows, so that the buffer already contains some clicked articles. For every row after that, we check whether the recommendations taken from the ring buffer match any of the articles that the user actually clicked on. 

In [None]:
# --- Replay configuration ---
wait_count = 10            # leading rows that only warm up the buffer (not evaluated)
max_users = None           # optional cap on the number of rows replayed; None = all rows
buffer_size = 20           # number of clicked article IDs the buffer remembers
recommendation_size = 20   # number of the newest buffer entries handed out as recommendations
ring_buffer = RingBuffer(buffer_size=buffer_size)

# Parallel lists with one (userId, set of article IDs) tuple per evaluated row.
actual_impressions = []
recommended_impressions = []

# Iterate over plain column values instead of DataFrame.iterrows(), which
# builds a Series object for every row.
rows = zip(
    behaviors_data['userId'],
    behaviors_data['impressions_list'],
    behaviors_data['impressions_count'],
)

for rows_seen, (user, impressions_list, impressions_count) in enumerate(rows):
    if max_users is not None and rows_seen >= max_users:
        break

    # Rows without impressions still count towards the warm-up and the cap.
    if not impressions_count > 0:
        continue

    # Article IDs of the impressions whose label (the text after the last "-") is "1".
    user_impressions = []
    for impression in impressions_list:
        article_id, separator, label = impression.rpartition("-")
        if separator and label == "1":
            user_impressions.append(article_id)

    # Start recommending only once the warm-up rows have been replayed.
    if rows_seen >= wait_count:
        buffered = ring_buffer.get()  # ordered oldest -> newest
        # Look back over the newest entries; guard against [-0:], which would
        # select the whole list.
        newest = buffered[-recommendation_size:] if recommendation_size > 0 else []
        recommended_impressions.append((user, set(newest)))
        # Store what the user actually clicked on for the later evaluation.
        actual_impressions.append((user, set(user_impressions)))

    # Feed this row's clicks into the buffer only after recommending, so a
    # row is never evaluated against its own clicks.
    for article_id in user_impressions:
        ring_buffer.add(article_id)

### Evaluate the Ring Buffer

In [None]:
def evaluate_ring_buffer(actual, recommended, verbose=False):
    """Compute the running hit rate of the recommendations.

    An entry counts as a hit when its set of actually clicked articles shares
    at least one element with the set recommended for the same position.

    Args:
        actual: Sequence of ``(user, clicked_ids)`` tuples, where
            ``clicked_ids`` is a set.
        recommended: Sequence of ``(user, recommended_ids)`` tuples, indexed
            in parallel with ``actual``.
        verbose: When True, also print the running hit rate after every
            entry. Off by default because it emits one line per entry.

    Returns:
        A list with one float per entry of ``actual``: the percentage of hits
        among the entries seen so far, the current entry included. The last
        element is therefore the overall hit rate. Empty input gives ``[]``.

    Raises:
        IndexError: If ``recommended`` has fewer entries than ``actual``.
    """
    print(f'Number of entries evaluated: {len(actual)}')

    total_hits = 0
    hit_rate_list = []

    for i, (_, clicked) in enumerate(actual):
        # Score the current entry first, so the running rate includes it.
        if len(clicked.intersection(recommended[i][1])) > 0:
            total_hits += 1

        running_rate = (total_hits / (i + 1)) * 100
        hit_rate_list.append(running_rate)
        if verbose:
            print(f'{running_rate:.2f}%')

    # With no entries there is nothing to divide by; report 0 instead of failing.
    hit_rate = hit_rate_list[-1] if hit_rate_list else 0.0
    print(f'Hit rate: {hit_rate}%')
    return hit_rate_list



# Running hit rate (in percent) for every evaluated row.
results = evaluate_ring_buffer(actual_impressions, recommended_impressions)

# Show only a small sample: the list holds one entry per evaluated row, so
# printing all of it floods the cell output.
print(f'Actual impressions (first 5): {actual_impressions[:5]}')


In [None]:
# Plot the running hit rate together with the mean of the curve.
fig, ax = plt.subplots()
ax.set_title(f'Hit Rate - Buffer Size {buffer_size} & Recommendation Size {recommendation_size}')

ax.plot(results)
ax.set_ylabel('Hit Rate (%)')
ax.set_xlabel('User Interactions')

# Dashed line at the mean of the plotted values, labelled with its value.
# Skipped when there are no results, to avoid dividing by zero.
if results:
    mean_hit_rate = sum(results) / len(results)
    ax.axhline(y=mean_hit_rate, color='r', linestyle='--')
    ax.text(0, mean_hit_rate, f'mean: {mean_hit_rate:.2f}%', color='r', va='bottom')

plt.show()


