In [8]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [3]:
categories = [
    'frontpage',
    'news',
    'tech',
    'local',
    'opinion',
    'on-air',
    'misc',
    'weather',
    'msn-news',
    'health',
    'living',
    'business',
    'msn-sports',
    'sports',
    'summary',
    'bbs',
    'travel'
]

In [57]:
def parse_dataset(path='./data/msnbc/data.seq'):
    with open(path) as f:
        user = 1
        data = []

        for line in f:
            split_line = line.split()
            user_cateogories = [categories[int(i) - 1] for i in split_line]

            data.append(np.array(user_cateogories, dtype=object))
            user += 1

    return np.array(data, dtype=object)

def get_refresh_data(category, user_visits):
    """
    Sliding window approach to find the longest/shortest
    continuous subsequence in O(n) time
    """
    longest, shortest, start, end = 0, len(user_visits) + 1, 0, len(user_visits)
    current, refreshes = 0, 0

    while start < end:
        if user_visits[start] == category:
            current += 1
        else:
            if current >= 2:
                current -= 1 #substract the initial visit
                refreshes += current
                longest = current if current > longest else longest
                shortest = current if current < shortest else shortest
                current = 0

        start += 1

    if current >= 2:
        current -= 1 #substract the initial visit
        refreshes += current
        longest = current if current > longest else longest
        shortest = current if current < shortest else shortest
    
    return refreshes, longest, shortest if shortest < len(user_visits) + 1 else 0

def get_revisit_data(category, user_visits):
    revisits, longest, shortest = 0, 0, len(user_visits) + 1
    previous = -1

    for current, visit in enumerate(user_visits):
        if visit == category:
            if previous == -1:
                previous = current
            else:
                distance = current - previous
                previous = current

                if distance > 1: #not a refresh
                    distance -= 1 #substract the initial visit
                    revisits += 1
                    longest = distance if distance > longest else longest
                    shortest = distance if distance < shortest else shortest

    return revisits, longest, shortest if shortest < len(user_visits) + 1 else 0

def get_dataset_stats(data):
    categories_stats = {}

    for category in tqdm(categories):
        categories_stats[category] = {
            'visits': 0,           #1 2 2 1, here 1 and 2 were visited both 2 times
            'unique_visits': 0,    #1 2 2 1, here 1 and 2 were visited both 1 time
            'refreshes': 0,        #1 1 1, here 1 counts as refreshed 2 times
            'unique_refreshes': 0, #refreshes counted once per user
            'revisits': 0,         #1 2 2 1, here 1 counts as revisited, 2 as refreshed
            'unique_revisits': 0,  #revisits counted once per user 
            'longest_refresh': 0,  #1 1 1 1, here 1 was refreshed 4 times
            'shortest_refresh': 0, 
            'longest_revisit': 0,  #1 2 2 1, revisit distance of 2 sites for 1
            'shortest_revisit': 0,
        }

        for user_visits in data:
            categories_stats[category]['visits'] += (user_visits == category).sum()
            categories_stats[category]['unique_visits'] += 1 if category in user_visits else 0

            refresh_data = get_refresh_data(category, user_visits)
            categories_stats[category]['refreshes'] += refresh_data[0]
            categories_stats[category]['unique_refreshes'] += (1 if refresh_data[0] > 0 else 0)
            
            if refresh_data[1] > categories_stats[category]['longest_refresh']:
                categories_stats[category]['longest_refresh'] = refresh_data[1]

            if refresh_data[2] != 0 and (refresh_data[2] < categories_stats[category]['shortest_refresh']\
                or categories_stats[category]['shortest_refresh'] == 0):
                categories_stats[category]['shortest_refresh'] = refresh_data[2]

            revisit_data = get_revisit_data(category, user_visits)
            categories_stats[category]['revisits'] += revisit_data[0]
            categories_stats[category]['unique_revisits'] += (1 if revisit_data[0] > 0 else 0)

            if revisit_data[1] > categories_stats[category]['longest_revisit']:
                categories_stats[category]['longest_revisit'] = revisit_data[1]

            if revisit_data[2] != 0 and (revisit_data[2] < categories_stats[category]['shortest_revisit']\
                or categories_stats[category]['shortest_revisit'] == 0):
                categories_stats[category]['shortest_revisit'] = revisit_data[2]

    return categories_stats
        
def plot_categories_statistics(categories, categories_stats):
    max_rows, max_cols = 9, 2
    row, col = 1, 1

    fig = make_subplots(
        rows=max_rows, cols=max_cols,
        subplot_titles=categories)

    for category in categories:
        X, Y = [], []

        for key in categories_stats[category]:
            X.append(key)
            Y.append(categories_stats[category][key])

        fig.add_trace(
            go.Histogram(histfunc='sum', x=X, y=Y, name=category),
            row=row, col=col
        )

        if col % max_cols == 0:
            col = 0
            row += 1

        col += 1

    fig.update_layout(
        title_text='Categories statistics',
        bargap=0.1,
        height=max_rows * 300,
        width=max_cols * 650
    )

    fig.show()

def plot_statistics_categories(categories, categories_stats):
    max_rows, max_cols = 10, 1
    row, col = 1, 1

    statistics = []
    for key in categories_stats[categories[0]]:
        statistics.append(key)

    fig = make_subplots(
        rows=max_rows, cols=max_cols,
        subplot_titles=statistics)

    for statistic in statistics:
        X, Y = [], []

        for category in categories:
            X.append(category)
            Y.append(categories_stats[category][statistic])

        fig.add_trace(
            go.Histogram(histfunc='sum', x=X, y=Y, name=statistic),
            row=row, col=col
        )

        if col % max_cols == 0:
            col = 0
            row += 1

        col += 1

    fig.update_layout(
        title_text='Statistics categories',
        bargap=0.1,
        height=max_rows * 300,
        width=max_cols * 1200
    )

    fig.show()

In [5]:
data = parse_dataset('./data/msnbc/data.seq')
data[:5]

array([array(['frontpage', 'frontpage'], dtype=object),
       array(['news'], dtype=object),
       array(['tech', 'news', 'news', 'local', 'news', 'news', 'news', 'tech',
              'tech'], dtype=object)                                          ,
       array(['opinion'], dtype=object),
       array(['frontpage'], dtype=object)], dtype=object)

In [6]:
dataset_stats = get_dataset_stats(data)
dataset_stats

100%|██████████| 17/17 [04:21<00:00, 15.38s/it]


{'frontpage': {'visits': 940469,
  'unique_visits': 313181,
  'refreshes': 526123,
  'unique_refreshes': 183510,
  'revisits': 208524,
  'unique_revisits': 106246,
  'longest_refresh': 14794,
  'shortest_refresh': 1,
  'longest_revisit': 1783,
  'shortest_revisit': 1},
 'news': {'visits': 452387,
  'unique_visits': 175286,
  'refreshes': 247557,
  'unique_refreshes': 87452,
  'revisits': 54761,
  'unique_revisits': 32303,
  'longest_refresh': 725,
  'shortest_refresh': 1,
  'longest_revisit': 583,
  'shortest_revisit': 1},
 'tech': {'visits': 207479,
  'unique_visits': 121948,
  'refreshes': 80048,
  'unique_refreshes': 36169,
  'revisits': 13543,
  'unique_revisits': 9683,
  'longest_refresh': 2057,
  'shortest_refresh': 1,
  'longest_revisit': 775,
  'shortest_revisit': 1},
 'local': {'visits': 386217,
  'unique_visits': 121719,
  'refreshes': 245416,
  'unique_refreshes': 58429,
  'revisits': 39607,
  'unique_revisits': 22843,
  'longest_refresh': 534,
  'shortest_refresh': 1,
  'lo

In [58]:
plot_categories_statistics(categories, dataset_stats)

In [59]:
plot_statistics_categories(categories, dataset_stats)