<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Pipeline-part" data-toc-modified-id="Pipeline-part-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Pipeline part</a></span></li><li><span><a href="#Backend-part" data-toc-modified-id="Backend-part-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Backend part</a></span><ul class="toc-item"><li><span><a href="#sort-just-based-of-recency" data-toc-modified-id="sort-just-based-of-recency-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>sort just based of recency</a></span></li><li><span><a href="#sort-based-on-frecency" data-toc-modified-id="sort-based-on-frecency-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>sort based on frecency</a></span></li></ul></li><li><span><a href="#sort-based-on-modified-frecency" data-toc-modified-id="sort-based-on-modified-frecency-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>sort based on modified frecency</a></span></li></ul></div>

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

# Pipeline part

In [None]:
import pandas as pd
import glob

In [None]:
!pwd

In [None]:
# glob.glob returns matches in arbitrary, filesystem-dependent order; sort the
# paths so the snapshots are loaded in the same order on every run.
article_snapshots = sorted(glob.glob("../articles/articles-*"))

In [None]:
len(article_snapshots)

In [None]:
from pipeline.text_processing import load_article_snapshots

In [None]:
df = load_article_snapshots(article_snapshots)

In [None]:
from pipeline.text_processing import add_lemmatized_texts, fit_tf_idf

In [None]:
add_lemmatized_texts(df)

In [None]:
X, words = fit_tf_idf(df["lemmatized_texts"])

In [None]:
words

# Backend part

In [None]:
from backend.recommender.recommend import (
    recommend,
    get_daily_google_trends,
    get_relevant_words,
    estimate_popularity,
    calculate_frecency
)

In [None]:
top_trends = get_daily_google_trends()

In [None]:
top_trends

In [None]:
pop = estimate_popularity(top_trends, X, words)

In [None]:
# Count how many entries of `pop` are exactly 0 (True) versus non-zero (False).
(pd.Series(pop.squeeze()) == 0).value_counts()

## sort just based of recency 

In [None]:
# Titles of the 20 rows with the largest `published` values (newest first).
df.sort_values(by="published", ascending=False)["title"].values[:20]

## sort based on frecency

In [None]:
recommend(df, X, words)

# sort based on modified frecency
(mitigate the impact of popularity by using `log(popularity)` in the score)

In [None]:
import time
import numpy as np
from math import log, e

In [None]:
def calculate_frecency(popularity, age, half_life_seconds=7 * 24 * 60 * 60):
    """
    Score items by log-damped popularity with exponential age decay.

    Computes ``log(popularity) * exp(-lambda * age)`` element-wise, where
    ``lambda = ln(2) / half_life_seconds``, so the score halves every
    ``half_life_seconds`` of age.
    See https://wiki.mozilla.org/User:Jesse/NewFrecency

    Note: this definition shadows the ``calculate_frecency`` imported from
    ``backend.recommender.recommend`` earlier in this notebook.

    Parameters
    ----------
    popularity : array-like
        Popularity values passed to ``np.log``. A value of 1 scores 0;
        values <= 0 produce ``-inf``/``nan``.
    age : array-like
        Ages in the same unit as ``half_life_seconds`` (seconds by default);
        must broadcast against ``popularity``.
    half_life_seconds : float, optional
        Age after which the score halves. Defaults to 7 days.

    Returns
    -------
    The element-wise product, as returned by ``np.multiply``.

    Raises
    ------
    ValueError
        If ``half_life_seconds`` is not positive.
    """
    if half_life_seconds <= 0:
        raise ValueError("half_life_seconds must be positive")
    # How strongly older articles are penalized: the denominator is the
    # number of seconds after which the score halves.
    lambda_const = log(2) / half_life_seconds
    # np.multiply (rather than `*`) keeps the product element-wise even when
    # the inputs are np.matrix objects.
    return np.multiply(np.log(popularity), np.exp(-lambda_const * age))


In [None]:
# Popularity is offset by 1 so that np.log inside calculate_frecency maps a
# zero estimate to 0 instead of -inf.
daily_trends = get_daily_google_trends()
popularity = estimate_popularity(daily_trends, X, words) + 1

# Age in seconds: current time minus each row's publication timestamp.
# NOTE(review): time.mktime treats its argument as local time -- confirm that
# `published` is not stored as UTC struct_time.
now = time.time()
published_ts = df["published"].map(time.mktime)
age = now - published_ts

# Convert the score to a plain ndarray and drop singleton dimensions.
scores = calculate_frecency(popularity.T, age.values)
frecency = np.squeeze(np.asarray(scores))


In [None]:
# Count how many frecency scores are exactly 0 (True) versus non-zero (False).
(pd.Series(frecency) == 0).value_counts()

In [None]:
# Positions of the 10 largest frecency scores, highest first.
top_ids = np.argsort(frecency)[::-1][:10]
# Show column 0 of those rows; top_ids already holds at most 10 positions.
df.iloc[top_ids, 0].values