In [52]:
import pandas as pd
# Load the first three columns of the header-less history export and name
# them directly while parsing.
db = pd.read_csv(
    'history.csv',
    header=None,
    usecols=[0, 1, 2],
    names=['day', 'time', 'url'],
)

In [53]:
def get_domain_name(url):
    """Extract the host part of an http(s) URL, without a leading ``www.``.

    Parameters
    ----------
    url : str
        The URL to inspect. Any value that is not text (for example the
        float NaN that pandas uses for an empty cell) is treated as having
        no domain.

    Returns
    -------
    str or None
        The text between the scheme separator ``//`` and the next ``/``,
        with a leading ``www.`` removed; ``None`` when ``url`` is not text
        or does not start with ``http``.
    """
    # Non-text values cannot be sliced or split; report "no domain" rather
    # than raising so the caller can drop the row.
    if not hasattr(url, "startswith") or not url.startswith("http"):
        return None

    # Cut at the FIRST "//" only: a URL that embeds another URL in its path
    # or query (e.g. a redirect link) must still resolve to its own host.
    _, separator, remainder = url.partition("//")
    if not separator:
        # No "//" at all: fall back to the whole string.
        remainder = url

    name = remainder.split("/", 1)[0]
    if name.startswith("www."):
        return name[4:]
    return name


def add_domain(df):
    """Attach a ``domain`` column derived from the ``url`` column.

    The column is written onto ``df`` itself (the frame is modified in
    place) and the same frame is returned for convenient chaining.
    """
    domains = df['url'].apply(get_domain_name)
    df['domain'] = domains
    return df


def filter_rows(df):
    """Return a new frame holding only the rows of ``df`` with no missing value."""
    return df.dropna()

In [54]:
# Derive the domain column, then discard every row that has a missing value
# (including rows whose url produced no domain).
db = filter_rows(add_domain(db))

In [55]:
def find_indexes(url):
    """Return the index labels of all rows in the global ``db`` whose ``url`` equals ``url``."""
    is_match = db['url'] == url
    return db.index[is_match].tolist()

# Example: index labels of every row whose url is the 9gag front page.
find_indexes('http://9gag.com/')

[2040, 2063, 2166, 2264, 2269, 2273, 2327, 2332]

In [56]:
def next_url(url):
    """Return the domain of the row directly above each occurrence of ``url``.

    Rows are looked up in the global ``db`` by *position*, not by index
    label: ``db`` keeps its original labels after rows are dropped, so
    ``label - 1`` may no longer exist, whereas the row one position above
    always does (except for the very first row).

    NOTE(review): the name suggests the history is stored newest-first, so
    the row above is the page visited next -- confirm against the export.

    Parameters
    ----------
    url : str
        Exact url to search for in ``db['url']``.

    Returns
    -------
    numpy.ndarray
        Object array of domains, one per occurrence of ``url`` that has a
        row above it, in table order. Empty when ``url`` never occurs.
    """
    hit_positions = (db['url'] == url).values.nonzero()[0]
    # An occurrence in the first row has nothing above it; skip it instead
    # of wrapping around or raising.
    neighbour_positions = hit_positions[hit_positions > 0] - 1
    return db['domain'].values[neighbour_positions]

# Example: domains found in the row just above each 9gag front-page row.
next_url('http://9gag.com/')

array(['9gag.com', 'translate.google.com', 'watch-episodes.com',
       'youtube.com', 'youtube.com', 'youtube.com', 'google.co.il',
       'google.co.il'], dtype=object)

In [123]:
def weighted_urls(db, url, previous_url, num_next=5, num_previous=2):
    """Weight the domains found just above every occurrence of ``url``.

    For each row of ``db`` whose ``url`` column equals ``url``:

    * the ``num_next`` rows directly above it are collected; the closest
      row gets weight 1, the one before it 1/2, then 1/4, and so on;
    * the ``num_previous`` rows directly below it are compared, in order,
      with ``previous_url``; every leading match doubles all weights of
      that occurrence, and the first mismatch stops the comparison.

    Rows are addressed by position, so gaps in the index labels (left by
    dropping rows) do not matter. Near the top or bottom of the table the
    windows are simply shorter; the weights stay aligned so the closest
    row still weighs 1 before any doubling.

    Parameters
    ----------
    db : pandas.DataFrame
        Frame with at least a ``url`` and a ``domain`` column.
    url : str
        Exact url whose occurrences are analysed.
    previous_url : sequence of str
        Domains to compare with the rows below each occurrence, nearest
        first. May be shorter than ``num_previous``.
    num_next : int, optional
        Number of rows above each occurrence to weight (default 5).
    num_previous : int, optional
        Number of rows below each occurrence to compare (default 2).

    Returns
    -------
    list of list of (str, float)
        One inner list per occurrence, ordered from the farthest row to
        the closest one, each entry being ``(domain, weight)``.
    """
    domains = db['domain'].values
    hit_positions = (db['url'] == url).values.nonzero()[0]

    # Farthest row first, closest row last: 1/2**(n-1), ..., 1/2, 1.
    base_weights = [1.0 / 2 ** (num_next - k - 1) for k in range(num_next)]

    urls_weighted = []
    for position in hit_positions:
        window_start = max(position - num_next, 0)
        rows_above = domains[window_start:position]
        rows_below = domains[position + 1:position + 1 + num_previous]

        # Double the weights once per leading match with the supplied
        # context; zip() stops safely when either side is shorter.
        boost = 1
        for expected, actual in zip(previous_url[:num_previous], rows_below):
            if expected != actual:
                break
            boost *= 2

        # Keep the weights of the closest rows when the window is truncated.
        aligned_weights = base_weights[num_next - len(rows_above):]
        urls_weighted.append(
            [(domain, weight * boost)
             for domain, weight in zip(rows_above, aligned_weights)]
        )

    return urls_weighted
    

# Example: weight the domains around each 9gag front-page row, using two
# domains as the context to compare against.
weighted_urls(db, 'http://9gag.com/', ['hive.itcapp.com', 'youtube.com'])

8


[[('9gag.com', 0.0625),
  ('9gag.com', 0.125),
  ('9gag.com', 0.25),
  ('9gag.com', 0.5),
  ('9gag.com', 1.0)],
 [('google.co.il', 0.25),
  ('lri.fr', 0.5),
  ('people.fas.harvard.edu', 1.0),
  ('google.co.il', 2.0),
  ('translate.google.com', 4.0)],
 [('watchepisodes3.com', 0.0625),
  ('google.co.il', 0.125),
  ('watchepisodes3.com', 0.25),
  ('watchepisodes3.com', 0.5),
  ('watch-episodes.com', 1.0)],
 [('youtube.com', 0.0625),
  ('youtube.com', 0.125),
  ('youtube.com', 0.25),
  ('youtube.com', 0.5),
  ('youtube.com', 1.0)],
 [('9gag.com', 0.0625),
  ('youtube.com', 0.125),
  ('youtube.com', 0.25),
  ('youtube.com', 0.5),
  ('youtube.com', 1.0)],
 [('youtube.com', 0.0625),
  ('9gag.com', 0.125),
  ('youtube.com', 0.25),
  ('lefigaro.fr', 0.5),
  ('youtube.com', 1.0)],
 [('tutorialspoint.com', 0.0625),
  ('google.co.il', 0.125),
  ('stackoverflow.com', 0.25),
  ('google.co.il', 0.5),
  ('google.co.il', 1.0)],
 [('9gag.com', 0.0625),
  ('google.co.il', 0.125),
  ('google.co.il', 0.25)

In [80]:
# Halving weights for five positions: 1, 1/2, 1/4, 1/8, 1/16.
weights = [0.5 ** exponent for exponent in range(5)]
weights

[1.0, 0.5, 0.25, 0.125, 0.0625]

In [67]:
from collections import Counter

# Count how often each domain appears in the row just above a 9gag
# front-page row. `next_url` is the helper defined in this notebook (the
# previous `next_page` name does not exist), and the result is no longer
# stored in a variable called `input`, which hid the builtin.
next_domains = next_url('http://9gag.com/')
c = Counter(next_domains)
print(c.items())

# Most frequent domain first; ties keep their first-seen order.
c.most_common()

[('youtube.com', 3), ('9gag.com', 1), ('google.co.il', 2), ('translate.google.com', 1), ('watch-episodes.com', 1)]


[('youtube.com', 3),
 ('google.co.il', 2),
 ('9gag.com', 1),
 ('translate.google.com', 1),
 ('watch-episodes.com', 1)]