In [26]:
import pandas as pd
import re
from collections import Counter
from operator import itemgetter
import os

In [27]:
def is_url_valid(url):
    """Return True when ``url`` is a string that starts with ``"http"``.

    Non-string values (for example the float NaN pandas uses for a missing
    cell, or None) are reported as invalid instead of raising ``TypeError``
    when sliced.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    if not isinstance(url, (str, type(u""))):
        return False
    return url.startswith("http")
    

def domain_without_www(domain):
    """Strip a leading ``"www."`` from ``domain``.

    Values that are not strings (for example NaN or None coming from a
    missing cell) are returned unchanged instead of raising ``TypeError``,
    so a later ``dropna`` can still remove them.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(domain, (str, type(u""))) and domain.startswith("www."):
        return domain[4:]
    return domain


def filter_rows(df):
    """Return a cleaned copy of ``df``; the frame passed in is not modified.

    Steps:
      1. drop every row that has a missing value in any column,
      2. keep only the rows whose ``url`` passes ``is_url_valid``,
      3. strip a leading ``"www."`` from ``domain``,
      4. sort by ``visit_time_ms`` in descending order,
      5. renumber the index from 0.

    Incomplete rows are dropped first so that the per-cell helpers never
    receive NaN; the resulting rows are the same as when dropping them last.
    """
    complete = df.dropna()

    # astype(bool) keeps the mask boolean even when the frame is empty.
    keep = complete['url'].apply(is_url_valid).astype(bool)

    # Explicit copy: the column assignment below must not touch ``df``.
    cleaned = complete[keep].copy()
    cleaned['domain'] = cleaned['domain'].apply(domain_without_www)

    cleaned = cleaned.sort_values('visit_time_ms', ascending=False)
    # drop=True discards the old index without creating a helper column.
    return cleaned.reset_index(drop=True)


def fuck_of_this_domain(df, domains):
    """Return ``df`` without the rows whose ``domain`` is listed in ``domains``.

    The index of the result is renumbered from 0. ``df`` itself is left
    untouched. Rows whose ``domain`` is missing are kept.
    """
    # One vectorised membership test instead of one filtering pass per domain.
    kept = df.loc[~df['domain'].isin(list(domains))]
    # drop=True discards the old index directly, so a real column that happens
    # to be called 'index' (or a named index) is handled correctly.
    return kept.reset_index(drop=True)
    

def clean_date(df):
    """Merge the ``day`` and ``time`` columns into parsed timestamps in ``day``.

    For every row, ``-`` and ``:`` in ``day`` are replaced by ``/``, the
    result is joined to ``time`` with a ``-`` and handed to
    ``pd.to_datetime``. Rows that cannot be parsed (or whose cells are not
    strings) get a missing value. The ``time`` column is then dropped and
    the frame is passed through ``filter_rows``, whose result is returned.

    The frame passed in is not modified.
    """
    df = df.copy()

    parsed = []
    # Iterate over the values themselves: no label lookups and no chained
    # ``df.day[i] = ...`` assignment.
    for day, time in zip(df['day'], df['time']):
        try:
            parsed.append(pd.to_datetime(re.sub('[-:]', '/', day) + '-' + time))
        except (ValueError, TypeError):
            # ValueError: unparseable text; TypeError: non-string cell (e.g. NaN).
            parsed.append(None)

    df['day'] = parsed
    return filter_rows(df.drop('time', axis=1))


def find_indexes(df, url):
    """Return the index labels (as a list) of the rows whose ``url`` equals ``url``."""
    is_match = (df['url'] == url).values
    return df.index[is_match].tolist()


def replace_same_url(df, url, indexes):
    """Return ``indexes`` with every row that points at ``url`` replaced.

    Each label in ``indexes`` whose row has ``df.url == url`` is removed. As
    a replacement, the first row located after the current last label whose
    url differs from ``url`` is appended. When no such row exists (the frame
    ends, or the window became empty) nothing is appended, so the result
    may be shorter than the input.

    ``indexes`` may be any iterable of integer labels (a list or a
    ``range``); it is copied, never modified in place. ``df`` is expected to
    have consecutive integer labels, as produced by ``reset_index``.
    """
    indexes = list(indexes)
    last_label = df.index.max()

    position = 0
    while position < len(indexes):
        if df.url[indexes[position]] != url:
            position += 1
            continue

        # Do not advance ``position``: after the pop it already designates
        # the next element to inspect.
        indexes.pop(position)
        if not indexes:
            # Nothing left to extend from.
            break

        candidate = indexes[-1] + 1
        # Bounds are checked before the lookup so the scan never reads past
        # the last row.
        while candidate <= last_label and df.url[candidate] == url:
            candidate += 1
        if candidate <= last_label:
            indexes.append(candidate)
    return indexes


def weighted_urls(df, url, previous_url, NUM_OF_NEXT=5, NUM_OF_PREVIOUS=2):
    """Weight the urls found next to every occurrence of ``url`` in ``df``.

    For each row ``element`` where ``df.url == url``:

    * the "after" window is the ``NUM_OF_NEXT`` rows just above it (labels
      ``element - NUM_OF_NEXT`` .. ``element - 1``), passed through
      ``replace_same_url``;
    * the "before" window is the ``NUM_OF_PREVIOUS`` rows just below it
      (labels ``element + 1`` ..), clipped to the end of the frame.

    Frames returned by ``filter_rows`` are sorted by descending
    ``visit_time_ms``, so for those frames rows above an occurrence were
    visited after it and rows below were visited before it.

    Position ``k`` of the after window gets the base weight
    ``1 / 2 ** (NUM_OF_NEXT - k - 1)``, so the row closest to the occurrence
    weighs 1. All weights of an occurrence are doubled once for every
    leading entry of ``previous_url`` that equals the corresponding entry of
    its before window (comparison stops at the first mismatch).

    Both windows are built from the same occurrence, so they can never be
    paired with each other's data.

    Returns a list with one entry per usable occurrence; each entry is a
    list of ``(url, weight)`` tuples. ``df`` is expected to have consecutive
    integer labels starting at 0.
    """
    base_weights = [float(1) / 2 ** (NUM_OF_NEXT - k - 1) for k in range(NUM_OF_NEXT)]
    last_label = df.index.max()
    urls_weighted = []

    for element in find_indexes(df, url):
        # NOTE(review): the guard is strict, so an occurrence whose window
        # would start exactly at row 0 is skipped too. Kept as it was to
        # leave existing scores unchanged -- confirm whether ``< 0`` was meant.
        if element - NUM_OF_NEXT <= 0:
            continue

        # A real list is passed so the helper can pop/append on any Python version.
        after_labels = replace_same_url(df, url, list(range(element - NUM_OF_NEXT, element)))
        if not after_labels:
            continue
        before_labels = [label
                         for label in range(element + 1, element + 1 + NUM_OF_PREVIOUS)
                         if label <= last_label]

        urls_after = df['url'].loc[list(after_labels)].values
        urls_before = df['url'].loc[before_labels].values

        # Count how many leading previous urls match this occurrence's history.
        comparable = min(NUM_OF_PREVIOUS, len(previous_url), len(urls_before))
        matched = 0
        while matched < comparable and previous_url[matched] == urls_before[matched]:
            matched += 1
        boost = 2 ** matched

        urls_weighted.append([(visited, weight * boost)
                              for visited, weight in zip(urls_after, base_weights)])
    return urls_weighted


def get_weigths(df, url, previous_url):
    """Sum the weights returned by ``weighted_urls`` per url.

    Returns a list of ``(url, total_weight)`` tuples sorted by total weight,
    largest first.
    """
    totals = {}
    for pairs in weighted_urls(df, url, previous_url):
        for visited, weight in pairs:
            totals[visited] = totals.get(visited, 0) + weight
    return sorted(totals.items(), key=itemgetter(1), reverse=True)


def is_in_time(df, index, threshold):
    """Compare the gap between rows ``index - 1`` and ``index`` to ``threshold``.

    The gap is ``df.day[index - 1] - df.day[index]``. Returns False when it
    is strictly smaller than ``threshold`` seconds and True otherwise.
    """
    gap = df.day[index - 1] - df.day[index]
    limit = pd.Timedelta(threshold, 's')
    return not (gap < limit)

In [28]:
def to_int(number):
    """Convert ``number`` to ``float`` (despite the name), accepting a decimal comma.

    Strings have every ``","`` replaced by ``"."`` before conversion. Any
    other value (float, int, numpy scalar, ...) is passed straight to
    ``float`` instead of being searched for a comma, which used to raise
    ``TypeError`` for integers.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(number, (str, type(u""))):
        number = number.replace(",", ".")
    return float(number)


def delta(df):
    """Return a copy of ``df`` sorted by descending ``visit_time_ms`` with a ``delta`` column.

    ``visit_time_ms`` is converted with ``to_int`` first. ``delta`` holds,
    for each row, its ``visit_time_ms`` minus that of the next row in the
    sorted order; the last row gets 0. The frame passed in is not modified
    and the original index labels are kept.
    """
    df = df.copy()
    df['visit_time_ms'] = df['visit_time_ms'].apply(to_int)
    df = df.sort_values('visit_time_ms', ascending=False)

    # diff(-1) is positional (row i minus row i + 1), so it follows the
    # sorted order whatever the index labels are.
    gaps = df['visit_time_ms'].diff(-1)
    if len(gaps) > 0:
        gaps.iloc[-1] = 0  # the last row has no successor
    df['delta'] = gaps
    return df

In [29]:
def treat_data(df_path, filtered_domains):
    """Load one history export and return it cleaned.

    The file at ``df_path`` is read as a ``;``-separated CSV without a
    header row and is expected to have eight columns. Five of them are
    kept, then the frame goes through ``filter_rows`` and the rows whose
    domain is in ``filtered_domains`` are removed.
    """
    all_columns = ['url', 'domain', 'root domain', 'visit_time_ms', 'visit_time_str',
                   'day of the week', 'transition_type', 'page title']
    kept_columns = ['url', 'domain', 'visit_time_ms', 'visit_time_str', 'transition_type']

    history = pd.read_csv(df_path, delimiter=";", header=None)
    history.columns = all_columns
    history = history[kept_columns]
    return fuck_of_this_domain(filter_rows(history), filtered_domains)


def get_other_databases(folder_path, filtered_domains):
    """Load every non-hidden file of ``folder_path`` with ``treat_data``.

    Entries whose name starts with ``"."`` are skipped. Files are processed
    in sorted name order so the returned list is the same on every run, and
    paths are built with ``os.path.join`` so ``folder_path`` works with or
    without a trailing separator.

    Returns the list of cleaned frames.
    """
    df_list = []
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.startswith("."):
            continue
        df_list.append(treat_data(os.path.join(folder_path, file_name), filtered_domains))
    return df_list

In [30]:
# Domains whose rows are removed from every history frame loaded below.
filtered_domains = (['whatsapp.com','web.whatsapp.com', 'twitter.com', 'linkedin.com',
                     'google.co.il','fr-fr.messenger.com','youtube.com','facebook.com', 'localhost',
                     'plus.google.com','google.fr', 'mail.google.com', 'google.com','messenger.com',
                     'listenonrepeat.com', 'drive.google.com', 'docs.google.com', 'calendar.google.com',
                     'chrome.google.com', 'gmail.com', 'lefigaro.fr'])
# Input locations, relative to the directory the notebook is run from.
df_path = 'databases/antho_secrets.csv'
folder_path = 'databases/other_databases/'
# df: the single history file at df_path; df_list: one frame per file in folder_path.
df = treat_data(df_path, filtered_domains)
df_list = get_other_databases(folder_path, filtered_domains)

In [31]:
# Example query for the cells below: arg2 is the url to look up and arg3 is
# the list compared against the rows that follow each occurrence of it.
arg2 = 'http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html'
arg3 = ["", ""]


def sum_score(df_list, arg2, arg3):
    """Add up the ``get_weigths`` scores of every frame in ``df_list``.

    ``arg2`` and ``arg3`` are forwarded to ``get_weigths`` as its ``url``
    and ``previous_url`` arguments. Returns a list of ``(url, total_score)``
    tuples sorted by total score, largest first.
    """
    totals = {}
    for history in df_list:
        for visited, score in get_weigths(history, arg2, arg3):
            totals[visited] = totals.get(visited, 0) + score
    return sorted(totals.items(), key=itemgetter(1), reverse=True)

# Despite the name, score_dic is a list of (url, score) tuples, highest score first.
score_dic = sum_score(df_list, arg2, arg3)

In [32]:
# Five best-scoring urls. The call form of print prints the same text on
# Python 2 and is required on Python 3.
print(score_dic[:5])

[('http://stackoverflow.com/questions/29438265/stratified-train-test-split-in-scikit-learn', 2.5), ('http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html', 1.6875), ('http://pythoncentral.io/introduction-to-tweepy-twitter-for-python/', 1.5625), ('http://scikit-learn.org/stable/install.html#installation-instructions', 1.1875), ('http://stackoverflow.com/questions/14661701/how-to-drop-a-list-of-rows-from-pandas-dataframe', 1.0)]


In [33]:
# Show the scores of each frame of df_list separately, with a separator
# between frames. The call form of print works on Python 2 and Python 3.
arg2, arg3 = 'http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html', [ "", ""]
for dataframe in df_list:
    for i in get_weigths(dataframe, arg2, arg3):
        print(i)
    print("---------------------------------------------------------------------------------------------------\n\n")
    print("---------------------------------------------------------------------------------------------------")

('http://stackoverflow.com/questions/29438265/stratified-train-test-split-in-scikit-learn', 1.5)
('http://stackoverflow.com/questions/20076195/what-is-the-most-efficient-way-of-counting-occurrences-in-pandas', 1.0)
('http://datascience.stackexchange.com/questions/2368/machine-learning-features-engineering-from-date-time-data', 1.0)
('http://stackoverflow.com/questions/25039626/find-numeric-columns-in-pandas-python', 1.0)
('http://stackoverflow.com/questions/13411544/delete-column-from-pandas-dataframe', 1.0)
('http://stackoverflow.com/questions/25146121/extracting-just-month-and-year-from-pandas-datetime-column-python', 0.5625)
('http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.value_counts.html', 0.5)
('http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html', 0.5)
('https://fr-fr.facebook.com/', 0.5)
('https://www.quora.com/How-can-I-hide-mouse-cursor', 0.375)
('http://stackoverflow.com/questions/34842405/parameter-stratify-from-metho

In [34]:
# Count in how many frames (df plus every frame of df_list) each url appears:
# urls are de-duplicated per frame first, so a url counts once per frame.
list_set = [set(df.url.values)]
for dataframe in df_list:
    list_set.append(list(set(dataframe.url.values)))
a = [item for sublist in list_set for item in sublist]
b = Counter(a)
# The call form of print works on Python 2 and Python 3.
for i in b.most_common()[:5]:
    print(i)

('http://hive.itcapp.com/', 7)
('https://github.com/', 6)
('https://israeltechallenge.com/', 6)
('https://trello.com/', 5)
('http://hive.itcapp.com/#', 5)


In [41]:
def get_results_yourself(df_yourself, current_url, previous_urls):
    """Return the urls of the (at most) five best ``get_weigths`` entries for ``df_yourself``."""
    ranked = get_weigths(df_yourself, current_url, previous_urls)
    return [entry[0] for entry in ranked[:5]]


def get_results_others(df_list, current_url, previous_urls):
    """Return the urls of the (at most) five best ``sum_score`` entries over ``df_list``.

    ``current_url`` and ``previous_urls`` are forwarded to ``sum_score``;
    the result depends only on the arguments, not on the notebook-level
    ``arg2`` / ``arg3`` variables.
    """
    ranked = sum_score(df_list, current_url, previous_urls)
    return [entry[0] for entry in ranked[:5]]

def split_website_videos(results):
    """Split ``results`` into ``(websites, videos)``.

    A result goes to ``videos`` when it matches the YouTube watch-page
    pattern at its start, and to ``websites`` otherwise. Order is preserved
    inside each list.
    """
    watch_page = re.compile(r"^https://www.youtube.com/watch")
    buckets = {True: [], False: []}
    for result in results:
        buckets[watch_page.search(result) is not None].append(result)
    return buckets[False], buckets[True]

def _write_lines(path, lines):
    """Overwrite the file at ``path`` with the items of ``lines``, one per line."""
    # The context manager closes the file even if a write fails.
    with open(path, "w") as handle:
        for line in lines:
            handle.write(line)
            # Text mode already translates "\n" to the platform line ending;
            # writing os.linesep here would double the carriage return on Windows.
            handle.write("\n")


def interface_front_end(df_yourself, current_url, previous_urls, others="Results/others.txt", yourself="Results/yourself.txt", youtube="Results/youtube.txt", others_dfs=None):
    """Write the recommendations for ``current_url`` to three text files.

    * ``others``: non-video recommendations computed from ``others_dfs``;
    * ``yourself``: recommendations computed from ``df_yourself``;
    * ``youtube``: video recommendations computed from ``others_dfs``.

    Each file is overwritten and receives one url per line.

    ``others_dfs`` is the list of frames used for the "others"
    recommendations; when omitted, the notebook-level ``df_list`` is used.
    """
    if others_dfs is None:
        others_dfs = df_list

    websites, videos = split_website_videos(get_results_others(others_dfs, current_url, previous_urls))

    _write_lines(others, websites)
    _write_lines(yourself, get_results_yourself(df_yourself, current_url, previous_urls))
    _write_lines(youtube, videos)

In [42]:
# Output files; they are passed explicitly so editing them here takes effect.
others = "Results/others.txt"
yourself = "Results/yourself.txt"
youtube = "Results/youtube.txt"

arg2 = 'http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html'
arg3 = [ "", ""]

interface_front_end(df, arg2, arg3, others, yourself, youtube)