### 2. Notebook setup

#### Imports

In [95]:
import pandas as pd
import numpy as np

import  csv

from time import sleep
from timeit import default_timer as timer

# custom general helper functions for this project
import custom_utils as cu
import importlib

In [102]:
# reload imports as needed
importlib.reload(cu);

#### Settings

In [2]:
# set up Pandas options
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 100)
pd.set_option('display.precision', 3)
pd.options.display.float_format = '{:.2f}'.format

#### Read in, check and clean the data

In [3]:
# Read the cleaned up EN clickstream tsv file into pandas
filepath = "../data/clickstream-enwiki-2018-12_clean.tsv"
df = pd.read_csv(filepath, sep='\t', names=['prev', 'curr', 'type', 'n'])

In [4]:
# Replace the false missing value NaNs with string "NaN"s
df['prev'] = df['prev'].fillna('NaN')
df['curr'] = df['curr'].fillna('NaN')

### 3. Data prep

#### Nodes

In [26]:
external_edges = df[df.type == "external"]
internal_edges = df[df.type != "external"]

##### Nodes with external traffic going to them

In [27]:
external_edges.head()

Unnamed: 0,prev,curr,type,n
0,other-empty,2019_Horizon_League_Baseball_Tournament,external,16
1,other-search,ForeverAtLast,external,40
2,other-empty,ForeverAtLast,external,85
6,other-search,Jehangir_Wadia,external,967
8,other-empty,Jehangir_Wadia,external,638


In [28]:
ext_nodes = set(external_edges["curr"])

In [29]:
len(ext_nodes)

5163109

In [30]:
list(ext_nodes)[:10]

['Martial_arts_therapy',
 'Arrondissement_of_Beauvais',
 'Hibiscadelphus_bombycinus',
 '2009_European_Touring_Car_Cup',
 'Leopold_Cassella',
 'Hugh_Wilson_(Northern_Ireland_politician)',
 'Conchylodes_gammaphora',
 'Hemin_Desai',
 'Gokcha_barbel',
 'Marduk_(band)']

In [72]:
def external_traffic(df, article, external_traffic_type):
    traffic = df.loc[(df.curr == article) & (df.prev == external_traffic_type), 'n'].values
    if len(traffic):
        return traffic[0]
    else:
        return ''

In [103]:
ext_nodes_list = list(ext_nodes)

In [None]:
start_time = timer()
with open("../data/external_nodes.tsv","w") as tsvfile:
    wr = csv.writer(tsvfile, delimiter='\t')
    
    # header
    wr.writerow(["title", 
                 "external_website_traffic", 
                 "other_wikimedia_traffic", 
                 "external_search_traffic", 
                 "empty_referer_traffic", 
                 "unknown_external_traffic"])
    #rows
    for article in ext_nodes_list:
        wr.writerow([article, 
                     external_traffic(external_edges, article, "other-external"), 
                     external_traffic(external_edges, article, "other-internal"), 
                     external_traffic(external_edges, article, "other-search"), 
                     external_traffic(external_edges, article, "other-empty"), 
                     external_traffic(external_edges, article, "other-other")])
        

cu.printRunTime(start_time)

In [None]:
!head ../data/external_nodes.tsv

In [None]:
# check the number of lines in the file
!wc -l ../data/external_nodes.tsv

##### Nodes without external traffic going to them

In [None]:
prev_nodes = set(internal_edges["prev"])
curr_nodes = set(internal_edges["curr"])

In [7]:
int_nodes = prev_nodes.union(curr_nodes)

In [8]:
len(int_nodes)

5185699

In [None]:
internal_only_nodes = int_nodes.difference(ext_nodes)

In [None]:
len(internal_only_nodes)

In [9]:
list(internal_only_nodes)[:10]

['Martial_arts_therapy',
 'Arrondissement_of_Beauvais',
 'Hibiscadelphus_bombycinus',
 '2009_European_Touring_Car_Cup',
 'Leopold_Cassella',
 'Hugh_Wilson_(Northern_Ireland_politician)',
 'Conchylodes_gammaphora',
 'Hemin_Desai',
 'Gokcha_barbel',
 'Marduk_(band)']

In [17]:
!ls ../data

clickstream-enwiki-2018-12.tsv       clickstream-enwiki-2018-12_clean.tsv


In [18]:
all_nodes_list = list(all_nodes)

with open("../data/all_nodes.tsv","w") as f:
    wr = csv.writer(f,delimiter="\n")
    wr.writerow(all_nodes_list)

In [23]:
!head ../data/all_nodes.tsv

head: ../data/all_nodes.tsv: No such file or directory


In [21]:
df.head(10)

Unnamed: 0,prev,curr,type,n
0,other-empty,2019_Horizon_League_Baseball_Tournament,external,16
1,other-search,ForeverAtLast,external,40
2,other-empty,ForeverAtLast,external,85
3,First_Families_of_Pakistan,Jehangir_Wadia,link,19
4,"The_Lawrence_School,_Sanawar",Jehangir_Wadia,link,36
5,Wadia_family,Jehangir_Wadia,link,715
6,other-search,Jehangir_Wadia,external,967
7,Ness_Wadia,Jehangir_Wadia,link,494
8,other-empty,Jehangir_Wadia,external,638
9,GoAir,Jehangir_Wadia,link,1191
