# Importing Packages

In [1]:
# Basic
import collections
import datetime
import itertools
import json
import os
import re
import urllib
from collections import defaultdict
from itertools import chain
from urllib.parse import urlparse

import mysql.connector
import numpy as np
import pandas as pd

# Advanced
from pandas.io.json import json_normalize
import operator
from operator import itemgetter

# Language processing
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
from nltk.corpus import wordnet as wn
from gensim.models import word2vec
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.models.keyedvectors import KeyedVectors
from wordsegment import load, segment
load() # for spell checking
SpellCheck = segment

# Visualizations
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

# Establishing Connection

In [2]:
# Connection settings are read from the environment so that credentials are
# never stored in (or committed with) the notebook. Only the password is
# mandatory; the other settings fall back to the previous local defaults.
db_password = os.environ.get('TARGETDB_PASSWORD')
if db_password is None:
    raise RuntimeError(
        "Set the TARGETDB_PASSWORD environment variable before running this notebook."
    )

config_own = {
    'user': os.environ.get('TARGETDB_USER', 'root'),
    'password': db_password,
    'host': os.environ.get('TARGETDB_HOST', '127.0.0.1'),
    'database': os.environ.get('TARGETDB_NAME', 'TargetDB'),
    # A real boolean instead of the string 'True'.
    'ssl_disabled': True,
}
con_own = mysql.connector.connect(**config_own)

# Query Loader

In [3]:
# Load the first 10000 raw `entry` values of the request log into a DataFrame.
query_loader = pd.read_sql_query('select entry from requestlog limit 10000', con_own)

In [4]:
# Preview the first five raw log entries (JSON strings).
query_loader.head(5)

Unnamed: 0,entry
0,"{""visitorId"":""63077af1-c15d-40e8-9b2b-64c36d18..."
1,"{""visitorId"":""0f4c7121-f5ca-42ef-aad4-e6ed09af..."
2,"{""visitorId"":""7f0ac3e8-0a1e-4ae3-88c2-5daa045c..."
3,"{""visitorId"":""318666e5-b268-4bb7-87c3-04141445..."
4,"{""visitorId"":""a66172dc-85b7-4814-97c1-65ce52a0..."


# Making Data Dump

In [5]:
# NOTE(review): `query` is not used by any cell visible in this notebook; the
# data actually dumped comes from `query_loader` (limit 10000) — confirm intent.
query = 'select entry from requestlog limit 1000'
# File the parsed JSON entries are written to and later read back from.
path = 'query_sample_paper'

In [6]:
def query_to_json_file(query, column, path_file):
    """Parse a column of JSON strings and dump or return the parsed records.

    Parameters
    ----------
    query : pandas.DataFrame
        Frame holding one JSON document (as a string) per row in ``column``.
    column : str
        Name of the column that contains the JSON strings.
    path_file : str
        Destination file. When it is the empty string nothing is written and
        the parsed records are returned instead.

    Returns
    -------
    list or None
        The list of parsed records when ``path_file == ""``; otherwise ``None``
        (the records are written to ``path_file`` as one JSON array).

    Raises
    ------
    json.JSONDecodeError
        If any value in ``column`` is not valid JSON.
    """
    # Iterate over the column's values directly. The previous implementation
    # used DataFrame.get_value (removed in pandas 1.0) with a positional counter
    # as the label, which also failed for any non-default index.
    data_json = [json.loads(raw_entry) for raw_entry in query[column]]

    print("DONE 1")

    if path_file != "":
        with open(path_file, 'w') as outfile:
            json.dump(data_json, outfile, indent=0)

        print("DONE 2")

    else:
        return data_json

In [7]:
# Parse the loaded `entry` column and write the records to the file at `path`.
query_to_json_file(query_loader,'entry',path)

  


DONE 1
DONE 2


# Testing JSON Dump

In [8]:
def json_read(filepath, multiline=False):
    """Read a JSON file into a DataFrame without automatic date conversion.

    ``multiline`` is forwarded as ``lines=`` to ``pandas.read_json``; pass
    ``True`` for files that hold one JSON object per line.
    """
    loaded_frame = pd.read_json(
        filepath,
        lines=multiline,
        convert_dates=False,
    )
    print("Step 1/7 - Reading, done...")
    return loaded_frame

In [9]:
# Read back the dump written above to check that it loads as a DataFrame.
json_test = json_read(path)

Step 1/7 - Reading, done...


# Filtering

In [10]:
def filter_columns(data_frame):
    """Flatten ``collectorData`` and keep only the columns used downstream.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with a ``collectorData`` column of dicts. Together with the
        flattened ``collectorData`` fields it must provide ``visitorId``,
        ``timestamp``, ``pageUrl``, ``newVisit`` and ``pageId``.

    Returns
    -------
    pandas.DataFrame
        An independent copy restricted to those five columns, so callers can
        assign to it without triggering ``SettingWithCopyWarning``.

    Raises
    ------
    KeyError
        If ``collectorData`` or any of the kept columns is missing.
    """
    # pandas >= 1.0 exposes json_normalize at the top level; older versions
    # fall back to the notebook-level `from pandas.io.json import json_normalize`.
    normalize = getattr(pd, 'json_normalize', None) or json_normalize
    collector_data = normalize(data_frame['collectorData'])
    # Column-wise concat aligns on the index; both frames are expected to
    # share the same (default) index.
    all_data = pd.concat([collector_data, data_frame], axis=1)
    keep_list = ['visitorId', 'timestamp', 'pageUrl',
                 'newVisit', 'pageId']
    # .copy() detaches the result from `all_data` so later column assignments
    # operate on a real frame rather than on a slice.
    processed_data = all_data[keep_list].copy()
    print("Step 2/7 - Filtering, done...")
    return processed_data

In [11]:
# Reduce the loaded dump to the five columns needed by the later steps.
filter_test = filter_columns(json_test)

Step 2/7 - Filtering, done...


In [12]:
# fixing unix timestamp
def formatTime(value):
    """Format a Unix timestamp in milliseconds as 'YYYY-MM-DD HH:MM:SS.mmm'.

    The conversion uses ``datetime.datetime.fromtimestamp`` and therefore the
    local time zone of the machine running the notebook.

    Parameters
    ----------
    value : int or float (Python or NumPy scalar)
        Milliseconds since the Unix epoch.

    Returns
    -------
    str
        The timestamp truncated to millisecond precision.
    """
    # NumPy scalars (e.g. numpy.int64 from a pandas column) are rejected by
    # datetime.timedelta, so unwrap them to the equivalent Python number first.
    if hasattr(value, 'item'):
        value = value.item()
    timestamp, ms = divmod(value, 1000)
    dt = datetime.datetime.fromtimestamp(timestamp) + datetime.timedelta(milliseconds=ms)
    # %f yields microseconds (6 digits); drop the last three to keep milliseconds.
    formatted_time = dt.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
    return formatted_time

In [13]:
# Convert the epoch-millisecond `timestamp` column to readable strings.
# Mapping over the single column avoids building a row object for every record,
# and `assign` returns a new frame instead of writing into a slice.
# NOTE: this cell is not idempotent — re-running it would feed the already
# formatted strings back into formatTime. Re-run the filtering cell first.
filter_test = filter_test.assign(timestamp=filter_test['timestamp'].map(formatTime))

In [14]:
# Check the formatted timestamp on the first row.
filter_test.head(1)

Unnamed: 0,visitorId,timestamp,pageUrl,newVisit,pageId
0,63077af1-c15d-40e8-9b2b-64c36d188658,2016-08-29 18:46:05.809,https://www.onehippo.org/library/administratio...,True,hst:pages/documentation


# Sorting

In [15]:
def json_sort(file):
    """Return a copy of ``file`` ordered by visitor and then by timestamp."""
    ordered = file.sort_values(by=["visitorId", "timestamp"])
    print("Step 3/7 - Sorting, done...")
    return ordered

In [16]:
# Order the page views per visitor chronologically; the transaction
# extraction below walks each visitor's URLs in this order.
sortedData=json_sort(filter_test)

Step 3/7 - Sorting, done...


# Get Transaction

In [17]:
## Transaction extraction ("MF" algorithm — presumably Maximal Forward references; confirm)
class MFAlgorithm:
    """Split each visitor's ordered page views into navigation transactions.

    A transaction is a list of URLs visited in a forward direction. When the
    visitor returns to a URL already in the current transaction, the
    transaction collected so far is emitted and the current path is cut back to
    that URL.
    """

    @staticmethod
    def run_MF_algorithm(visitor, time, urls):
        """Extract the transactions of one visitor.

        Parameters
        ----------
        visitor : hashable
            Visitor identifier, copied into every result tuple.
        time : sequence
            Timestamps, parallel to ``urls``.
        urls : sequence of str
            The visitor's page URLs in visiting order.

        Returns
        -------
        list of tuple
            ``(visitor, timestamp, transaction)`` tuples where ``transaction``
            is a list of URLs and ``timestamp`` is the entry of ``time``
            recorded when that transaction started. Empty input yields ``[]``.
        """
        # Nothing to extract (the original indexed urls[0] and crashed here).
        if not urls:
            return []

        # (current, next, index_current, index_next) for every consecutive pair,
        # preceded by a sentinel pair with an empty "current" for the first URL.
        url_pairs = [('', urls[0], 0, 0)]
        url_pairs.extend(
            (urls[i], urls[i + 1], i, i + 1) for i in range(len(urls) - 1)
        )

        end_transaction = False
        all_transactions = []
        current_transaction = []
        timestamp = time[0]

        for current_url, next_url, index_current, index_next in url_pairs:
            # Initialize the transaction for the first URL.
            # NOTE(review): a real empty-string URL in the data would also take
            # this branch — confirm that pageUrl can never be ''.
            if current_url == '':
                current_transaction.append(next_url)
                timestamp = time[index_next]
                continue

            if next_url in current_transaction:
                # Backward move: emit the path once (not again for consecutive
                # backward moves), then cut the path back to the revisited URL.
                if not end_transaction:
                    # NOTE(review): all_transactions holds (visitor, timestamp,
                    # path) tuples, so this list-in-tuples test is always True and
                    # no de-duplication happens. Kept as-is to preserve results.
                    if current_transaction not in all_transactions:
                        all_transactions.append((visitor, timestamp, current_transaction))
                this_index = current_transaction.index(next_url)
                # Slicing creates a new list, so the emitted path is not mutated.
                current_transaction = current_transaction[0:this_index + 1]
                end_transaction = True
            else:
                # Forward move: the first one after a backward move starts a new
                # transaction, stamped with the time of the page moved from.
                if end_transaction:
                    end_transaction = False
                    timestamp = time[index_current]
                current_transaction.append(next_url)

        # Emit whatever path is left (same always-true check as above).
        if current_transaction not in all_transactions:
            all_transactions.append((visitor, timestamp, current_transaction))
        return all_transactions

    @staticmethod
    def init_algorithm(sortedData):
        """Run the extraction for every visitor in ``sortedData``.

        Parameters
        ----------
        sortedData : pandas.DataFrame
            Frame with ``visitorId``, ``timestamp`` and ``pageUrl`` columns;
            rows of each visitor must already be in visiting order.

        Returns
        -------
        list of tuple
            Concatenated results of ``run_MF_algorithm`` for all visitors, in
            the group order produced by ``groupby('visitorId')``.
        """
        result = []
        grouped = sortedData.groupby('visitorId')
        visitor_length = len(grouped)
        print("Initializing Transaction Extraction...")
        # The groupby iterator already yields each visitor's sub-frame, so there
        # is no need to look it up again with get_group().
        for i, (visitorId, group) in enumerate(grouped, start=1):
            time = group.timestamp.tolist()
            path = group.pageUrl.tolist()
            result.extend(MFAlgorithm.run_MF_algorithm(visitorId, time, path))
            if i % 1000 == 0:
                print("Progress:", round((i / visitor_length) * 100, 2), "%")
        return result

In [18]:
mfa = MFAlgorithm()
def get_transactions(sorted_data):
    """Extract transactions and attach the page-view columns of their first hit.

    The extracted (visitorId, timestamp, transactionPath) rows are inner-joined
    with ``sorted_data`` on visitor and timestamp; the ``timestamp`` and
    ``pageUrl`` columns are dropped from the result.
    """
    extracted = pd.DataFrame(
        mfa.init_algorithm(sorted_data),
        columns=['visitorId', 'timestamp', 'transactionPath'],
    )
    joined = extracted.merge(sorted_data, on=['visitorId', 'timestamp'])
    print("Step 4/7 - Extract transactions, done...")
    return joined.drop(columns=['timestamp', 'pageUrl'])

In [19]:
# Build one row per extracted transaction from the sorted page views.
fixed_transaction = get_transactions(sortedData)

Initializing Transaction Extraction...
Progress: 24.58 %
Progress: 49.15 %
Progress: 73.73 %
Progress: 98.3 %
Step 4/7 - Extract transactions, done...


In [20]:
# Inspect the first extracted transaction.
fixed_transaction.head(1)

Unnamed: 0,visitorId,transactionPath,newVisit,pageId
0,000036b2-64a9-497b-9dc2-75f4dd5ee397,[https://www.onehippo.org/],True,hst:pages/home


# Get Keywords

In [21]:
def morphForms(word):
    """Return the WordNet base form of ``word``, or ``word`` itself if none.

    The token 'cms' is returned unchanged without consulting WordNet.
    """
    if word == 'cms':
        return word
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
# Example: morphForms('discouraged') returns 'discourage'; morphForms('cats') returns 'cat'.

In [22]:
def get_keywords(d):
    """Extract a de-duplicated list of keywords from the path of URL ``d``.

    The URL path is stripped of common file extensions, digits and punctuation,
    split into segments on '/', and segments containing '-' or '_' are further
    split into their parts. Stop words are removed, the remaining tokens are
    lower-cased and reduced to their base form with ``morphForms``.

    Parameters
    ----------
    d : str
        A URL; only its path component is used.

    Returns
    -------
    list of str
        Unique keywords in arbitrary order (they pass through a ``set``).
    """
    # English stop words plus a few short tokens that carry no meaning as
    # keywords here, minus three words that are kept on purpose.
    # difference_update (unlike remove) does not fail if a word is missing
    # from the installed stop-word list.
    sw = set(stopwords.words('english'))
    sw.update(['en', 'nl', 'de', 'txt', 'oh', 'net', 'com', '1', 'login'])
    sw.difference_update(['about', 'why', 'in'])

    value = urlparse(d).path

    # Basic: strip common file extensions.
    for extension in (".html", ".xml", ".php"):
        value = value.replace(extension, "")

    # Advanced (raw strings so the regex escapes are not treated as string escapes):
    # digits, '+', '%', '&' and spaces become a space
    value = re.sub(r"[\d+ % &]", " ", value)
    # '.', ',' or '-' directly next to a space (or apostrophe) becomes a space
    value = re.sub(r"(?<=[' '])[.,-]|[.,-](?=[' '])", " ", value)
    # '.', ',' or '_' preceded or followed by a non-digit becomes a space
    value = re.sub(r'(?<=\D)[.,_]|[.,_](?=\D)', ' ', value)
    # every space becomes a dash so the dash-splitting below separates the words
    value = value.replace(' ', '-')
    # collapse runs of any remaining whitespace
    value = re.sub(r'\s\s+', ' ', value)
    # path segments, without the empty ones produced by leading/double slashes
    value = list(filter(None, value.split('/')))

    # Manually split segments containing '_' or '-'; the unsplit segment is
    # remembered in `dups` so that it is not kept as a keyword itself.
    dashed = []
    dups = []
    for part in value:
        if '_' in part:
            dashed.append(part.replace('_', '-').split('-'))
            dups.append(part)
        if '-' in part:
            dashed.append(part.split('-'))
            dups.append(part)

    resultList = set(x for x in value if x not in dups)
    resultList.update(chain.from_iterable(dashed))  # merge and keep unique ones
    # NOTE(review): stop words are filtered before lower-casing, so a
    # capitalised stop word (e.g. 'The') survives — confirm whether intended.
    resultList = [w for w in resultList if w not in sw]
    resultList = [x.lower() for x in resultList]
    resultList = [morphForms(word) for word in resultList]
    # drop empty tokens (e.g. from consecutive dashes) and de-duplicate again
    final_list = list(set(filter(None, resultList)))

    return final_list
    

In [23]:
def checkForList(listd):
    """Collect keywords for a single URL or for a list of URLs.

    A non-list argument is passed straight to ``get_keywords`` and its result
    (a list) is returned unchanged. For a list, the keywords of all URLs are
    merged into one ``set``; a set is used for the comparison with the
    baseline approach.
    """
    if not isinstance(listd, (list,)):
        return get_keywords(listd)

    return {keyword for url in listd for keyword in get_keywords(url)}


In [24]:
def get_content_page_and_keywords(data_frame):
    """Add ``keywords`` and ``contentPage`` columns to ``data_frame`` in place.

    ``keywords`` holds the keywords derived from each transaction path and
    ``contentPage`` its last URL. The same (modified) frame is returned.
    """
    paths = data_frame['transactionPath']
    data_frame['keywords'] = paths.map(checkForList)
    data_frame['contentPage'] = paths.str[-1]
    print("Step 5/7 - Keep content pages and get path keywords, done...")
    return data_frame

In [25]:
# Adds the columns to `fixed_transaction` itself; `keywords_df` is the same object.
keywords_df = get_content_page_and_keywords(fixed_transaction)

Step 5/7 - Keep content pages and get path keywords, done...


In [26]:
# Inspect the new `keywords` and `contentPage` columns on the first row.
keywords_df.head(1)

Unnamed: 0,visitorId,transactionPath,newVisit,pageId,keywords,contentPage
0,000036b2-64a9-497b-9dc2-75f4dd5ee397,[https://www.onehippo.org/],True,hst:pages/home,{},https://www.onehippo.org/


# Remove homepage

In [27]:
def remove_homepage(data_frame):
    """Drop single-page transactions that only hit the home or not-found page.

    Returns a new frame with a fresh 0..n-1 index; the input is not modified.
    """
    on_home = data_frame.pageId == 'hst:pages/home'
    on_not_found = data_frame.pageId == 'hst:pages/pagenotfound'
    single_page = data_frame.transactionPath.str.len() == 1

    unwanted_labels = data_frame[(on_home | on_not_found) & single_page].index
    data_frame = data_frame.drop(unwanted_labels).reset_index(drop=True)
    print("Step 6/7 - Remove visitors that only visited the homepage, done...")
    return data_frame

In [28]:
clean_homepage = remove_homepage(keywords_df)
# Drop transactions that produced no keywords. The keywords of a transaction
# path are a set, whose string form is 'set()', so the previous comparison of
# the string form against '[]' never removed anything; testing the length
# works for sets and lists alike.
clean_homepage = clean_homepage[clean_homepage['keywords'].map(len) > 0]

Step 6/7 - Remove visitors that only visited the homepage, done...


In [29]:
# Inspect the first remaining transaction after the cleanup.
clean_homepage.head(1)

Unnamed: 0,visitorId,transactionPath,newVisit,pageId,keywords,contentPage
0,0017bf12-aece-4c5d-a0df-eb6e93e10030,[https://www.onehippo.com/de/warum-hippo-cms],True,hst:pages/why-hippo-cms-default,"{warum, hippo, cms}",https://www.onehippo.com/de/warum-hippo-cms


# Making Data for the Neural Network and Scraping

In [None]:
# Save the cleaned transactions (homepage-only visits removed) for the keyword
# evaluation. The DataFrame index is written as the first CSV column.
clean_homepage.to_csv('Evaluation.csv')

In [None]:
# Training data for the model: one line per transaction, keywords separated by spaces
keywords_list = clean_homepage["keywords"].tolist()
data_NN = [' '.join(keywords) for keywords in keywords_list]
with open('data_NN.txt', 'w') as f:
    f.writelines("%s\n" % line for line in data_NN)

In [None]:
# Prepare links for scraping: one unique content-page URL per line.
url_list = clean_homepage["contentPage"].tolist()

# dict.fromkeys removes duplicates while keeping first-seen order, so the file
# content is the same on every run (a set would give an arbitrary order).
url_file = list(dict.fromkeys(url_list))

with open('url_file.txt', 'w') as f:
    for item in url_file:
        f.write("%s\n" % item)