# Crawl Webpages to Create a Sitemap

In [2]:
import urllib
import urllib.request
import requests
from bs4 import BeautifulSoup
from collections import deque
import csv
from textblob import TextBlob
import nltk
nltk.data.path.append("/Users/niklasstoehr/Libraries");

import json
import sys
import re

import networkx as nx
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline

In [4]:
## <------------!!!!!!!!!! Change Crawl Behaviour

def get_links(url):

    for i in url:
        
        r = requests.get(i, allow_redirects = True, timeout=5)                                # <------------!!!!!!!!!!
        soup = BeautifulSoup(r.content, "lxml")
        
        hrefs = []

        for a in soup.find_all('a', href=True):
            hrefs.append(a['href'])
            
        hrefs = list(set(hrefs))   # remove duplicate links from webpage
        #print(hrefs)

        links = []
        for href in hrefs:
            if (href.startswith("http")): # inlude https://www.linkedin.com/company/11234128/     # <------------!!!!!!!!!!
                links.append(href)
                        
            if(href.startswith("#")): 
                if str(url[0])[-1] == "#":               # avoid domain//about (double backslash)
                    url[0] = url[0][:-1]
                    links.append(str(url[0]) + href)
                else:                                   # do domain/about (one backslash)
                    links.append(str(url[0]) + href)  
            if(href.startswith("/")):                    # include /ourwork
                if str(url[0])[-1] == "/":               # avoid domain//about (double backslash)
                    url[0] = url[0][:-1]
                    links.append(str(url[0]) + href)
                else:                                   # do domain/about (one backslash)
                    links.append(str(url[0]) + href)   
            else:
                pass

        #print(links)
        return links

In [5]:
def breadth_search(domain_name, search_depth, url):
    
    ## write crawl statistics
    crawl_txt = open('results/' + domain_name + '_crawl.csv', "w")
    crawl_txt.write('level,pages' + '\n')
    
    ## write to file and print
    crawl_txt.write(str(0) + ',' + str(1) + '\n')
    
    crawled = deque(url)  # nodes already used as starting points of crawl
    next_queue = deque([url])
    level = 0

    while level < search_depth:
        
        print ('\nlevel:', level ,'queue length:', len(next_queue))
        queue = next_queue
        next_queue = deque([])

        while (len(queue) > 0):
            
            url = queue.popleft()
            
            try:
                links = get_links(url)  # get links, maybe parse the result of last statement

                ## write to txt file
                new_page_count = 0
                for e in (links):
                    
                    if e not in url:  # check if link directs back to parent

                        #if ("www." + domain_name) in e:  # check if link is still on main page of company         # <------------!!!!!!!!!!
                        #if (domain_name) in e:  # check if link is still on main page of company         # <------------!!!!!!!!!!
 
                            #if (e.count('/') < 4+2): # if more backslashes than 5, stop the procedure              # <------------!!!!!!!!!!

                        file.write((str(url[0]) + '\t' + str(e) + '\n')) ## write to edges file --> save link

                        #____________________________________________________________________________________

                        if [e] not in next_queue: # check if node already in next queue

                            if e not in crawled: # check if node already used to crawl as starting point (source)

                                next_queue.append([e])
                                crawled.append(e) # nodes already used as starting points of crawl
                                new_page_count += 1

                print('\t ...added ', new_page_count, 'links to next queue\t', len(queue), ' left in current queue...')

            except:
                print("! cannot reach webpage !")
                pass
            
        level += 1
        print("\t ...total crawled pages:", len(crawled), "\n")
        
        ## write to file and print
        crawl_txt.write(str(level) + ',' + str(len(next_queue)) + '\n')

    crawl_txt.close()
    print("\n\ncrawler terminated")

In [6]:
#"https://www.vw.com"       -> no http hrefs, no #, depth 4+2, no redirects, no domain without www, redirect no
#"https://www.gm.com"       -> no http hrefs, depth 12+2, redirects, no domain without www, no #, redirect yes
#"https://www.toyota.com"   -> no http hrefs, no #, depth 3+2, www.domain, no redirects
#https://www.hyundai.co.uk  ("https://www.hyundai.com/worldwide/") -> no http hrefs, no #, depth 3+2, no domain without www, no redirects

url = "https://www.volkswagen.com"

domain_name = "vw"

## Start Crawling

In [7]:
file = open('results/links.txt', 'w')
search_depth = 2

breadth_search(domain_name, search_depth, [url])

file.close()


level: 0 queue length: 1
	 ...added  21 links to next queue	 0  left in current queue...
	 ...total crawled pages: 22 


level: 1 queue length: 21
	 ...added  7 links to next queue	 20  left in current queue...
	 ...added  7 links to next queue	 19  left in current queue...
! cannot reach webpage !
	 ...added  69 links to next queue	 17  left in current queue...
! cannot reach webpage !
	 ...added  20 links to next queue	 15  left in current queue...
	 ...added  23 links to next queue	 14  left in current queue...
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
! cannot reach webpage !
	 ...total crawled pages: 148 



crawler terminated


In [8]:
lines_seen = set() # holds lines already seen
outfile_txt = open('results/' + domain_name + '_edges.txt', "w")
outfile_csv = open('results/' + domain_name + '_edges.csv', "w")
outfile_csv.write('source,target\n')

for line in open('results/links.txt', "r"):
    if line not in lines_seen: # not a duplicate
        
        outfile_txt.write(line)
        
        csv_line = re.split(r'\t+', line)
        try:
            outfile_csv.write(str(csv_line[0])+ ',' + str(csv_line[1]))
        except:
            pass
        
        lines_seen.add(line)

## Close all CSVs
outfile_txt.close()
outfile_csv.close()

## Create Node List

In [None]:
nodes = list()

## find unique nodes____________________________________________

with open('results/' + domain_name + '_edges.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if (row['source'] not in nodes):
            nodes.append(row['source'])
        if (row['target'] not in nodes):
            nodes.append(row['target'])
            
print("nodes:", len(nodes))



## write nodes in nodes.csv_______________________________________

nodes_csv = open('results/' + domain_name + '_nodes.csv', "w")
nodes_csv.write('node' + '\n')
            
for node in nodes:
    
    nodes_csv.write(str(node) + '\n')

## close all CSVs
nodes_csv.close()
csvfile.close()

## Crawl Contents

### Topic Analysis

In [None]:
def topic_analysis(url, topic_dict):
                
    # Start Page Analsis_________________________________________________________________
    analysis_results = dict()
    
    try:  ## Webpage crawlable
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html)
            
    
        # Text Extraction _______________________________________________________________________

        # kill all script and style elements
        for script in soup(["script", "style", "head", "title", "[document]"]):
            script.extract()    # rip it out



        # get text
        text = soup.get_text()

        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)
        ## convert to lower case
        text = text.lower()
        ##exclude all text shorter than "min_characters""
        content = ""
        min_characters = 10
        for line in iter(text.splitlines()):
            if len(line) > min_characters:
                content = content + line + "\n"            


        # Check Amount of Videos _____________________________________________________________
        videos = len(soup.find_all('video', recursive=True))
        analysis_results["videos"] = videos



        # Check Connections to Social Media
        if "facebook" in content or "youtube" in content or "instagram" in content or "linkedin" in content or "twitter" in content:
            analysis_results["social_media"] = 1
        else:
            analysis_results["social_media"] = 0



        # Text Analysis _______________________________________________________________________

        for topic in topic_dict.keys():

            keyword_count = 0
            keywords = topic_dict[topic].split(",")

            for k in keywords:
                keyword_count += sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(k), content))

            analysis_results[topic] = keyword_count


        # Sentiment Analysis _______________________________________________________________________
        blob = TextBlob(content)
        sentiment = round(blob.sentiment.polarity,4)
        analysis_results["sentiment"] = sentiment


        # Language Analysis _______________________________________________________________________  
        language = blob.detect_language()
        analysis_results["language"] = language


        #print(analysis_results)

        return analysis_results
    
    
    
    except: ## Webpage not crawlable
        
        print("could not crawl webpage")
        analysis_results["emobility"] = 0
        analysis_results["autonomous"] = 0
        analysis_results["ai"] = 0
        analysis_results["sentiment"] = 0.0
        analysis_results["language"] = "en"
        analysis_results["videos"] = 0
        analysis_results["social_media"] = 0

        if "facebook" in url or "youtube" in url or "instagram" in url or "linkedin" in url or "twitter" in url:
            analysis_results["social_media"] = 1

        return analysis_results # 'exit' function and return to caller


#webpage = "https://www.audi.com/en.html"
#topic_results = topic_analysis(webpage, topic_dict)

### Define Topics

In [None]:
topic_dict = dict()
topic_dict["emobility"] = "emobility,battery,environment,bio,eco,ecological,electric,hybrid,environmental"
topic_dict["autonomous"] = "autonomous,self-driving"
topic_dict["ai"] = "ai,machine learning,artificial intelligence,intelligent,neural network,algorithm"                                     

## Analyse

In [None]:
## Reader
nodes_csv = open('results/' + domain_name + '_nodes.csv')
reader = csv.DictReader(nodes_csv)

## Writer
nodes_tags_csv = open('results/' + domain_name + '_nodes_tags.csv', "w")
nodes_tags_csv.write('id,label,videos,social_media,emobility,autonomous,ai,sentiment,language' + '\n')

## Do Analysis
i = 0
for row in reader:
    print("\n", i ,"of", len(nodes))
    webpage = row['node']

    topic_results = topic_analysis(webpage, topic_dict)
    print(topic_results)
    nodes_tags_csv.write(str(webpage) + "," + str(webpage) + "," + str(topic_results['videos']) + "," + str(topic_results['social_media']) + "," + str(topic_results['emobility']) + "," + str(topic_results['autonomous']) + "," + str(topic_results['ai']) + "," + str(topic_results['sentiment']) + "," + str(topic_results['language']) + "\n")

    i += 1

print("\n\n alaysis terminated")
    
nodes_csv.close()
nodes_tags_csv.close()

## Visualize as Network

In [None]:
fh=open('results/' + domain_name + '.txt', 'rb')
G=nx.read_edgelist(fh, delimiter='\t', nodetype = str)
fh.close()

nx.draw(G, with_labels = False, node_size=2, edge_size = 1)
#plt.savefig("results/"+ domain_name + ".png", format="PNG", dpi = 600)

#plt.show()