In [None]:
import json
import numpy as np
import os
import pandas as pd
import re
import requests

# Notebook-wide configuration shared by the cells below.

# Site root; relative page paths are appended to it when downloading.
url_base = "https://futurice.com/"

# Data directory: "regex_scrapper_legacy" is resolved against the current
# working directory, then two dirname() calls climb to the parent of the
# working directory, where a "data/" folder is expected.
file_base = os.path.dirname(
    os.path.dirname(os.path.abspath("regex_scrapper_legacy"))
) + "/data/"

# Multi-character field separator for the csv files.
sep = ';;;'

In [None]:
def download_raw(url, headers=None, timeout=30):
    """Download a page below ``url_base`` and return its body as a string.

    Parameters
    ----------
    url : str
        Path relative to ``url_base``; the two are concatenated as-is.
    headers : dict, optional
        Extra HTTP headers to send. ``None`` (the default) sends only the
        headers ``requests`` adds on its own, which is what this function
        has always done.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection raises
        instead of blocking the scraping loop indefinitely.

    Returns
    -------
    str
        The response body decoded as UTF-8.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure or timeout.
    UnicodeDecodeError
        If the body is not valid UTF-8.
    """
    # headers must be passed by keyword: the second positional argument of
    # requests.get is ``params`` (query string), not ``headers``.
    response = requests.get(url_base + url, headers=headers, timeout=timeout)

    # NOTE(review): the HTTP status is not checked, so error pages are
    # returned like any other body -- callers then report missing fields.
    # The body is already buffered (no stream=True), so no chunked read is needed.
    return (response.content or b"").decode(encoding='utf-8')
            

In [None]:
def process_blog_post(string):
    """Extract every ``"text":"..."}`` fragment from ``string`` and clean it.

    Each fragment is the text between ``"text":"`` and the next ``"}`` on the
    same line (``.`` does not match a newline, so fragments never contain
    one). The closing quote is excluded from the fragment.

    Cleaning, applied in order to each fragment:

    1. the two-character sequence backslash + ``n`` becomes a space;
    2. apostrophes and ``#`` are removed (so ``it's`` becomes ``its``);
    3. parenthesised ``(http...)`` targets and a bare ``(s://)`` are removed;
    4. any remaining literal ``s://`` is removed;
    5. markdown images ``![alt](target)`` are removed;
    6. remaining square brackets are removed.

    Parameters
    ----------
    string : str
        Raw page source.

    Returns
    -------
    str
        All cleaned fragments concatenated without a separator; ``""`` when
        no fragment is found.
    """
    # Raw strings keep the regex escapes (\(, \[, ...) from being read as
    # invalid Python string escapes. Order matters: each pattern runs on the
    # output of the previous one.
    substitutions = (
        (r'\\n', " "),                  # literal backslash + "n"
        (r"'", ""),
        (r"#", ""),
        (r"\((http.*?|s://)\)", ""),
        # The trailing lazy ".*?" matches the empty string, so this removes
        # only the literal "s://" itself.
        (r"(s://.*?)", ""),
        (r"!\[.*?\]\(.*?\)", ""),
        (r"\[", ""),
        (r"\]", ""),
    )

    fragments = []
    # The capture group returns only the text between the delimiters.
    for text in re.findall(r'"text":"(.*?)"}', string):
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        fragments.append(text)

    return "".join(fragments)

In [None]:
def get_blog_links(string):
    
    ret = []
    for i in re.findall(r'blog/[a-zA-Z0-9/-]+[a-zA-Z0-9]+', string):
        if i not in ret:
            ret.append(i)
        
    return ret

In [None]:
def get_title(string):
    """Return the first ``content="..."`` value that follows ``</title>``.

    The attribute must be on the same line as ``</title>`` because ``.``
    does not match a newline.
    NOTE(review): presumably this is the page's title meta tag -- the
    pattern itself only requires a ``content`` attribute; confirm against
    the page markup.

    Parameters
    ----------
    string : str
        Raw page source.

    Returns
    -------
    str
        The attribute value, or ``""`` (after printing an error line) when
        the pattern is not found.
    """
    # A single search with a capture group: the value is taken from the
    # match itself instead of being sliced with a length measured by a
    # second, independent search that may match elsewhere.
    match = re.search('</title>.*?content="(.*?)"', string)
    if match is None:
        print("----Error: title not found")
        return ""
    return match.group(1)
    
def get_teaser(string):
    """Return the value of the first ``"teaser":"..."`` pair in ``string``.

    The value runs up to the next double quote. When no such pair exists an
    error line is printed and ``""`` is returned.
    """
    match = re.search('"teaser":".*?"', string)
    if match is None:
        print("----Error: teaser not found")
        return ""

    # Drop the key prefix and the closing quote from the matched text.
    prefix_length = len('"teaser":"')
    return match.group()[prefix_length:-1]

def get_datetime(string):
    """Return the ``dateTime`` attribute of the first ``<time dateTime="...">``.

    The value is returned verbatim as a string (no date parsing). When the
    tag is missing an error line is printed and ``""`` is returned.
    """
    found = re.search('<time dateTime=".*?"', string)
    if found is not None:
        # Strip the tag prefix in front and the closing quote at the end.
        return found.group()[len('<time dateTime="'):-1]

    print("----Error: date not found")
    return ""

def get_category(string):
    """Return the inner text of the first category ``<span>`` in ``string``.

    The span is identified by its exact ``class`` attribute value.
    NOTE(review): that class name looks auto-generated; if the site's markup
    is rebuilt this lookup will presumably stop matching -- confirm.

    When the span is missing an error line is printed and ``""`` is returned.
    """
    opening = '<span class="sc-7f8efa2d-1 gwaujb">'
    closing = '</span>'

    hit = re.search('<span class="sc-7f8efa2d-1 gwaujb">.*?</span>', string)
    if hit is None:
        print("----Error: category not found")
        return ""

    # Keep only what sits between the opening and closing tags.
    return hit.group()[len(opening):-len(closing)]

In [None]:
def process_file_to_data(string, url, csv = False, file_name = "blog_text.csv"):
    """Extract the fields of one downloaded page and write them to disk.

    Parameters
    ----------
    string : str
        Raw page source.
    url : str
        Relative url of the page. In JSON mode it is also the output path
        below ``file_base``.
    csv : bool, optional
        ``True`` appends one ``sep``-separated row to ``file_base + file_name``;
        ``False`` (default) writes a new JSON file at ``file_base + url``.
    file_name : str, optional
        csv file name, used only when ``csv`` is true.

    Returns
    -------
    dict
        ``{"url", "title", "time", "category", "text"}`` as written.

    Notes
    -----
    csv fields are written without quoting or escaping, so a field that
    contains ``sep`` would split into extra columns when read back.
    """
    title = get_title(string)
    # The teaser is not part of the stored record; the call is kept so its
    # "teaser not found" diagnostic is still printed.
    get_teaser(string)

    # Built once and reused for both the file output and the return value.
    record = {
        "url": url,
        "title": title,
        "time": get_datetime(string),
        "category": get_category(string),
        "text": process_blog_post(string),
    }

    if not csv:
        target = file_base + url
        # urls such as "blog/some-post" point into a sub-directory that may
        # not exist yet; create it so open() does not fail.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(target, "w", encoding='utf-8') as fd:
            json.dump(record, fd, indent = 4)
    else:
        row = sep.join([
            record["url"],
            record["title"],
            record["time"],
            record["category"],
            record["text"],
        ])
        with open(file_base + file_name, "a", encoding='utf-8') as fd:
            fd.write(row + "\n")

    return record

In [None]:
def get_base_urls(path=None):
    """Load the list of relative blog urls from a csv file.

    Parameters
    ----------
    path : str, optional
        csv file to read. Defaults to ``file_base + "blog_urls_all.csv"``.
        The first row is treated as a header and only the first column is
        used.

    Returns
    -------
    list of str
        First-column values with one leading ``/`` removed.
    """
    if path is None:
        path = file_base + "blog_urls_all.csv"

    url_table = pd.read_csv(path, encoding = 'utf-8')
    # regex=True is required: since pandas 2.0 str.replace treats the
    # pattern literally by default, and "^/" would then never match.
    return url_table.iloc[:, 0].str.replace('^/', '', regex=True).tolist()
        
# Sanity check: preview the first ten urls instead of echoing the whole
# list into the cell output.
get_base_urls()[:10]

In [None]:
def base_loading(max_num = 900, to_csv = True, create_new = True):
    """Download and process the urls returned by ``get_base_urls``.

    Parameters
    ----------
    max_num : int, optional
        Maximum number of urls to process in this run.
    to_csv : bool, optional
        ``True`` appends every page as a row to ``blog_text.csv``; ``False``
        writes one JSON file per page plus ``urls.csv`` listing the
        processed urls.
    create_new : bool, optional
        With ``to_csv``, start ``blog_text.csv`` afresh with a header line
        instead of appending to an existing file.

    Side effects (all below ``file_base``)
    --------------------------------------
    * ``unread.csv``  -- urls left unprocessed, written only if any remain.
    * ``linked.json`` -- for each processed url, the blog links found on
      its page; written only if at least one url was processed.

    Exceptions raised while downloading or writing propagate to the caller;
    the bookkeeping files are then not written.
    """
    unread_urls = get_base_urls()
    read_urls = []   # urls processed in this run, in processing order
    linked = dict()
    count = 0

    if to_csv and create_new:
        # Writing labels to file
        file_name = "blog_text.csv"
        with open(file_base + file_name, "w", encoding='utf-8') as fd:
            fd.write(sep.join(["url", "title", "time", "category", "text"]) + "\n")

    # Loop until max_num is reached or nothing is left in unread_urls
    while len(unread_urls) > 0 and count < max_num:
        count += 1
        url = unread_urls.pop(0)
        print(str(count) + ": " + url)

        string = download_raw(url)
        process_file_to_data(string, url, csv = to_csv)

        read_urls.append(url)
        linked[url] = get_blog_links(string)

    if not to_csv:
        # Index of the url/names of the per-post files written above
        with open(file_base + "urls.csv", "w", encoding='utf-8') as fd:
            fd.writelines(item + "\n" for item in read_urls)

    # Save the urls not yet read, so a later run can pick up the remainder
    if len(unread_urls) > 0:
        with open(file_base + "unread.csv", "w", encoding='utf-8') as fd:
            fd.writelines(item + "\n" for item in unread_urls)

    # Save which blog post linked to which others
    if len(linked) > 0:
        with open(file_base + "linked.json", "w", encoding='utf-8') as fd:
            json.dump(linked, fd, indent = 4)

    print("Finished")

In [None]:
# Run the scrape with the default settings spelled out: at most 900 urls,
# csv output, starting from a fresh csv file.
base_loading(max_num=900, to_csv=True, create_new=True)

In [None]:
# Read the scraped rows back. base_loading() writes a header line as the
# first row of blog_text.csv; header=0 consumes that line so it is replaced
# by `names` instead of showing up as the first data row.
# The multi-character separator requires the python parsing engine.
df = pd.read_csv(
    file_base + "blog_text.csv",
    sep=sep,
    engine='python',
    encoding='utf-8',
    header=0,
    names=["url", "title", "timedate", "category", "text"],
)
df

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes)
# as a quick check that every row parsed into the expected columns.
df.info()