# Jupyter notebook for testing and breaking things

In [1]:
import os
import json
import natsort
from collections import OrderedDict

In [2]:
# hardcoding in path for testing
def get_people(filepath='/home/osboxes/Code/forks/rse-skills-graph/people.json'):
    """Load the people JSON file and return its parsed contents.

    Args:
        filepath: Path of the JSON file to read. Defaults to the path that
            was previously hard-coded for testing, so existing calls with
            no argument behave exactly as before.

    Returns:
        Whatever ``get_file_contents`` returns for ``filepath`` (the decoded
        JSON document).
    """
    return get_file_contents(filepath)

# thanks to Colin Morris for adding this code originally
def get_skills_list():
    """Count how many times each item occurs across all people.

    Walks every section of every top-level entry returned by ``get_people()``
    and tallies each item found in those sections.

    Returns:
        An ``OrderedDict`` mapping item -> occurrence count, with the
        ``(item, count)`` pairs ordered by ``natsort.natsorted``.
    """
    counts = {}
    everyone = get_people()

    for entry in everyone.values():
        for section_name in entry:
            for skill in entry[section_name]:
                # First sighting starts at 1; later sightings increment.
                counts[skill] = counts.get(skill, 0) + 1

    ordered_pairs = natsort.natsorted(counts.items())
    return OrderedDict(ordered_pairs)


def get_file_contents(filename):
    """Read ``filename`` and return its contents decoded as JSON.

    Args:
        filename: Path of the JSON file to load.

    Returns:
        The decoded JSON document.

    Raises:
        SystemExit: with exit code 1 if the file cannot be opened or read
            (an error message is printed to stderr first).
        ValueError: if the file content is not valid JSON; decoding errors
            are not caught here.
    """
    # Local import: at module level `sys` is only imported in a later cell,
    # so the error path below would otherwise fail with a NameError.
    import sys

    try:
        # Opened in binary mode; json.load accepts the bytes read from it.
        with open(filename, 'rb') as fp:
            return json.load(fp)
    except IOError:
        print('Could not open JSON file:' + filename, file=sys.stderr)
        sys.exit(1)

In [3]:
# Tally every item found in the people JSON file; the resulting OrderedDict
# is displayed as this cell's output.
get_skills_list()

OrderedDict([('ARC', 3),
             ('CIS', 1),
             ('Cloud', 2),
             ('Early 90s britpop', 1),
             ('HPC', 1),
             ('Linux', 4),
             ('Machine learning', 2),
             ('Natural language processing', 1),
             ('Perl', 1),
             ('Python', 3),
             ('R', 1),
             ('Software carpentry', 1),
             ('The 1970s', 1),
             ('Trains', 1),
             ('Web dev', 1),
             ('Web stuff', 1),
             ('git', 5),
             ('subversion', 3),
             ('sysadmin', 1)])

In [7]:
import sys

# Set __file__ by hand so that os.path.dirname(__file__) in the following cell
# can be evaluated; with this value dirname() is '', so the derived path is
# resolved relative to the current working directory.
__file__ = 'rse-skills-graph'

In [8]:
# Make sure /usr/local/bin is on PATH. It is only appended when missing, so
# re-running this cell no longer grows PATH with duplicate entries.
extra_bin_dir = '/usr/local/bin'
current_path = os.environ.get('PATH', '')
if extra_bin_dir not in current_path.split(os.pathsep):
    os.environ['PATH'] = (current_path + os.pathsep + extra_bin_dir) if current_path else extra_bin_dir

# Absolute path of the static/images directory next to __file__, always with a
# trailing slash (abspath() strips it, so it is added back explicitly).
os.environ['GV_FILE_PATH'] = os.path.abspath(os.path.join(os.path.dirname(__file__), 'static/images/')) + '/'

print('PATH: ' + os.environ['PATH'], file=sys.stderr)
print('GV_FILE_PATH: ' + os.environ['GV_FILE_PATH'], file=sys.stderr)

PATH: /home/osboxes/anaconda3/envs/rse-graph/bin:/usr/local/go/bin:/home/osboxes/anaconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/osboxes/go/bin:/usr/local/bin:/usr/local/bin
GV_FILE_PATH: /home/osboxes/Code/forks/rse-skills-graph/scrapbook/static/images/


In [9]:
import urllib

In [10]:
def get_titles(topic):
    """Return the titles of Wikipedia pages found by a text search for ``topic``.

    Sends a ``list=search`` / ``srwhat=text`` query (limited to 40 hits) to the
    English Wikipedia API and collects the ``title`` of every hit.

    Example of the equivalent full request:
    https://en.wikipedia.org/w/api.php?action=query&list=search&srwhat=text&srsearch=Linux&format=json&srlimit=40

    Args:
        topic: Search phrase (``str``).

    Returns:
        A list of title strings. If the decoded response does not contain
        the expected ``query`` -> ``search`` entries, an error message is
        printed and an empty list is returned.

    Raises:
        urllib.error.URLError: network/HTTP failures are not caught here.
        ValueError: if the response body is not valid JSON.
    """
    # Import the submodules explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` and `urllib.request` are loaded.
    import urllib.parse
    import urllib.request

    url = 'https://en.wikipedia.org/w/api.php'
    values = {
        'action': 'query',
        'list': 'search',
        'srwhat': 'text',
        # urlencode() percent-encodes str values as UTF-8 itself, so no
        # manual .encode() is needed.
        'srsearch': topic,
        'format': 'json',
        'srlimit': '40'
    }

    data = urllib.parse.urlencode(values)
    print(data)  # debug: the encoded query string
    data = data.encode('utf-8')  # Request() needs bytes for the body in Python 3
    print(data)  # debug: the bytes actually sent

    # Build the Wikipedia request from the encoded values, then send it and
    # read the whole body; `with` makes sure the response is closed.
    request = urllib.request.Request(url, data)
    with urllib.request.urlopen(request) as response:
        json_response = response.read()

    json_result = json.loads(json_response)

    # Extract the related titles; a response without the expected structure
    # (e.g. an API error document) yields an empty list instead of a crash.
    try:
        results = [result['title'] for result in json_result['query']['search']]
    except (KeyError, TypeError):
        results = []
        # The original concatenated str + bytes here, which itself raised a
        # TypeError in Python 3 and masked the real problem.
        print("ERROR: no title found for topic: " + str(topic))

    return results

In [14]:
# Try the lookup with the topic 'git'; this sends a live request to the
# Wikipedia API and displays the returned list of titles.
get_titles('git')

action=query&list=search&srwhat=text&srsearch=git&format=json&srlimit=40
b'action=query&list=search&srwhat=text&srsearch=git&format=json&srlimit=40'


['Git',
 'GitHub',
 'Git!',
 'Git Up, Git Out',
 'Git (disambiguation)',
 'Git (slang)',
 "I'm Gonna Git You Sucka",
 'The Git Up',
 'The Gits',
 'Comparison of Git GUIs',
 'Larry the Cable Guy',
 'GitLab',
 'Bitbucket',
 'Virtual File System for Git',
 'Blanco Brown',
 'Git-annex',
 'TortoiseGit',
 'Jet Li',
 'Cogito (software)',
 'Markdown',
 'List of Linux kernel names',
 'Comparison of operating system kernels',
 'Gitter',
 'Plastic SCM',
 'Facebook',
 'Commit (version control)',
 'Censorship of GitHub',
 'Marcus Brigstocke',
 'Git Fresh',
 'Git Gay',
 'Git Along, Little Dogies',
 'Tom Preston-Werner',
 'Atom (text editor)',
 'Comparison of open-source wireless drivers',
 'Spider-Man Unlimited',
 'Git It',
 'Electron (software framework)',
 'Distributed version control',
 'Wiki.js',
 'David Blaine']

## Graph vis stuff

In [19]:
import pygraphviz as pgv
from flask import Flask, render_template, request, redirect, url_for

def get_graph_string(graph):
    """Render ``graph`` as SVG and return the markup as a ``str``.

    The SVG is post-processed before being returned:
      * the root ``width`` is set to ``100%`` and ``height`` is removed;
      * every ``<image>`` ``xlink:href`` is rewritten to the URL that
        ``url_for('static', ...)`` produces for ``images/<href>``.

    Args:
        graph: Object with a ``draw(file_like, format='svg')`` method
            (presumably a ``pygraphviz.AGraph`` - confirm with callers).

    Returns:
        The pretty-printed SVG document decoded as UTF-8.
    """
    # Local import: `io` is not imported in any of the visible cells.
    import io

    # NOTE(review): `etree` is not imported in the visible cells either. The
    # `pretty_print` argument used below suggests it is `lxml.etree` - confirm
    # and add the import to the notebook's import cell.
    with io.BytesIO() as output:
        graph.draw(output, format='svg')
        svg = output.getvalue()

    svg_parser = etree.XMLParser()
    svg_obj = etree.fromstring(svg, svg_parser)

    svg_obj.attrib['width'] = '100%'
    # pop() instead of del: do not fail if the root element has no height.
    svg_obj.attrib.pop('height', None)

    images = svg_obj.findall('.//{http://www.w3.org/2000/svg}image')

    # Convert all image href links to match server
    # For example, anonymous.png becomes static/images/anonymous.png
    # Have to do this because graphviz doesn't allow you to specify the
    # URL to an image, only the file path.
    href_attr = '{http://www.w3.org/1999/xlink}href'
    for image in images:
        image_filename = image.attrib[href_attr]
        image.attrib[href_attr] = url_for('static', filename='images/' + image_filename)

    return etree.tostring(svg_obj, pretty_print=True).decode('utf-8')

def get_image_files(image_dir='/home/osboxes/Code/forks/rse-skills-graph/static/images/'):
    """Return the names of all ``.png`` files found under ``image_dir``.

    The directory is walked recursively. Only the bare file names are
    returned (no directory part), including for files in sub-folders.

    Args:
        image_dir: Directory to search. Defaults to the path that was
            previously hard-coded, so calls without an argument are unchanged.

    Returns:
        A list of file names ending in ``.png``; empty if the directory does
        not exist or contains no such files.
    """
    image_files = []

    for _root, _sub_folders, files in os.walk(image_dir):
        image_files.extend(filename for filename in files if filename.endswith('.png'))

    return image_files

def _find_image_file(person, image_files, default='anonymous.png'):
    """Pick the image whose file name contains every word of ``person``'s name."""
    name_parts = person.lower().split()
    image_file = default
    for filename in image_files:
        # Every lower-cased name part must occur in the file name. If several
        # files qualify, the last one wins (same as the original loop).
        if name_parts and all(part in filename for part in name_parts):
            image_file = filename
    return image_file


def _add_topic_nodes(graph, person, items, topics, match_color, other_color, link_topics):
    """Add one filled ellipse node per item and connect it to ``person``."""
    # Local import: `re` is not imported in any of the visible cells.
    import re

    for item in items:
        color = match_color if item in topics else other_color
        # Drop any parenthesised part from the displayed label.
        label = re.sub(r'\(.*\)', '', item)

        attrs = dict(label=label, style='filled', fontname='Helvetica',
                     shape='ellipse', color=color, fontcolor='white')
        if link_topics:
            attrs['URL'] = url_for('show_topic', name=item)

        graph.add_node(item, **attrs)
        graph.add_edge(person, item, color='#00000050')


def build_graph(name, results, topics):
    """Build and lay out a graph linking people to their interests/technologies.

    Each person in ``results`` becomes a circular node (with a matching image
    file if one is found, otherwise ``anonymous.png``) that links to
    ``url_for('show_person', ...)``. Each of the person's interests and
    technologies becomes an ellipse node connected to the person; entries that
    appear in ``topics`` get a different colour from the others. Interest
    nodes link to ``url_for('show_topic', ...)``; technology nodes carry no URL.

    Because ``url_for`` is called, this must run where Flask can build URLs
    (an application/request context with the referenced endpoints).

    Args:
        name: Name given to the ``pgv.AGraph``.
        results: Iterable of person names; each must be a key of the data
            returned by ``get_people()``.
        topics: Container of topic names to highlight.

    Returns:
        The ``pgv.AGraph`` after ``layout(prog='neato')`` has been applied.

    Raises:
        KeyError: if a name in ``results`` is not present in the people data.
    """
    graph = pgv.AGraph(overlap='false', name=name)
    people = get_people()
    # Walk the image directory once instead of once per person.
    image_files = get_image_files()

    for person in results:
        image_file = _find_image_file(person, image_files)

        # Put each word of the name on its own line; underscores stand in for
        # spaces inside one name part (e.g. Anja Le_Blanc) and are shown as spaces.
        label = person.replace(' ', '\n').replace('_', ' ')
        graph.add_node(person, label=label, fontname='Helvetica', fixedsize=True,
                       imagescale=True, width='1.5', height='1.5', fontcolor='white',
                       shape='circle', style='filled', color='#303030',
                       URL=url_for('show_person', name=person), image=image_file)

        details = people[person]
        # Both sections are optional: a person without one simply gets no
        # nodes of that kind.
        _add_topic_nodes(graph, person, details.get('interests', []), topics,
                         '#A02020FF', '#105060EE', link_topics=True)
        _add_topic_nodes(graph, person, details.get('technologies', []), topics,
                         '#B01050FF', '#701050EE', link_topics=False)

    graph.layout(prog='neato')

    return graph

In [20]:
# NOTE: this call fails with the RuntimeError recorded below because
# build_graph() calls url_for(), which needs an active Flask application context.
build_graph(name='Test', results=['Bob Up','Jane Down'], topics=['trains','planes'])

RuntimeError: Attempted to generate a URL without the application context being pushed. This has to be executed when application context is available.

In [21]:
# List the .png file names found under the images directory.
get_image_files()

['anonymous.png']