In [1]:
#ALWAYS RUN FIRST!

#Import libraries and packages. Don't worry about the warning if running it on Windows; so far it has not caused an issue. (yn)

import re
from pylab import *
import csv
import psycopg2
import psycopg2.extras
import spacy
spacy.load('en')
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora
from datetime import datetime
import pickle
import gensim
import os
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

#Set the names of the files in which you want to save data, these will all be in the data file.

#Base name (no extension) of the csv that stores each person's organisation affiliations from the CRM.
affiliation_data_name = 'daisy_wheel_affiliation_data'
#Base name for department data. NOTE(review): not referenced by any of the cells shown here - confirm it is still needed.
department_data_name = 'daisy_wheel_department_data'
#Base name (no extension) of the csv that stores the fellow/academic meeting records from the CRM.
meeting_data_name = 'daisy_wheel_meetings'



In [2]:
#Extract the data from the CRM. Don't run if you have an up to date copy of the CRM.

try:

    #Downloads two tables from the CRM and saves each one to a csv file in the data folder.
    #Affiliation rows: row[0] - person id, row[1] - first name, row[2] - last name, row[3] - organisation name.
    #Meeting rows: row[0] - date, row[1] to row[3] - fellow id, first name and last name, row[4] to row[6] - the
    #same three fields for the academic.
    #NOTE(review): SERVER_INFO is not defined in the cells shown here; it has to exist before this cell is run.

    print("Trying to access CRM CSaP database ...")

    #closing() makes sure the cursor and the connection are released even if a query or a file write fails.

    with closing(psycopg2.connect(SERVER_INFO)) as conn, closing(conn.cursor()) as cur:

        cur.execute("""SELECT
    person.id,
    person.first_name,
    person.last_name,
    organization.name
    FROM people_person as person
    JOIN organizations_personorganizationrole ON organizations_personorganizationrole.person_id = person.id
    JOIN organizations_organization as organization ON organization.id = organizations_personorganizationrole.organization_id
    ;
    """)
        rows = cur.fetchall()

        #Saves data to the file called above. Text is written as the repr of its utf-8 bytes (b'...'); the loading
        #cell strips that b'' wrapper again, so the two have to stay in step.
        #The path is spelt with escaped backslashes: '\d' on its own is an invalid escape sequence.

        with open(os.getcwd() + '\\data\\' + affiliation_data_name +'.csv','w+',newline ='') as myfile:
            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
            for row in rows:
                wr.writerow([str(row[0]), row[1].encode('utf-8'), row[2].encode('utf-8'), row[3].encode('utf-8')])

        cur.execute("""SELECT
    people_policyfellowshipschedule.date,
    fellow.id,
    fellow.first_name,
    fellow.last_name,
    academic.id,
    academic.first_name,
    academic.last_name
    FROM people_policyfellowshipschedule_people
    JOIN people_person as academic ON people_policyfellowshipschedule_people.person_id = academic.id
    JOIN people_policyfellowshipschedule ON people_policyfellowshipschedule_people.policyfellowshipschedule_id = people_policyfellowshipschedule.id
    JOIN people_policyfellowship ON people_policyfellowshipschedule.policy_fellowship_id = people_policyfellowship.id
    JOIN people_person as fellow ON people_policyfellowship.policy_fellow_id = fellow.id
    ;
    """)

        rows = cur.fetchall()

        #Saves data to the file called above, dates as YYYY-MM-DD so the loading cell can parse them back.

        with open(os.getcwd() + '\\data\\' + meeting_data_name +'.csv','w+',newline ='') as myfile:
            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
            for row in rows:
                wr.writerow([row[0].strftime("%Y-%m-%d"), str(row[1]), row[2].encode('utf-8'), row[3].encode('utf-8'), str(row[4]), row[5].encode('utf-8'), row[6].encode('utf-8')])

    print("... data downloaded and saved to disk.")

except Exception as e:

    print(e)

    #If the download fails for any reason the later cells can still run from the previously saved files.

    print("... can't access server, is the tunnel set up? Can continue on previously saved data.")


Trying to access CRM CSaP database ...
... data downloaded and saved to disk.


In [3]:
#Functions for cleaning the text and getting it ready to be processed.

#Gets rid of HTML/utf-8 tags and end of line markers.
#Input: String of text from CRM or internet.
#Output: Cleaned up string of text without HTML tags or end of line markers.

def clean_text(text):
    """Strip HTML tags, escaped byte codes and escaped control characters from CRM text.

    The text handled here has been through a bytes repr, so special characters show up as
    literal escape text (a backslash followed by x and two characters, or a backslash
    followed by r, n or t) rather than as real control characters.

    Input: String of text from the CRM.
    Output: The same text with tags, escape text, forward slashes and backslashes removed.
    """

    #Removes HTML tags.

    temp_text = re.sub(r'<.*?>', '', text)

    #Removes rogue utf-8 codes: a backslash, an x and two word characters, then a second pass for any code
    #with a single character left behind. Raw strings keep '\w' a regex escape instead of an invalid
    #Python string escape.

    temp_text = re.sub(r'\\x\w\w', '', temp_text)
    temp_text = re.sub(r'\\x\w', '', temp_text)

    #Removes end of line indicators and other junk. The lone backslash goes last so the escaped
    #letters are removed whole first.

    for tag in ('\\r', '\\n', '/', '\\t', '\\'):
        temp_text = temp_text.replace(tag, '')

    return temp_text

#Tokenizes text, separating it into a list of words and punctuation.
#Input: A string of text.
#Output: A list of the tokens in order, all in lower case; URLs become 'URL' and tokens starting with '@' become 'SCREEN_NAME'.

parser = English()

def tokenize(text):
    """Split text into a list of normalised tokens.

    Whitespace-only tokens are dropped, anything that looks like a URL is replaced by 'URL',
    anything starting with '@' by 'SCREEN_NAME', and every other token is lower-cased.
    """

    def normalise(token):
        #The URL check comes first, so a URL-like token is never reported as a screen name.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [normalise(token) for token in parser(text) if not token.orth_.isspace()]

#Lemmatiser, this finds the root form of a word (e.g. depluralises it).
#Input: A token, i.e. a single word or piece of punctuation.
#Output: The lemma, i.e. the base form of the word.

def get_lemma(word):
    """Return the WordNet lemma of a single token."""
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(word)
    return lemma
    
#Prepares text for the analysis: tokenizes the text, keeps only the nouns, drops words of 4 characters or fewer, filters out
#stop words and then lemmatises what is left.
#Input: A string of text you want to analyse.
#Output: A list of lemmas of the meaningful words.

def prepare_text_for_lda(text):
    """Turn free text into a list of lemmas for topic modelling.

    Only tokens tagged as nouns (tag starting with 'N'), longer than 4 characters and not
    English stop words are kept; each survivor is lemmatised.
    """
    tagged_tokens = nltk.pos_tag(tokenize(text))
    stop_words = set(nltk.corpus.stopwords.words('english'))

    lemmas = []
    for word, tag in tagged_tokens:
        if tag[0] != 'N':
            continue
        if len(word) <= 4:
            continue
        if word in stop_words:
            continue
        lemmas.append(get_lemma(word))
    return lemmas

#Specialised version of the above function for organisations.
#Input: A string of text you want to analysis.
#Output: A list of Lemmas of the meaningful words.

def perpare_organisation(text):
    """Turn an organisation name into a list of lemmas.

    Unlike prepare_text_for_lda there is no noun filter and the length cut-off is lower:
    tokens longer than 2 characters that are not English stop words are kept and lemmatised.
    """
    long_enough = [word for word in tokenize(text) if len(word) > 2]
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [get_lemma(word) for word in long_enough if word not in stop_words]

In [4]:
#This uploads the person data into the program from the file, cleaning the data whilst it does it.

print("Trying to load up data ...")

#affiliations rows: [person id, first name, last name, organisation name].
#meetings rows: [date, fellow id, fellow first name, fellow last name, academic id, academic first name, academic last name].

affiliations = []
meetings = []

try:

    #Text fields were saved as the repr of utf-8 bytes (b'...'), so [2:-1] strips the b'' wrapper before cleaning.
    #The path is spelt with escaped backslashes: '\d' on its own is an invalid escape sequence.

    with open(os.getcwd() + '\\data\\' + affiliation_data_name +'.csv', 'r') as csvfile:
        dump = list(csv.reader(csvfile))
        for row in dump:
            affiliations.append([int(row[0]), clean_text(row[1][2:-1]), clean_text(row[2][2:-1]), clean_text(row[3][2:-1])])

    with open(os.getcwd() + '\\data\\' + meeting_data_name +'.csv', 'r') as csvfile:
        dump = list(csv.reader(csvfile))
        for row in dump:
            meetings.append([datetime.strptime(row[0],'%Y-%m-%d'),int(row[1]),clean_text(row[2][2:-1]),clean_text(row[3][2:-1]),int(row[4]),clean_text(row[5][2:-1]),clean_text(row[6][2:-1])])

    print("... data successfully uploaded.")

except Exception as e:

    #Show what actually went wrong (missing file, malformed row, ...) before the generic hint. Catching
    #Exception rather than everything leaves Ctrl-C and kernel interrupts alone.

    print(e)
    print(".. no back up data, please connect to server.")

Trying to load up data ...
... data successfully uploaded.


In [5]:
##
## Data scraping code.
##

#Attempts to get the content at 'url' by making an HTTP GET request. If the content-type of response is some kind of HTML/XML, 
#returns the text content, otherwise, return None.
#Input: The link to the website.
#Output: Either the HTML or None.

def simple_get(url, timeout=30):
    """Fetch url with an HTTP GET and return its content if it looks like HTML.

    Input: The link to the website and, optionally, how many seconds to wait for the server.
    Output: The response content when is_good_response accepts the response, otherwise None.
            None is also returned, after logging, when the request itself fails.
    """
    try:
        #Without a timeout a server that never answers would hang the notebook; a timeout surfaces as a
        #RequestException and is handled below like any other failed request.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
        
#Returns True if the response seems to be HTML, False otherwise.
#Input: The response from an HTTP GET request.
#Output: Boolean, True if it is HTML.
        
def is_good_response(resp):
    """Return True when resp is a 200 response whose Content-Type mentions html.

    Input: The response from an HTTP GET request (needs .headers and .status_code).
    Output: Boolean; False when the status is not 200, the Content-Type header is missing,
            or the content type does not contain 'html' (case-insensitive).
    """
    #.get() returns None for a response with no Content-Type header instead of raising KeyError.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False

    return resp.status_code == 200 and 'html' in content_type.lower()

#Error Handler.
#Input: Error message.
#Output: Empty, to be changed later for when we put together a program. (E.g. make log file)

def log_error(e):
    """Report an error message by printing it; the single place to change if a log file is wanted later."""
    message = str(e)
    print(message)

#Gets rid of HTML/utf-8 tags and end of line markers.
#Input: String of text from CRM or internet.
#Output: Cleaned up string of text without HTML tags or end of line markers.
    
def clean_text_scrape(text):
    """Strip HTML tags, escaped byte codes, control characters and colons from scraped text.

    Unlike clean_text this removes real carriage returns, new lines and tabs (not their
    escaped spellings) and removes colons rather than forward slashes.

    Input: String of text from the internet.
    Output: The cleaned up string.
    """

    #Removes HTML tags.

    temp_text = re.sub(r'<.*?>', '', text)

    #Removes rogue utf-8 codes: a backslash, an x and two word characters, then a second pass for any code
    #with a single character left behind. Raw strings keep '\w' a regex escape instead of an invalid
    #Python string escape.

    temp_text = re.sub(r'\\x\w\w', '', temp_text)
    temp_text = re.sub(r'\\x\w', '', temp_text)

    #Removes end of line indicators and other junk.

    for tag in ('\r', '\n', ':', '\t', '\\'):
        temp_text = temp_text.replace(tag, '')

    return temp_text
    
#Scrapes government site for the list of government organisations.
#Input: Empty
#Output: A dictionary containing the names of the current government departments and the tags with that the primary tag being
#Whitehall and secondary tag being the department name.

def get_whitehall_department_dictionary():
    """Scrape gov.uk for the list of government organisations.

    Input: Empty.
    Output: A dictionary mapping each organisation name to ['whitehall', name]. An empty
            dictionary is returned (and the problem logged) when the page cannot be downloaded.
    """
    raw_html = simple_get('https://www.gov.uk/government/organisations')

    #simple_get returns None when the request fails or the page is not HTML; without this check the parser
    #would fail on None with an error that says nothing about the network.
    if raw_html is None:
        log_error('Could not download the list of government organisations, so no Whitehall tags can be guessed.')
        return {}

    html = BeautifulSoup(raw_html, 'html.parser')

    #Every span on the page is a candidate; the set removes repeats.
    department_set = {entry.text.rstrip() for entry in html.select('span')}

    department_dictionary = {}

    for department in department_set:
        #Skip very short spans and spans that mention 'page' or 'website'.
        #NOTE(review): presumably these are navigation text rather than organisation names - confirm against the live page.
        if len(department) > 5 and 'page' not in department and 'website' not in department:
            department_name = str(department).replace('\r\n',' ')
            department_dictionary[department_name] = ['whitehall', department_name]

    return department_dictionary
    
#Boolean function to see if name appears as a full entry in what is pulled from the Cambridge Departments site.
#Input: Two strings, name and table_entry.
#Output: True or False depending on whether name appears as a full line within table_entry.
    
def check_table(name, table_entry):
    """Return True when name is the whole of table_entry or one complete line inside it.

    A match is: the entry equals name; name sits between a new line (or tab) and a new line;
    or, for entries longer than name plus one character, the entry starts with name followed
    by a new line or ends with a new line followed by name.
    """
    if name == table_entry:
        return True

    name_then_newline = name + '\n'
    if any(lead + name_then_newline in table_entry for lead in ('\n', '\t')):
        return True

    #The start and end checks only apply to entries longer than the name plus one separator.
    if len(table_entry) <= len(name_then_newline):
        return False
    return table_entry.startswith(name_then_newline) or table_entry.endswith('\n' + name)
    
#Takes data associated with one specific department and classifies which schools belong to which faculty.
#Input: List table of data from the site, the department name and the dictionary being filled.
#Output: Empty, but adds entries to classification_dictionary that link their names to their guessed affiliated tags.
    
def classify(table, department, classification_dictionary):
    """Fill classification_dictionary with [primary, secondary] tag guesses for one department.

    table[0] holds the new-line separated names listed under the department; the remaining
    entries are the nested lists used to decide which names sit under which parent name.
    A name found in the current nested list inherits the tag of the last name that was not
    in a list; every other name gets [department, name].

    Input: List table of data from the site (consumed in place), the department name and the
           dictionary being filled.
    Output: Empty, the results are written into classification_dictionary.
    """

    names = []
    for candidate in table[0].split('\n'):
        if candidate != '' and candidate != department:
            names.append(clean_text_scrape(candidate))

    #Drop the names entry and, when there is one, the entry straight after it.
    del table[0]
    if table:
        del table[0]

    intable = False
    current_tag = []

    classification_dictionary[department] = [department,'']

    for name in names:
        if not table:
            #No nested lists left, so the name is tagged with itself.
            classification_dictionary[name] = [department,name]
            continue

        #Throw away following entries that are already contained, as a line, in the current one.
        while len(table) > 1 and any((lead + table[1]) in table[0] for lead in ('\n', '\t')):
            del table[1]

        if check_table(name, table[0]):
            #The name is inside the current nested list: keep the tag of its parent.
            intable = True
        else:
            #The name starts a new tag; if we were inside a nested list, that list is finished.
            current_tag = [department,name]
            if intable:
                intable = False
                del table[0]
        classification_dictionary[name] = current_tag
    
#Pulls the Academic departments from the Cambridge site. Creates a dictionary linking names of schools to the guessed tags.
#Input: Empty.
#Output: Dictionary connecting names of schools to guessed tags.

#BUG: Due to site formatting, 'Institutions independent of any School' appear under the 'Technology' tag.
    
def get_academic_department_dictionary():
    """Scrape the Cambridge department A-Z page and guess tags for every name on it.

    The h2 headings are taken as department names and the ul lists as the data under them.
    Lists are grouped under the heading they start with and each group is handed to classify.

    Input: Empty.
    Output: Dictionary connecting names to [primary, secondary] tag guesses. It is empty (and
            the problem is logged) when the page cannot be downloaded or has no usable headings.
    """

    classification_dictionary = {}

    raw_html = simple_get('https://www.cam.ac.uk/colleges-and-departments/department-a-z')

    #simple_get returns None when the request fails or the page is not HTML.
    if raw_html is None:
        log_error('Could not download the Cambridge department list, so no academic tags can be guessed.')
        return classification_dictionary

    html = BeautifulSoup(raw_html, 'html.parser')

    departments = []

    for entry in html.select('h2'):
        department = entry.text.rstrip()
        if 'Search' not in department and len(department) > 4:
            departments.append(department)

    #Without headings there is nothing to group the lists under.
    if not departments:
        log_error('No department headings found on the Cambridge department list.')
        return classification_dictionary

    organisations = [entry.text.rstrip().replace('\xa0',' ') for entry in html.select('ul')]

    tables = []

    #Index of the next heading we expect a list to start with; 0 means no heading has been matched yet.
    next_department = 0

    for organisation in organisations:
        if next_department == len(departments):
            #Every heading has been matched: keep collecting for the last one until the list that mentions Facebook.
            #NOTE(review): presumably that is the page footer - confirm against the live page.
            if 'Facebook' in organisation:
                break
            tables.append(organisation)
        elif organisation.startswith('\n' + departments[next_department]):
            #A list starting with the next heading closes the previous department's group and opens a new one.
            if next_department > 0:
                classify(tables, departments[next_department - 1], classification_dictionary)
            tables = [organisation]
            next_department += 1
        elif next_department > 0:
            tables.append(organisation)

    #tables is only empty when no list matched the first heading, in which case there is nothing to classify.
    if tables:
        classify(tables, departments[next_department - 1], classification_dictionary)
    else:
        log_error('No department lists matched the headings on the Cambridge department list.')

    return classification_dictionary

In [6]:
##
## After uploading the data run this for the main daisy wheel tool.
##

#Filters the meetings by date.
#Input: List of all meetings, start_date and end_date.
#Output: List of meetings occurring on or after start_date and on or before end_date.

def filter_meetings(meetings, start_date, end_date):
    """Return the meetings whose date (element 0) lies between start_date and end_date inclusive."""
    return [
        meeting
        for meeting in meetings
        if meeting[0] >= start_date and meeting[0] <= end_date
    ]

#Given a list of people and affiliations, edits the list of people in place, attaching each person's affiliations.
#Input: A list of people [id,first_name,last_name] and a list of possible affiliations.
#Output: Empty but changes people to affiliated_people [id,first_name,last_name, affiliations].

def affiliate_people(people, affiliations):
    """Append to each person the sorted list of organisation names they are affiliated with.

    Input: A list of people [id,first_name,last_name] and a list of affiliation rows whose
           element 0 is a person id and element 3 an organisation name.
    Output: Empty; every person list gains one extra element, a sorted list of their distinct
            organisation names (empty when they have none). Calling this twice on the same
            people appends a second list.
    """
    #Index the affiliations by person id once, so each person is a dictionary lookup instead of a scan of
    #every affiliation row.
    names_by_person = {}
    for affiliation in affiliations:
        names_by_person.setdefault(affiliation[0], set()).add(affiliation[3])

    for person in people:
        #sorted() builds a fresh list each time, so no two people share the same list object.
        person.append(sorted(names_by_person.get(person[0], ())))

#Given a list of affiliated people returns a list of affiliations attached to those people.
#Input: List of people with affiliations [id,first_name,last_name, affiliations].
#Output: List of affiliations that those people belong to without repeats.

def get_affiliation_list(affiliated_people):
    """Return the sorted list of distinct affiliations held by the given people.

    Input: List of people with affiliations [id,first_name,last_name, affiliations].
    Output: Sorted list of affiliations without repeats.
    """
    #A set removes repeats in one pass instead of searching the growing result list for every affiliation.
    distinct = set()
    for person in affiliated_people:
        distinct.update(person[3])
    return sorted(distinct)

#Returns a list of people attached to the meetings without repeats, either academics or fellows.
#Input: The list of meetings, num = 0 for fellows or num = 1 for academics.
#Output: List of people, person[0] - id, person[1] - first name and person[2] - last name.

def get_people(meetings, num):
    """Collect the distinct people of one kind from the meetings, in order of first appearance.

    Each meeting row holds a date followed by two groups of three fields (id, first name,
    last name); num picks the group: 0 for the fellow, 1 for the academic.
    """
    id_column = 3 * num + 1
    distinct_people = []
    for meeting in meetings:
        candidate = [meeting[id_column + shift] for shift in range(3)]
        if candidate in distinct_people:
            continue
        distinct_people.append(candidate)
    return distinct_people

#Checks to see if the words appearing in the group name appear in the person's affiliation.
#Input: Two strings, affil and group.
#Output: Boolean based on whether the important words from the group name appear in the person's affiliation.

def is_in_group(affil,group):
    """Return True when every prepared word of group also appears among the prepared words of affil.

    Both strings go through perpare_organisation first. A group name that yields no words
    therefore matches every affiliation.
    """
    affiliation_words = perpare_organisation(affil)
    return all(word in affiliation_words for word in perpare_organisation(group))

#Given the guess dictionary, it will try to work out if the affiliation is close to something in its key set.
#Input: A string containing the affiliation and the dictionary linking affiliations to tag guesses.
#Output: Either a list of tags or None.

def guess_tag(affiliation, guess_dictionary):
    """Guess the tags for an affiliation from a dictionary of known names.

    An exact match on the name wins. Otherwise every known name whose words all appear in the
    affiliation is a candidate, and candidates whose words are all contained in another
    remaining candidate (i.e. the less specific ones) are discarded.

    Input: A string containing the affiliation and the dictionary linking names to tag guesses.
    Output: The tag list of the exact match or of the first surviving candidate, or None when
            nothing matches.
    """
    if affiliation in guess_dictionary:
        return guess_dictionary[affiliation]

    possibilities = [group for group in guess_dictionary.keys() if is_in_group(affiliation, group)]

    if len(possibilities) == 0:
        return None

    #Walk the untouched candidate list and remove from a separate copy. Removing from the list being
    #iterated makes the loop skip the element after each removal, so some candidates were never checked.
    remaining = list(possibilities)
    for candidate in possibilities:
        if any(other != candidate and is_in_group(other, candidate) for other in remaining):
            remaining.remove(candidate)

    #remaining cannot be empty: a candidate is only removed while a different one is still present.
    return guess_dictionary[remaining[0]]
                

#Creates a csv file with the list of affiliations in it.
#Input: A string file_name, a list of affiliations, and a dictionary containing guesses of affiliations.
#Output: Empty but creates a csv file in input\file_name, also if the user has that file open prompts them to close it.

def create_affiliation_csv(file_name,affiliation_list,guess_dictionary):
    """Write the affiliations, with any guessed tags, to a csv in the input folder for the user to edit.

    Input: A string file_name, a list of affiliations, and a dictionary containing tag guesses.
    Output: Empty but creates input\\file_name.csv. If the file cannot be written (for example
            because the user has it open) the error is shown and the user is asked to close it
            before the write is tried again.
    """
    #Build every row before touching the file, so a problem with the guesses shows up as a normal error
    #instead of being mistaken for a locked file inside the retry loop.
    rows = [
        ['Affiliation', 'Primary Tag', 'Secondary Tag'],
        ['Centre for Science and Policy', 'University of Cambridge', 'Centre of Science and Policy', 'This is an example and will not be added to the actual affiliations list'],
    ]
    for affiliation in affiliation_list:
        tag_guess = guess_tag(affiliation,guess_dictionary)
        if tag_guess is None:
            rows.append([affiliation])
        else:
            #classify can store a guess with fewer than two tags, so write the tags that are there rather
            #than indexing past the end; missing cells are read back as 'other'.
            rows.append([affiliation] + list(tag_guess[:2]))

    #Escaped backslashes: '\i' on its own is an invalid escape sequence.
    path = os.getcwd() + '\\input\\' + file_name +'.csv'

    while True:
        try:
            with open(path,'w+',newline ='') as myfile:
                wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
                wr.writerows(rows)
            break
        except OSError as e:
            #Only file system problems are worth retrying; anything else is a bug and should surface.
            print(e)
            input('Please close ' + file_name + ' when you have done so press enter.')
            print('')
            
#Reads a csv file with the user input of the tags associated with affiliations.
#Input: A string file_name with the file name.
#Output: The sorting dictionary that associates affiliations with the tags and a dictionary of tags where the primary tag leads
#to a list of secondary tags.
        
def export_affiliation_csv(file_name):
    """Read the user-edited affiliation csv back into dictionaries.

    The first two rows (header and worked example) are ignored. Missing or blank tags become
    'other'; all tags are lower-cased. Rows with no content at all are skipped.

    Input: A string file_name with the file name (inside the input folder, without extension).
    Output: sorting_dictionary mapping affiliation -> [primary tag, secondary tag] and
            tag_dictionary mapping primary tag -> list of its secondary tags.
    """
    sorting_dictionary = {}
    tag_dictionary = {}

    #Escaped backslashes: '\i' on its own is an invalid escape sequence. newline='' is what the csv module
    #asks for so that quoted fields containing line breaks are read correctly.
    with open(os.getcwd() + '\\input\\' + file_name +'.csv', 'r', newline='') as csvfile:
        dump = list(csv.reader(csvfile))

    def normalise(row, index):
        #A cell that is missing or blank counts as 'other'.
        if len(row) > index and row[index] != '':
            return row[index].lower()
        return 'other'

    for row in dump[2:]:
        #Blank lines (easily left behind by a spreadsheet) carry no affiliation and used to raise IndexError.
        if not any(row):
            continue
        primary_tag = normalise(row, 1)
        secondary_tag = normalise(row, 2)
        sorting_dictionary[row[0]] = [primary_tag, secondary_tag]
        secondary_tags = tag_dictionary.setdefault(primary_tag, [])
        if secondary_tag not in secondary_tags:
            secondary_tags.append(secondary_tag)

    return sorting_dictionary, tag_dictionary

#Finds the most numerous tag associated with each person's affiliations then writes a list of people with tags and affiliations.
#Input: List of affiliated people [id,first_name,last_name, affiliations], a string file_name and the sorting dictionary.
#Output: Empty but writes a csv file.

def _most_common_tag(person_affiliations, sorting_dictionary):
    """Return the [primary, secondary] tag shared by the most affiliations; ties go to the earliest affiliation."""
    #Each tally is [tag, count], kept in order of first appearance so that max() breaks ties that way.
    tallies = []
    for affiliation in person_affiliations:
        tag = list(sorting_dictionary[affiliation][:2])
        for tally in tallies:
            if tally[0] == tag:
                tally[1] += 1
                break
        else:
            tallies.append([tag, 1])
    return max(tallies, key=lambda tally: tally[1])[0]

def create_people_csv(affiliated_people,file_name,sorting_dictionary):
    """Write one csv row per person with their most common tag and all their affiliations.

    Input: List of affiliated people [id,first_name,last_name, affiliations] (sorted in place
           by last name), a string file_name and the dictionary mapping each affiliation to
           its [primary tag, secondary tag].
    Output: Empty but writes input\\file_name.csv. People without affiliations are tagged
            'other', 'other'. If the file cannot be written the user is asked to close it
            before the write is tried again.
    """
    affiliated_people.sort(key=lambda person: person[2])

    people_output = []
    for person in affiliated_people:
        new_person = [person[0],person[1],person[2]]
        if len(person[3]) == 0:
            new_person.extend(['other', 'other'])
        else:
            #The tag has to be the one most affiliations agree on. The previous version compared a tag name
            #with a whole tag list, so nothing was ever counted and the first affiliation always won.
            new_person.extend(_most_common_tag(person[3], sorting_dictionary))
            new_person.extend(person[3])
        people_output.append(new_person)

    #Escaped backslashes: '\i' on its own is an invalid escape sequence.
    path = os.getcwd() + '\\input\\' + file_name +'.csv'

    while True:
        try:
            with open(path,'w+',newline ='') as myfile:
                wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
                wr.writerow(['Person_ID', 'First Name', 'Second Name', 'Primary Tag', 'Secondary Tag', 'Affiliations:'])
                wr.writerows(people_output)
            break
        except OSError:
            #Only file system problems (such as the file being open elsewhere) are worth retrying.
            input('Please close ' + file_name + ' when you have done so press enter.')
            print('')

#This function reads the User input from create_people_csv file and exports it to dictionaries for later use.
#Input: A string file_name.
#Output: Two dictionaries one which associates peoples id's as strings to their tags, associates primary tags to a list of
#secondary tags.
            
def export_people_csv(file_name):
    """Read the user-edited people csv back into dictionaries.

    The header row is ignored. Missing or blank tags become 'other'; all tags are lower-cased.
    Rows with no content at all are skipped.

    Input: A string file_name (inside the input folder, without extension).
    Output: sorting_dictionary mapping person id (as a string) -> [primary tag, secondary tag]
            and tag_dictionary mapping primary tag -> list of its secondary tags.
    """
    sorting_dictionary = {}
    tag_dictionary = {}

    #Escaped backslashes: '\i' on its own is an invalid escape sequence. newline='' is what the csv module
    #asks for so that quoted fields containing line breaks are read correctly.
    with open(os.getcwd() + '\\input\\' + file_name +'.csv', 'r', newline='') as csvfile:
        dump = list(csv.reader(csvfile))

    def normalise(row, index):
        #A cell that is missing or blank counts as 'other'.
        if len(row) > index and row[index] != '':
            return row[index].lower()
        return 'other'

    for row in dump[1:]:
        #Blank lines and rows cut short by hand editing used to raise IndexError.
        if not any(row):
            continue
        primary_tag = normalise(row, 3)
        secondary_tag = normalise(row, 4)
        sorting_dictionary[row[0]] = [primary_tag, secondary_tag]
        secondary_tags = tag_dictionary.setdefault(primary_tag, [])
        if secondary_tag not in secondary_tags:
            secondary_tags.append(secondary_tag)

    return sorting_dictionary, tag_dictionary

#Tabulates the number of meetings between departments in Cambridge and the sectors in Policy and outputs the associated daisy
#wheel data in a csv of the format specified.
#Input: Academic dictionaries, fellow dictionaries, list of meetings and the string file_name to save it to.
#Output: Empty but saves a CSV file with the above data.

def output_daisy_wheel_data(academic_sorting_dictionary, academic_tag_dictionary, fellow_sorting_dictionary, fellow_tag_dictionary, meetings,file_name):
    """Tabulate meetings between academic tags (rows) and fellow tags (columns) and save them as a csv.

    The file has four header rows (fellow primary tag, fellow secondary tag, number of fellows,
    number of meetings) followed by one row per academic tag pair: primary tag, secondary tag,
    number of academics, number of meetings, then the meeting count against each fellow tag
    pair. Tag pairs that nobody in the meetings belongs to are written with zero counts.

    Input: Academic dictionaries, fellow dictionaries (sorting: person id as a string ->
           [primary, secondary]; tag: primary -> list of secondaries), the list of meetings and
           the string file_name to save it to.
    Output: Empty but saves output\\file_name.csv. If the file cannot be written the user is
            asked to close it before the write is tried again.
    """

    def tag_pair(sorting_dictionary, person_id):
        #(primary, secondary) for one person; a missing person raises KeyError.
        tags = sorting_dictionary[str(person_id)]
        return (tags[0], tags[1])

    def add_one(counts, key):
        counts[key] = counts.get(key, 0) + 1

    #Head counts: every distinct fellow / academic is counted once under their tag pair.
    fellow_people = {}
    for fellow in get_people(meetings,0):
        add_one(fellow_people, tag_pair(fellow_sorting_dictionary, fellow[0]))

    academic_people = {}
    for academic in get_people(meetings,1):
        add_one(academic_people, tag_pair(academic_sorting_dictionary, academic[0]))

    #Meeting counts: per fellow tag pair, per academic tag pair and per combination of the two.
    fellow_meetings = {}
    academic_meetings = {}
    crossed_meetings = {}
    for meeting in meetings:
        fellow_pair = tag_pair(fellow_sorting_dictionary, meeting[1])
        academic_pair = tag_pair(academic_sorting_dictionary, meeting[4])
        add_one(fellow_meetings, fellow_pair)
        add_one(academic_meetings, academic_pair)
        add_one(crossed_meetings, (academic_pair, fellow_pair))

    #The whole table is built before the file is opened. Inside the retry loop a lookup error for an unused
    #tag pair used to be mistaken for a locked file and the "please close" prompt repeated forever.
    fellow_columns = [(primary, secondary) for primary in fellow_tag_dictionary.keys() for secondary in fellow_tag_dictionary[primary]]

    first_line = ['Primary Tag:','','','']
    second_line = ['','Secondary Tag:','','']
    third_line = ['','','Number of people:', '']
    fourth_line = ['','','','Number of meetings:']
    for fellow_pair in fellow_columns:
        first_line.append(fellow_pair[0])
        second_line.append(fellow_pair[1])
        third_line.append(str(fellow_people.get(fellow_pair, 0)))
        fourth_line.append(str(fellow_meetings.get(fellow_pair, 0)))

    table = [first_line, second_line, third_line, fourth_line]
    for academic_primary_tag in academic_tag_dictionary.keys():
        for academic_secondary_tag in academic_tag_dictionary[academic_primary_tag]:
            academic_pair = (academic_primary_tag, academic_secondary_tag)
            row = [academic_primary_tag, academic_secondary_tag, str(academic_people.get(academic_pair, 0)), str(academic_meetings.get(academic_pair, 0))]
            for fellow_pair in fellow_columns:
                row.append(str(crossed_meetings.get((academic_pair, fellow_pair), 0)))
            table.append(row)

    #Escaped backslashes: '\o' on its own is an invalid escape sequence.
    path = os.getcwd() + '\\output\\' + file_name +'.csv'

    while True:
        try:
            with open(path,'w+',newline ='') as myfile:
                wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
                wr.writerows(table)
            break
        except OSError:
            #Only file system problems (such as the file being open elsewhere) are worth retrying.
            input('Please close ' + file_name + ' when you have done so press enter.')

#This is the daisy wheel tool. Walks the user through inputting the tag data and combines it all to output the daisy wheel data
#in the form requested.
#Input: The downloaded list of meetings and affiliations.
#Output: Empty but creates a csv file with the daisy wheel data in.
            
def daisy_wheel_tool(meetings, affiliations):
    """Interactively walk the user through producing the daisy wheel csv.

    The flow is: ask for a date range, filter the meetings to it, then for
    the academics and for the fellows in turn let the user tag affiliations,
    review the tag system, review the per-person tags, and finally hand
    everything to output_daisy_wheel_data.

    Args:
        meetings: The downloaded meetings; passed unchanged to
            filter_meetings. (Exact structure is defined by that helper.)
        affiliations: The downloaded affiliations; passed unchanged to
            affiliate_people. (Exact structure is defined by that helper.)

    Returns:
        None. The result is the file written by output_daisy_wheel_data.

    Raises:
        KeyboardInterrupt, EOFError: No longer swallowed by the date prompts,
            so interrupting the kernel actually stops the tool.
    """
    print('Welcome to the Diasy Wheel Tool. To generate the daisy wheel data we are going to need your help!')
    print('')
    print('If this is not the first time you are running me, you might want to make a copy of the input/output files you used last time. Otherwise you might lose the data from then.')
    print('')

    #Get users time interval they want data from.

    start_date = _prompt_for_date(
        'Please input the first date you want to record meetings from (YYYY-MM-DD):',
        'Sorry, it looks like the date was in the wrong format. I need them YYYY-MM-DD, so the 14th January 2019 is 2019-01-14. Should we try again?')
    end_date = _prompt_for_date(
        'Please input the last date you want to record meetings to (YYYY-MM-DD):',
        'Sorry, it looks like the date was in the wrong format. I need them YYYY-MM-DD, so the 16th February 1993 is 1993-02-16. Should we try again?')
    filtered_meetings = filter_meetings(meetings, start_date, end_date)
    print('')
    print('There where', len(filtered_meetings), 'between those dates.')

    #Let the user assign the tags to academics as they would like.

    print('')
    print('Ok, got those meetings. Next we need to sort the Academics into their groups. I will just print the list of affiliations they have into a file in the inputs folder called academic_affiliations. If you go there, you can assign each affiliation a Primary and Secondary tag i.e. which department then school they are in. Any cell left blank I will assume goes in other. There is an example on the top line, do not worry this line will not get used.')
    print('')
    affiliated_academics = get_people(filtered_meetings,1)
    academic_sorting_dictionary, academic_tag_dictionary = _collect_group_tags(
        affiliated_academics, affiliations, 'academic', get_academic_department_dictionary)

    #Now move on to Fellows and do the same process again.

    print('Ok, That is the Academics sorted now the fellows. I will just print the list of affiliations they have into a file in the inputs folder called fellow_affiliations. If you go there, you can assign each affiliation a Primary and Secondary tag i.e. which sector then sub-sector they are in. Any cell left blank I will assume goes in other. There is an example on the top line, do not worry this line will not get used.')
    print('')
    affiliated_fellows = get_people(filtered_meetings,0)
    fellow_sorting_dictionary, fellow_tag_dictionary = _collect_group_tags(
        affiliated_fellows, affiliations, 'fellow', get_whitehall_department_dictionary)

    #Generate the data spread sheet for the user.

    print('Awesome, I have everything I need from you now it is up to me.')
    print('')
    data_name = 'daisy_wheel_data_' + start_date.strftime("%Y_%m_%d") + '_to_' + end_date.strftime("%Y_%m_%d")
    output_daisy_wheel_data(academic_sorting_dictionary, academic_tag_dictionary, fellow_sorting_dictionary, fellow_tag_dictionary, filtered_meetings,data_name)
    print('The data should be in the outputs folder under the name', data_name, 'hope this is all ok. Have a great day!')


#Keeps asking for a YYYY-MM-DD date until one parses, then returns it as a datetime.
def _prompt_for_date(prompt, retry_message):
    while True:
        try:
            #Strip stray spaces so an otherwise valid date is not rejected.
            return datetime.strptime(input(prompt).strip(),'%Y-%m-%d')
        except ValueError:
            #Only a badly formatted date should trigger a retry; KeyboardInterrupt and
            #EOFError must propagate, otherwise the user can never leave this loop.
            print(retry_message)


#Reloads the tags from file_name with load_tags and prints them until the user says they are happy.
#Returns the (sorting dictionary, tag dictionary) pair from the last load.
def _review_tag_system(load_tags, file_name):
    while True:
        #Reload on every pass so edits the user made to the file are picked up.
        sorting_dictionary, tag_dictionary = load_tags(file_name)
        print('Ok got that. I am going to run you through your tag system as it currently stands.')
        for primary_tag, secondary_tags in tag_dictionary.items():
            print('')
            print(primary_tag)
            for secondary_tag in secondary_tags:
                print('     -', secondary_tag)
        print('')
        response = input('Are you happy with this system of tags? If not you can go back change the file and I will upload it again. y/n: ').strip().lower()
        print('')
        if response in ('y', 'yes', 'yeah', 'ye'):
            return sorting_dictionary, tag_dictionary
        input('Ok, feel free to change the document just hit enter when you are done.')
        print('')


#Runs the full tagging flow for one group ('academic' or 'fellow'): affiliation tags first, then per-person tags.
#Returns the (sorting dictionary, tag dictionary) pair read back from the group's tagged people file.
def _collect_group_tags(people, affiliations, group, get_department_dictionary):
    affiliation_file = group + '_affiliations'
    people_file = group + '_tagged_people'

    affiliate_people(people,affiliations)
    #The department dictionary getter is called here, after affiliate_people, to keep the original call order.
    create_affiliation_csv(affiliation_file,get_affiliation_list(people),get_department_dictionary())
    input('Once your done filling that in just hit enter.')
    print('')

    #Check user is ok with there tag system, if not let them go back and change it.
    #Only the sorting dictionary from this pass is used; the tag dictionary is replaced by the review below.

    sorting_dictionary, _ = _review_tag_system(export_affiliation_csv, affiliation_file)

    #Assign the affiliations to the people and let the user review the process.

    print('Great! I am just going to try and match ' + group + 's to their tag, if they have no affiliation on the database I will add them as other. If they have multiple tags I will try to add the most frequently occuring one.')
    print('')
    create_people_csv(people,people_file,sorting_dictionary)
    input('Ok, created it. If you look in the file called ' + people_file + ' in the inputs folder you can see how I have allocated the tags. If there is anything you are unhappy about just change the primary and secondary tag. Let me know when you are done by hitting enter.')
    print('')

    #Let the user review the current tag system.

    return _review_tag_system(export_people_csv, people_file)
    
#Launch the interactive daisy wheel tool. `meetings` and `affiliations` are not defined in this cell;
#presumably they are loaded by an earlier cell, which must be run first - TODO confirm.
daisy_wheel_tool(meetings,affiliations)

Welcome to the Diasy Wheel Tool. To generate the daisy wheel data we are going to need your help!

If this is not the first time you are running me, you might want to make a copy of the input/output files you used last time. Otherwise you might lose the data from then.

Please input the first date you want to record meetings from (YYYY-MM-DD):2018-01-01
Please input the last date you want to record meetings to (YYYY-MM-DD):2018-12-31

There where 1421 between those dates.

Ok, got those meetings. Next we need to sort the Academics into their groups. I will just print the list of affiliations they have into a file in the inputs folder called academic_affiliations. If you go there, you can assign each affiliation a Primary and Secondary tag i.e. which department then school they are in. Any cell left blank I will assume goes in other. There is an example on the top line, do not worry this line will not get used.

Once your done filling that in just hit enter.

Ok got that. I am going to 