### Import libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import os
import codecs
import collections
import nltk
import pickle

### Load the list of companies

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open('List_private_spaceflight_company.pickle', mode='rb') as fp:
    companieslist = pickle.load(fp)

# "complist" maps each company name to its integer node ID (1, 2, 3, ...),
# numbered in the order the names appear in "companieslist".
complist = {}
i = 0  # left holding the last ID handed out (0 when the list is empty)
for i, company in enumerate(companieslist, start=1):
    complist[company] = i

### Initialization of the location list

In [3]:
localisation = ['Mercury', 'Venus', 'Moon', 'Mars',
                'Jupiter', 'Saturn', 'Uranus', 'Neptune']

# "loclist" maps each location name to its integer node ID.
# Location IDs continue right after the highest company ID so that a company
# and a location never share an ID. The first ID is derived from "complist"
# itself rather than from a loop counter left behind by another cell, so
# re-running this cell (or running cells out of order) yields the same IDs.
first_loc_id = max(complist.values(), default=0) + 1
loclist = {loc: first_loc_id + offset
           for offset, loc in enumerate(localisation)}

### Initialization of the node list used in Gephi

In [4]:
# Node table for Gephi: one row per company, followed by one row per location.
# The column order (Id, Label, Type) follows the key order of this dictionary.
gephinodelist = {'Id': [], 'Label': [], 'Type': []}

for node_type, id_by_label in (('company', complist), ('localisation', loclist)):
    for label, node_id in id_by_label.items():
        gephinodelist['Label'].append(label)
        gephinodelist['Id'].append(node_id)
        gephinodelist['Type'].append(node_type)

df_1 = pd.DataFrame(gephinodelist)
df_1

Unnamed: 0,Id,Label,Type
0,1,ARCA Space Corporation,company
1,2,Australian Space Research Institute,company
2,3,Ventions,company
3,4,Blue Origin,company
4,5,Canadian Arrow,company
5,6,Exos Aerospace,company
6,7,Firefly Aerospace,company
7,8,Gilmour Space Technologies,company
8,9,Generation Orbit Launch Services,company
9,10,Independence-X Aerospace,company


In [6]:
# Write the node table (including the DataFrame index column) to a spreadsheet.
df_1.to_excel(excel_writer='NodesList_Gephi.xlsx')

### Functions

In [74]:
def analyse_html(html, companieslist, locations=None):
    """Find which companies and locations are mentioned in one article page.

    The text is taken from the <p> elements inside every
    <div id="article-body"> of the page; the last <p> of each such div is
    skipped (presumably a non-content trailer -- TODO confirm).

    Parameters
    ----------
    html : str
        Raw HTML of the page.
    companieslist : iterable of str
        Company names to look for. A company counts as mentioned when its
        name occurs anywhere in the text as a substring.
    locations : iterable of str, optional
        Location names to look for. A location counts as mentioned when it
        equals one of the tokens produced by nltk.word_tokenize. Defaults to
        the notebook-level "localisation" list.

    Returns
    -------
    (set of str, set of str)
        The mentioned company names and the mentioned location names.
    """
    if locations is None:
        locations = localisation
    location_names = set(locations)

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so the same page
    # could be parsed differently on another machine.
    soup = BeautifulSoup(html, 'html.parser')

    paragraphs = []
    for article in soup.find_all('div', id='article-body'):
        # [:-1] drops the last paragraph of the article body.
        paragraphs.extend(p.get_text() for p in article.find_all('p')[:-1])

    # Every paragraph is preceded by a single space (empty string if none).
    item = ' '.join([''] + paragraphs)

    comps = {company for company in companieslist if company in item}

    # Requires the nltk tokenizer data to be available locally.
    tokens = nltk.word_tokenize(item)
    locs = {token for token in tokens if token in location_names}

    return comps, locs

In [79]:
# Edge table for Gephi: one row per distinct (company, location) relation.
gephiedgeslist = {'Source': [], 'Target': [], 'ID': [], 'Type': [], 'Interval': [], 'Weight': []}

# Maps each relation key "<company ID>-<location ID>" to its integer edge ID,
# e.g. {'13-30': 1, '13-29': 2, '22-29': 3, '22-30': 4}: relation 13-30 has ID 1.
id_relationlist = {}

ite = 0      # last edge ID handed out; edge IDs start at 1
weight = {}  # edge ID -> number of articles mentioning the pair in the current year

# A dedicated loop variable is used for the year so that the notebook-level
# counter `i` (used when node IDs are assigned) is not overwritten here.
for year in range(2010, 2019):  # one directory per year, 2010 through 2018
    print(year)

    year_dir = 'US02_Python_Project_HTML/Ar_' + str(year) + '/'
    # os.listdir returns entries in arbitrary order; sorting makes the order in
    # which edge IDs are handed out reproducible between runs.
    for file in sorted(os.listdir(year_dir)):
        if not file.endswith('.html'):
            continue
        with open(year_dir + file, encoding='utf-8') as fp:
            html = fp.read()
        comps, locs = analyse_html(html, companieslist)

        # Replace names by node IDs. The sets are sorted because set iteration
        # order over strings can differ from one kernel session to the next.
        indice_complist = sorted(complist[comp] for comp in comps)
        indice_loclist = sorted(loclist[loc] for loc in locs)

        # Nothing happens when either list is empty.
        for comp in indice_complist:
            for loc in indice_loclist:
                relation = str(comp) + '-' + str(loc)  # e.g. '13-30'
                if relation in id_relationlist:
                    # Known relation: one more article for this year.
                    weight[id_relationlist[relation]] += 1
                    continue

                # New relation: give it the next edge ID and add its row.
                ite += 1
                id_relationlist[relation] = ite

                gephiedgeslist['Source'].append(comp)
                gephiedgeslist['Target'].append(loc)
                gephiedgeslist['ID'].append(ite)
                gephiedgeslist['Type'].append('Directed')
                gephiedgeslist['Interval'].append('[2010,2019]')
                gephiedgeslist['Weight'].append(0)  # placeholder until year end
                weight[ite] = 1

    # End of year: append this year's count to the "Weight" cell of every known
    # edge, producing a string such as
    # '[2010, 2011, 2]; [2011, 2012, 15]; [2012, 2013, 23]...'
    # (weight 2 in 2010, 15 in 2011, ...). Edges with no article this year get
    # a segment with weight 0.
    for id_re, we in weight.items():
        segment = '[' + str(year) + ', ' + str(year + 1) + ', ' + str(we) + ']'
        # Rows were appended in edge-ID order, so edge ID n is row n - 1.
        if gephiedgeslist['Weight'][id_re - 1] == 0:
            # Still the integer placeholder: this is the edge's first segment.
            gephiedgeslist['Weight'][id_re - 1] = segment
        else:
            gephiedgeslist['Weight'][id_re - 1] += '; ' + segment
        weight[id_re] = 0  # restart the count for the next year
        

2010
2011
2012
2013
2014
2015
2016
2017
2018


In [1]:
# Edge table as a DataFrame; columns follow the key order of "gephiedgeslist".
df_2 = pd.DataFrame(gephiedgeslist)
df_2

In [81]:
# Write the edge table (including the DataFrame index column) to a spreadsheet.
df_2.to_excel(excel_writer='EdgesList_Gephi.xlsx')