In [1]:
import pandas as pd
import random
import networkx as nx
import matplotlib.pyplot as plt
import pickle
from pyvis.network import Network
import community.community_louvain as community_louvain
from tqdm import tqdm

---

## Importation des données

In [2]:
project = pd.read_csv("../Data/CSV/Projects/project.csv", sep=";")

In [3]:
organization = pd.read_csv("../Data/CSV/Projects/organization.csv", sep=";")

On importera les pubications après

In [4]:
print(project.shape)
print(organization.shape)

(35381, 20)
(177190, 25)


---

## Création du graph

On récupère les ID unique des projets

In [6]:
project_id = list(project["id"].unique())

Pour chaque projet je récupère les id des organisations qui y participent. Tout ça dans un dictionnaire.

In [7]:
projet_orga = {}
for projet in project_id:
    projet_orga[projet] = list(organization[organization["projectID"] == projet]["organisationID"])

Création d'un graph vide

In [8]:
G = nx.Graph()

Fonction de création du graph (prendre plus d'une heure à faire tourner)

In [9]:
########### /!\ TOURNE INDEFINIEMENT /!\ ##############

yes_counter = 0
for i, tq in zip(range(1, len(projet_orga.keys())), tqdm(range(len(projet_orga.keys())))):
#     print(projet_orga[project_id[i]])
    for t in range(len(projet_orga.keys())):
#         G.add_node(project_id[i - 1])
        G.add_node(project_id[i - 1], label=project_id[i - 1])
        for y in projet_orga[project_id[i - 1]]:
#         print(f"i: {i} | i+1: {projet_orga[project_id[i+1]]} | y: {y}")
            if G.has_edge(project_id[i - 1], project_id[t]) is False:
                for x in projet_orga[project_id[t]]:
            
                    if y == x:
                        yes_counter += 1
                        G.add_edge(project_id[i-1], project_id[t])

G.remove_edges_from(nx.selfloop_edges(G))
# On supprime les boucles du graph car cela n'apporte pas d'information et cela prend de la place

  0%|                                                                              | 8/35381 [00:02<3:31:10,  2.79it/s]


KeyboardInterrupt: 

Pour importer un graph sauvegardé au format .pickle

In [10]:
 G = pickle.load(open("./graphs_save/graphe_3_labeled.pickle", "rb"))

In [11]:
G.number_of_nodes()

35381

In [12]:
G.number_of_edges()

9982209

---

## Détection de communauté

In [13]:
partition = community_louvain.best_partition(G)

In [19]:
print(f"Nombre de communautés : {len(set(partition.values()))}")

Nombre de communautés : 4377


On ajoute les communauté en tant qu'attributs aux noeuds du graph

In [34]:
nx.set_node_attributes(G, partition, 'partition')

Pour sauvegarder le graph au format .pickle

In [35]:
pickle.dump(G, open('./graphs_save/graphe_3_labeled.pickle', 'wb'))

Pour sauvegarder le graph en format gexf pour l'importer sur Gephi

In [None]:
nx.write_gexf(G, "./graphs_save/graphe_3_labeled.gexf")

On crée un DataFrame avec en index les id des noeuds et leur communauté

In [14]:
partition_df = pd.DataFrame.from_dict(partition, orient="index").astype({0:"int32"})

On ajoute les communautés au DataFrame de données

In [17]:
project = project.join(partition_df, on="id")
project.rename({0:"Community"}, axis=1, inplace=True)

In [18]:
project.head(2)

Unnamed: 0,id,acronym,status,title,startDate,endDate,totalCost,ecMaxContribution,legalBasis,topics,...,frameworkProgramme,masterCall,subCall,fundingScheme,nature,objective,contentUpdateDate,rcn,grantDoi,Community
0,817296,eDCaseMAN,CLOSED,"Defendable legal cases through an affordable, ...",2018-05-01,2018-09-30,71429,50000,H2020-EU.2.3.,EIC-SMEInst-2018-2020,...,H2020,H2020-EIC-SMEInst-2018-2020,H2020-SMEInst-2018-2020-1,SME-1,,"""Electronic discovery (known as e-discovery) i...",2022-08-10 13:07:21,217713,10.3030/817296,0
1,764717,WinWind,CLOSED,Winning social acceptance for wind energy in w...,2017-10-01,2020-03-31,21244625,21244625,H2020-EU.3.3.,LCE-21-2017,...,H2020,H2020-LCE-2016-2017,H2020-LCE-2017-RES-CSA,CSA,,The overall objective of WinWind is to enhance...,2022-08-17 10:54:17,211548,10.3030/764717,1


On calcule la proportion du chaque communauté par rapport à tous le data set 

In [32]:
proportions = project['Community'].value_counts(normalize=True).to_frame()
proportions.rename({"Community": "Proportion"}, inplace=True, axis=1)

In [33]:
proportions

Unnamed: 0,Proportion
1,0.250276
4,0.094599
16,0.073034
12,0.068596
7,0.063763
...,...
1639,0.000028
1640,0.000028
1641,0.000028
1642,0.000028


On récupère les index (id des projets) pour lesquels la proportion du label est supérieur à 1 

In [41]:
index_label = proportions[proportions["Proportion"] > 0.01].index.to_list()

In [42]:
index_label

[1, 4, 16, 12, 7, 3, 9, 2, 8, 10, 6, 15, 14, 13, 1087]

Pour créer un DataFrame avec que ces index (id)

In [44]:
project_reduced = project[project["Community"].isin(index_label)].copy()

In [45]:
project_reduced.shape

(30393, 21)

Pour récupèrer les autres (pour changer la communauté à autres)

In [46]:
index_label = proportions[proportions["Proportion"] <= 0.01].index.to_list()

In [62]:
def community_filter(x: int):
    if x in index_label:
        return -99
    else:
        return x

In [63]:
project_prepared = project.copy()
project_prepared["Community"] = project["Community"].apply(community_filter)

In [65]:
project_prepared.head(10)

Unnamed: 0,id,acronym,status,title,startDate,endDate,totalCost,ecMaxContribution,legalBasis,topics,...,frameworkProgramme,masterCall,subCall,fundingScheme,nature,objective,contentUpdateDate,rcn,grantDoi,Community
0,817296,eDCaseMAN,CLOSED,"Defendable legal cases through an affordable, ...",2018-05-01,2018-09-30,71429,50000,H2020-EU.2.3.,EIC-SMEInst-2018-2020,...,H2020,H2020-EIC-SMEInst-2018-2020,H2020-SMEInst-2018-2020-1,SME-1,,"""Electronic discovery (known as e-discovery) i...",2022-08-10 13:07:21,217713,10.3030/817296,-99
1,764717,WinWind,CLOSED,Winning social acceptance for wind energy in w...,2017-10-01,2020-03-31,21244625,21244625,H2020-EU.3.3.,LCE-21-2017,...,H2020,H2020-LCE-2016-2017,H2020-LCE-2017-RES-CSA,CSA,,The overall objective of WinWind is to enhance...,2022-08-17 10:54:17,211548,10.3030/764717,1
2,784994,PentaHelix,CLOSED,Multi stakeholder and governance approach for ...,2018-03-01,2021-09-30,180834375,180834375,H2020-EU.3.3.,EE-09-2016-2017,...,H2020,H2020-EE-2016-2017,H2020-EE-2017-CSA-PPI,CSA,,The PentaHelix project is focusing on developi...,2022-04-11 16:42:12,213566,10.3030/784994,1
3,841546,RES,TERMINATED,Rights for Ecosystem Services (RES): a framewo...,2020-01-13,2022-01-12,22493376,22493376,H2020-EU.1.3.,MSCA-IF-2018,...,H2020,H2020-MSCA-IF-2018,H2020-MSCA-IF-2018,MSCA-IF-EF-ST,,Is currentIs current legal protection adequate...,2021-12-30 09:44:32,222283,10.3030/841546,1
4,716923,LEVIATHAN,CLOSED,Taming the Leviathan? Legal and Political Acco...,2017-08-01,2022-01-31,1184595,1184595,H2020-EU.1.1.,ERC-2016-STG,...,H2020,ERC-2016-STG,ERC-2016-STG,ERC-STG,,The Euro Crisis has had a transformative effec...,2022-09-08 18:14:15,206991,10.3030/716923,15
5,814427,WIRE2018,CLOSED,Smart Choices für innovative regional ecosyste...,2018-02-01,2019-01-31,461125,250000,H2020-EU.4.f.,IBA-SEWP-WIRE-9-2018,...,H2020,H2020-IBA-SEWP-WIRE-9-2018,H2020-IBA-SEWP-WIRE-9-2018,CSA,,The 9th Week of Innovative Regions in Europe (...,2022-08-12 17:16:10,217803,10.3030/814427,1
6,751782,GLEC-LAW,CLOSED,Global Ecological Custodianship: Innovative In...,2018-01-04,2020-01-03,1834548,1834548,H2020-EU.1.3.,MSCA-IF-2016,...,H2020,H2020-MSCA-IF-2016,H2020-MSCA-IF-2016,MSCA-IF-EF-ST,,"International environmental law (IEL), with mu...",2022-08-17 10:35:28,209643,10.3030/751782,3
7,727745,Trillium II,CLOSED,Trillium Bridge II - Reinforcing the Bridges a...,2017-01-01,2019-06-30,11045475,100000125,H2020-EU.3.1.,SC1-HCO-14-2016,...,H2020,H2020-SC1-2016-2017,H2020-SC1-2016-CNECT,CSA,,Trillium-II steps forward with an outstanding ...,2022-09-04 00:39:58,206099,10.3030/727745,1
8,740634,PATHS,SIGNED,The Paths of International Law: Stability and ...,2017-10-01,2023-03-31,2475275,2475275,H2020-EU.1.1.,ERC-2016-ADG,...,H2020,ERC-2016-ADG,ERC-2016-ADG,ERC-ADG,,International law erects high hurdles for chan...,2022-08-16 00:37:09,210541,10.3030/740634,4
9,771082,DRONETHICS,SIGNED,Emergent Ethics of Drone Violence: Toward a Co...,2018-07-01,2023-06-30,1359348,1359348,H2020-EU.1.1.,ERC-2017-COG,...,H2020,ERC-2017-COG,ERC-2017-COG,ERC-COG,,"The increasing use of armed, uninhabited aircr...",2022-07-26 12:03:19,216034,10.3030/771082,1
