In [1]:
import pandas as pd
import random
import networkx as nx
import matplotlib.pyplot as plt
import pickle
from pyvis.network import Network
import community.community_louvain as community_louvain
from tqdm import tqdm

---

## Importation des données

In [2]:
project = pd.read_csv("../Data/CSV/Projects/project.csv", sep=";")

In [3]:
organization = pd.read_csv("../Data/CSV/Projects/organization.csv", sep=";")

On importera les pubications après

In [4]:
print(project.shape)
print(organization.shape)

(35381, 20)
(177190, 25)


---

## Création du graph

On récupère les ID unique des projets

In [6]:
project_id = list(project["id"].unique())

Pour chaque projet je récupère les id des organisations qui y participent. Tout ça dans un dictionnaire.

In [7]:
projet_orga = {}
for projet in project_id:
    projet_orga[projet] = list(organization[organization["projectID"] == projet]["organisationID"])

Création d'un graph vide

In [8]:
G = nx.Graph()

Fonction de création du graph (prendre plus d'une heure à faire tourner)

In [9]:
########### /!\ TOURNE INDEFINIEMENT /!\ ##############

yes_counter = 0
for i, tq in zip(range(1, len(projet_orga.keys())), tqdm(range(len(projet_orga.keys())))):
#     print(projet_orga[project_id[i]])
    for t in range(len(projet_orga.keys())):
#         G.add_node(project_id[i - 1])
        G.add_node(project_id[i - 1], label=project_id[i - 1])
        for y in projet_orga[project_id[i - 1]]:
#         print(f"i: {i} | i+1: {projet_orga[project_id[i+1]]} | y: {y}")
            if G.has_edge(project_id[i - 1], project_id[t]) is False:
                for x in projet_orga[project_id[t]]:
            
                    if y == x:
                        yes_counter += 1
                        G.add_edge(project_id[i-1], project_id[t])

G.remove_edges_from(nx.selfloop_edges(G))
# On supprime les boucles du graph car cela n'apporte pas d'information et cela prend de la place

  0%|                                                                              | 8/35381 [00:02<3:31:10,  2.79it/s]


KeyboardInterrupt: 

Pour sauvegarder le graph au format .pickle

In [None]:
pickle.dump(G, open('./graphs_save/graphe_3_labeled.pickle', 'wb'))

Pour importer un graph sauvegardé au format .pickle

In [10]:
 G = pickle.load(open("./graphs_save/graphe_3_labeled.pickle", "rb"))

In [11]:
G.number_of_nodes()

35381

In [12]:
G.number_of_edges()

9982209

---

## Détection de communauté

In [13]:
partition = community_louvain.best_partition(G)

In [19]:
print(f"Nombre de communautés : {len(set(partition.values()))}")

Nombre de communautés : 4377


On ajoute les communauté en tant qu'attributs aux noeuds du graph

In [None]:
nx.set_node_attributes(G, partition, 'partition')

Pour sauvegarder le graph en format gexf pour l'importer sur Gephi

In [None]:
nx.write_gexf(G, "./graphs_save/graphe_3_labeled.gexf")

On crée un DataFrame avec en index les id des noeuds et leur communauté

In [14]:
partition_df = pd.DataFrame.from_dict(partition, orient="index").astype({0:"int32"})

On ajoute les communautés au DataFrame de données

In [17]:
project = project.join(partition_df, on="id")
project.rename({0:"Community"}, axis=1, inplace=True)

In [18]:
project.head(2)

Unnamed: 0,id,acronym,status,title,startDate,endDate,totalCost,ecMaxContribution,legalBasis,topics,...,frameworkProgramme,masterCall,subCall,fundingScheme,nature,objective,contentUpdateDate,rcn,grantDoi,Community
0,817296,eDCaseMAN,CLOSED,"Defendable legal cases through an affordable, ...",2018-05-01,2018-09-30,71429,50000,H2020-EU.2.3.,EIC-SMEInst-2018-2020,...,H2020,H2020-EIC-SMEInst-2018-2020,H2020-SMEInst-2018-2020-1,SME-1,,"""Electronic discovery (known as e-discovery) i...",2022-08-10 13:07:21,217713,10.3030/817296,0
1,764717,WinWind,CLOSED,Winning social acceptance for wind energy in w...,2017-10-01,2020-03-31,21244625,21244625,H2020-EU.3.3.,LCE-21-2017,...,H2020,H2020-LCE-2016-2017,H2020-LCE-2017-RES-CSA,CSA,,The overall objective of WinWind is to enhance...,2022-08-17 10:54:17,211548,10.3030/764717,1


On calcule la proportion du chaque communauté par rapport à tous le data set 

In [32]:
proportions = project['Community'].value_counts(normalize=True).to_frame()
proportions.rename({"Community": "Proportion"}, inplace=True, axis=1)

In [33]:
proportions

Unnamed: 0,Proportion
1,0.250276
4,0.094599
16,0.073034
12,0.068596
7,0.063763
...,...
1639,0.000028
1640,0.000028
1641,0.000028
1642,0.000028
