# Analisi della social network tra istituti di ricerca e università.
L'obiettivo di questa analisi è quello di individuare se, all'interno della comunità scientifica, esistano
dei gruppi naturali (comunità) tra i diversi istituti di ricerca nel campo dell'Energia.

In [9]:
# Importiamo tutte le dipendenze

from importlib import reload
from datetime import datetime
from pathlib import Path
from tqdm.notebook import tqdm
from utils.graphing import FigSize, GLAYOUTS
import utils.graphing as graphing
import utils.metrics as metrics
import networkx as nx
import os
import pandas as pd
import utils.preproc as preproc
import warnings
import ipdb


# Edge list of the citation graph; it is read with pandas further down.
CITATIONS_DIRECTED_GRAPH = "./data/cit-HepTh.txt"
# Directory that is scanned recursively for the paper abstract files.
CITATIONS_ABSTRACTS_DIR = "./data/cit-HepTh-abstracts"

# CSV tables loaded as `ror` / `universities` and handed to preproc.extract_domain.
ROR_DATA = "./data/ror-data.csv"
UNIVERSITIES_DATA = "./data/all_universities.csv"

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

Questa riga genera il session_id. Se lo sovrascrivi, si intende che hai modificato il dataset:
il resto del codice non farà più affidamento sulla sessione precedente e quindi alcuni file
vanno rigenerati.

Se invece vuoi usare una sessione precedente, usa il blocco sotto e definisci manualmente il numero di sessione.


In [None]:
# Start a NEW session: the id is the current time formatted as YYMMDDhhmm.
s = datetime.now().strftime("%y%m%d%H%M")
session_id = str(s)
# Everything produced by this session is stored under its own directory.
SESSION_PATH = "data/sessions/" + session_id
os.makedirs(SESSION_PATH, exist_ok=True)

In [10]:
# Reuse an EXISTING session: its id is typed in by hand instead of generated.
session_id = "2511212247"
SESSION_PATH = "data/sessions/" + session_id

# Preprocessamento
Eseguiamo le operazioni preliminari di caricamento dei dati.

`citations` contiene il grafo diretto, con le colonne `source` e `target`.

In [None]:
# Parse every abstract file into one record per paper.
records = []

for abp in tqdm(Path(CITATIONS_ABSTRACTS_DIR).rglob("*")):
    # rglob("*") also yields directories; only regular files are read.
    if not abp.is_file():
        continue

    # Undecodable bytes are ignored instead of aborting the whole scan.
    with open(abp, "r", encoding="utf-8", errors="ignore") as f:
        # Not named `abs`: that would shadow the builtin for the whole notebook.
        abstract_text = f.read()

    # preproc is the utils.preproc module imported at the top of the notebook.
    fields = preproc.extract_fields(abstract_text)

    # Any result that is not a dict is skipped.
    if isinstance(fields, dict):
        # The file stem is the paper id; the extracted fields are merged on top.
        data = {"id": abp.stem}
        data.update(fields)
        records.append(data)

papers = pd.DataFrame(records)
del records  # free the intermediate list once the DataFrame exists

Mapping dei paper alle rispettive università

In [None]:
# Load the ROR table and derive the host part of every link.
ror = pd.read_csv(ROR_DATA)
link_without_scheme = ror["links"].str.replace(r"^https?://", "", regex=True)
# Keep only what precedes the first "/".
ror["clean_url"] = link_without_scheme.str.split("/").str[0]
# Last two dot-separated labels of the host (the pattern is anchored at the end).
ror["tld2"] = ror["clean_url"].str.extract(r"([a-zA-Z0-9-]+\.[a-zA-Z0-9-]+)$")

universities = pd.read_csv(UNIVERSITIES_DATA)

# str(paper id) -> whatever preproc.extract_domain returns for the paper's e-mail.
domain_mapping = {}
for row in tqdm(papers.itertuples()):
    domain_mapping[str(row.id)] = preproc.extract_domain(row.email, ror, universities)

In [None]:
# Read the file as an edge list: lines starting with '#' are skipped and any
# run of whitespace separates the fields.

cit_hepth = pd.read_csv(
    CITATIONS_DIRECTED_GRAPH,
    sep=r"\s+",
    comment="#",
    header=None,
    engine="python",
)

# The first two columns are taken as source/target and coerced to numbers.
citations = cit_hepth.iloc[:, :2].copy()
citations.columns = ["source", "target"]
for endpoint in ("source", "target"):
    citations[endpoint] = pd.to_numeric(citations[endpoint])

del cit_hepth  # no longer needed

# Independent copies; their endpoints are replaced by labels further down.
citations_uni = citations.copy()
citations_country = citations.copy()


def safe_get_name(x, mapping=None):
    """Return the ``"name"`` stored for key *x*, or ``None``.

    Args:
        x: Key to look up (the notebook passes paper ids as strings).
        mapping: Optional dict to look *x* up in. When omitted, the
            notebook-level ``domain_mapping`` is used, exactly as before.

    Returns:
        ``entry.get("name")`` when the entry for *x* is a dict; ``None`` when
        *x* is missing or its entry is not a dict.
    """
    if mapping is None:
        # Resolved at call time, so a rebuilt domain_mapping is picked up.
        mapping = domain_mapping
    v = mapping.get(x)
    if isinstance(v, dict):
        return v.get("name")
    return None


def safe_get_country(x, mapping=None):
    """Return the ``"country"`` stored for key *x*, or ``None``.

    Args:
        x: Key to look up (the notebook passes paper ids as strings).
        mapping: Optional dict to look *x* up in. When omitted, the
            notebook-level ``domain_mapping`` is used, exactly as before.

    Returns:
        ``entry.get("country")`` when the entry for *x* is a dict; ``None``
        when *x* is missing or its entry is not a dict.
    """
    if mapping is None:
        # Resolved at call time, so a rebuilt domain_mapping is picked up.
        mapping = domain_mapping
    v = mapping.get(x)
    if isinstance(v, dict):
        return v.get("country")
    return None


# Ids are turned back into strings because domain_mapping is keyed by str(id).
# NOTE(review): the ids were parsed as numbers before this point, so a key that
# carried leading zeros would no longer match here — confirm the key format.
source_ids = citations["source"].astype(str)
target_ids = citations["target"].astype(str)

# Edge list whose endpoints are the "name" entries (missing where unresolved).
citations_uni["source"] = source_ids.map(safe_get_name)
citations_uni["target"] = target_ids.map(safe_get_name)

# Edge list whose endpoints are the "country" entries, built the same way.
citations_country["source"] = source_ids.map(safe_get_country)
citations_country["target"] = target_ids.map(safe_get_country)

In [None]:
# Show three random rows of the name-level edge list that have no missing value.
citations_uni.dropna().sample(n=3)

In [None]:
# Show three random rows of the country-level edge list that have no missing value.
citations_country.dropna().sample(n=3)

# SALVATAGGIO o CARICAMENTO

## salvataggio

In [None]:
# Persist both edge lists in the session directory (row index not written).
citations_uni.to_csv(f"{SESSION_PATH}/citations-uni.csv", index=False)
citations_country.to_csv(f"{SESSION_PATH}/citations-country.csv", index=False)

In [None]:
# Persist the parsed paper records in the session directory (row index not written).
papers.to_csv(f"{SESSION_PATH}/papers.csv", index=False)

## caricamento

In [15]:
# Reload both edge lists from the session directory instead of recomputing them.
citations_uni = pd.read_csv(f"{SESSION_PATH}/citations-uni.csv")
citations_country = pd.read_csv(f"{SESSION_PATH}/citations-country.csv")

In [12]:
# Reload the paper records from the session directory.
# NOTE(review): column dtypes are re-inferred by read_csv, so `id` may no longer
# be a string after this round-trip — confirm if later code depends on it.
papers = pd.read_csv(f"{SESSION_PATH}/papers.csv")

# Grafi

In [30]:
# For testing and development of the helper libraries: re-run this cell every
# time a library has been updated.

# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0


graphing = reload(graphing)

## Circular Layout

In [None]:
# Circular layout of the edge-collapsed graph.
name = "circular-wpg"
pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.circular
pos = lay(wpg)
data = graphing.gen_graph_data(wpg, pos)
# The figure is saved under the session directory, named after `name`.
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}")

## ARF Layout

In [None]:
# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0
from importlib import reload  # Python 3.4+
graphing = reload(graphing)
# ARF layout of the edge-collapsed graph.
name = "graph-arf-wpg"
pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.arf
pos = lay(wpg)
data = graphing.gen_graph_data(wpg, pos)
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}")

In [None]:
# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0
from importlib import reload  # Python 3.4+
graphing = reload(graphing)
# Bipartite layout. The name used to be "graph-bfs", identical to the BFS
# cell's name, so the two plots were written to the same save path.
name = "graph-bipartite"

pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
# NOTE(review): wpg is computed but this cell lays out and plots pg — confirm
# which graph is intended.
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.bipartite
pos = lay(pg)
data = graphing.gen_default(pg, pos)
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}")

In [None]:
# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0
from importlib import reload  # Python 3.4+
graphing = reload(graphing)
# BFS layout.
name = "graph-bfs"
pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
# NOTE(review): wpg is computed but this cell lays out and plots pg — confirm
# which graph is intended.
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.bfs
pos = lay(pg)
data = graphing.gen_default(pg, pos)
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}")

In [None]:
# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0
from importlib import reload  # Python 3.4+
graphing = reload(graphing)
# Kamada layout of the edge-collapsed graph, using the "w" edge attribute as weight.
name = "kamada-wpg"
pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.kamada
pos = lay(wpg, weight="w")
data = graphing.gen_graph_data(wpg, pos)
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}")

In [None]:
# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0
from importlib import reload  # Python 3.4+
graphing = reload(graphing)
# Planar layout.
name = "planar"
pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
# NOTE(review): wpg is computed but this cell lays out and plots pg — confirm
# which graph is intended.
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.planar
pos = lay(pg)
data = graphing.gen_default(pg, pos)
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}")

In [None]:
# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0
from importlib import reload  # Python 3.4+
graphing = reload(graphing)
# Spring layout (default method) of the edge-collapsed graph, weighted by "w".
name = "spring-base-wpg"
pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.spring
pos = lay(wpg, weight="w")
data = graphing.gen_graph_data(wpg, pos)
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}")

In [None]:
# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0
from importlib import reload  # Python 3.4+
graphing = reload(graphing)
# Spring layout with method="force" of the edge-collapsed graph, weighted by "w".
name = "spring-force-wpg"
pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.spring
pos = lay(wpg, weight="w", method="force")
data = graphing.gen_graph_data(wpg, pos)
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}")

In [None]:
# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0
from importlib import reload  # Python 3.4+
graphing = reload(graphing)
# Spring layout with method="energy" of the edge-collapsed graph, weighted by "w".
name = "spring-energy-wpg"
pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.spring
pos = lay(wpg, weight="w", method="energy")
data = graphing.gen_graph_data(wpg, pos)
# This is the only layout cell that passes show_labels=False.
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}", show_labels=False)

In [None]:
# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0
from importlib import reload  # Python 3.4+
graphing = reload(graphing)
# Spiral layout with resolution=1.
name = "spiral"
pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
# NOTE(review): wpg is computed but this cell lays out and plots pg — confirm
# which graph is intended.
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.spiral
pos = lay(pg, resolution=1)
data = graphing.gen_graph_data(pg, pos)
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}")

In [None]:
# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0
from importlib import reload  # Python 3.4+
graphing = reload(graphing)
# Spiral layout with resolution=1 and equidistant=True.
name = "spiral-equidistant"
pg = nx.DiGraph()
# add_edges / edge_collapse are not imported by name in this notebook, so they
# are reached through the graphing module (the bare names raise NameError).
graphing.add_edges(pg, citations_uni)
# NOTE(review): wpg is computed but this cell lays out and plots pg — confirm
# which graph is intended.
wpg = graphing.edge_collapse(pg, nx.DiGraph)
lay = graphing.GLAYOUTS.spiral
pos = lay(pg, resolution=1, equidistant=True)
data = graphing.gen_graph_data(pg, pos)
graphing.plot_graph(data, save_path=f"{SESSION_PATH}/{name}")

Visualizzazione del grafo

In [10]:
# Size of the name-level graph: distinct endpoints, complete edges, self loops.
complete_pairs = citations_uni[["source", "target"]].dropna()
unique = len(pd.unique(complete_pairs.values.ravel("K")))
# A self loop is a row whose two endpoints are equal.
same_endpoint = citations_uni["source"] == citations_uni["target"]
self_loops = len(citations_uni[same_endpoint].dropna())
edges = len(citations_uni.dropna())
print(f"Abbiamo {unique} universita e centri di ricerca")
print(f"        {edges} archi")
print(f"        {self_loops} self loops")

Abbiamo 560 universita e centri di ricerca
        153881 archi
        21490 self loops


In [11]:
# Size of the country-level graph: distinct endpoints, complete edges, self loops.
complete_pairs = citations_country[["source", "target"]].dropna()
unique = len(pd.unique(complete_pairs.values.ravel("K")))
# A self loop is a row whose two endpoints are equal.
same_endpoint = citations_country["source"] == citations_country["target"]
self_loops = len(citations_country[same_endpoint].dropna())
edges = len(citations_country.dropna())
print(f"Abbiamo {unique} stati")
print(f"        {edges} archi")
print(f"        {self_loops} self loops")

Abbiamo 63 stati
        153881 archi
        57976 self loops


In [None]:
# Number of non-missing values per column of the name-level edge list.
citations_uni.count()

## Metriche

Qua calcoliamo:
- closeness centrality
- degree centrality
- betweenness centrality
- eigenvector centrality

### Definizione funzioni

In [41]:
# For testing and development of the helper libraries: re-run this cell every
# time a library has been updated.

# Source - https://stackoverflow.com/a/437591
# Posted by cdleary, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-21, License - CC BY-SA 4.0


metrics = reload(metrics)

### Calcolo Metriche

In [17]:
# Directed graph filled from the name-level edge list.
g = nx.DiGraph()
graphing.add_edges(g, citations_uni)
# The former `wg = g.copy()` was overwritten by the very next assignment, so the
# copy was discarded unused and has been dropped.
# NOTE(review): if the intent was to collapse a copy so that g stays untouched,
# pass g.copy() to edge_collapse instead — confirm whether edge_collapse mutates.
wg = graphing.edge_collapse(g, nx.DiGraph)

In [None]:
# Metrics of the plain graph (g) and of the edge-collapsed graph (wg).
# Later cells index the result with [0] and call .values() on that element.
gm  = metrics.calc_metrics(g)
wgm = metrics.calc_metrics(wg)

In [None]:
# Plot the distribution of the first metric of the plain graph.
# NOTE(review): the recorded output of this cell is
# "TypeError: cannot unpack non-iterable Figure object"; presumably raised
# inside metrics.plot_distribution — confirm and fix there.
metrics.plot_distribution(list(gm[0].values()), SESSION_PATH, "Degree Centrality Distribuition - Unweighted Graph")

TypeError: cannot unpack non-iterable Figure object

<Figure size 640x480 with 0 Axes>

In [None]:
# Values of the first metric computed for the plain graph.
# The unfinished trailing expression `gm[0].values().` was removed: it made the
# whole cell a SyntaxError.
list(gm[0].values())

[0.5339246188196289,
 0.5463003550174349,
 0.6643598948265719,
 0.44359985944288266,
 0.6573016223715751,
 0.5128996079231047,
 0.6469910086873151,
 0.5071189770961024,
 0.5530146610567553,
 0.44509723169585974,
 0.5271013010072375,
 0.5761910612873062,
 0.5161085731447299,
 0.5276632426927463,
 0.5788866919833872,
 0.636998869557009,
 0.48667465255240516,
 0.6297049893712419,
 0.6512475284813106,
 0.5409269089025093,
 0.6035952702997512,
 0.540336377342572,
 0.49494812164579605,
 0.5397471337467787,
 0.4964374339476389,
 0.5695605542529298,
 0.5322022813395656,
 0.6171422963164539,
 0.5362384849900281,
 0.5984862414096688,
 0.5204501804897961,
 0.5316306354949474,
 0.6194594764027485,
 0.525981000686287,
 0.4595618585383436,
 0.5287907282540556,
 0.5573740108623829,
 0.49693586510622095,
 0.5182702844458598,
 0.5663021986794005,
 0.49944310963248845,
 0.4852432565154864,
 0.4989396387558428,
 0.512368655948029,
 0.4656144135896482,
 0.5171871699538099,
 0.5299230424473191,
 0.44913622