# Progetto di social
## Analisi di un dataset
Analisi di un dataset di articoli del dipartimento DMIF di uniud reperito da scopus.

In [21]:
# Lista delle librerie da controllare
required_packages = ["pandas", "networkx", "pyvis", "matplotlib"]
import importlib
import subprocess
import sys
for package in required_packages:
    try:
        importlib.import_module(package)
        print(f"{package} già installato ✅")
    except ImportError:
        print(f"{package} non trovato. Installazione in corso...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"{package} installato correttamente")

pandas già installato ✅
networkx già installato ✅
pyvis già installato ✅
matplotlib già installato ✅


In [12]:
import pandas as pd
df=pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,Authors,Author full names,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,...,ISBN,CODEN,PubMed ID,Language of Original Document,Abbreviated Source Title,Document Type,Publication Stage,Open Access,Source,EID
0,"Lancia, G.; Dalpasso, M.","Lancia, Giuseppe G. (6701584197); Dalpasso, Ma...",6701584197; 6603897248,Speeding Up Floyd–Warshall’s Algorithm to Comp...,2025,Algorithms,18.0,9.0,560.0,,...,,,,English,Algorithms,Article,Final,All Open Access; Gold Open Access,Scopus,2-s2.0-105017372448
1,"Da Ros, F.; Di Gaspero, L.; Kletzander, L.; La...","Da Ros, Francesca (58134137500); Di Gaspero, L...",58134137500; 6505944235; 57194560339; 57188717...,Dynamic Temperature Control of Simulated Annea...,2025,,,,,184.0,...,9798400714658,,,English,GECCO - Proc. Genetic Evol. Comput. Conf.,Conference paper,Final,,Scopus,2-s2.0-105013077196
2,"Bacchetti, E.; de Nardin, A.; Giannarini, G.; ...","Bacchetti, Emiliano (59995162800); de Nardin, ...",59995162800; 57259162600; 55883317500; 2448058...,A Deep Learning Model Integrating Clinical and...,2025,Cancers,17.0,13.0,2257.0,,...,,,,English,Cancers,Article,Final,All Open Access; Gold Open Access; Green Accep...,Scopus,2-s2.0-105010643815
3,"Ozkilinc, O.; Soler, M.A.; Giannozzi, P.; Apar...","Ozkilinc, Ozge (58138235700); Soler, Miguel A....",58138235700; 8718855800; 7004488307; 572046647...,The Single-Parameter Bragg–Williams Model for ...,2025,International Journal of Molecular Sciences,26.0,3.0,997.0,,...,,,39940766.0,English,Int. J. Mol. Sci.,Article,Final,All Open Access; Gold Open Access; Green Accep...,Scopus,2-s2.0-85217742685
4,"Faletič, S.; Micheli, M.; Pospiec, G.","Faletič, Sergej (56595108900); Micheli, Marisa...",56595108900; 59664609000; 59665606800,Teaching and learning quantum entanglement: a ...,2025,Journal of Physics: Conference Series,2950.0,1.0,12025.0,,...,9788394593742; 9781628905861,,,English,J. Phys. Conf. Ser.,Conference paper,Final,All Open Access; Gold Open Access,Scopus,2-s2.0-85219573912


In [13]:
#Test per pulizia dataframe

#conta max val
num_articoli = df.shape[0]
# Conta valori nulli per colonna
null_counts = df.isnull().sum()
# Conta valori pari a 0 per colonna
zero_counts = (df == 0).sum()
print(f"Numero di articoli: {num_articoli}")
# Combina i due risultati in un DataFrame
missing_summary = pd.DataFrame({
    "null_values": null_counts,
    "zero_values": zero_counts
})
missing_summary


Numero di articoli: 143


Unnamed: 0,null_values,zero_values
Authors,0,0
Author full names,0,0
Author(s) ID,0,0
Title,0,0
Year,0,0
Source title,20,0
Volume,23,0
Issue,77,0
Art. No.,95,0
Page start,62,0


In [22]:
import pandas as pd

# Carica il dataset originale
df = pd.read_csv("dataset.csv")

# Soglia: un terzo dei valori
limite = len(df) / 3

# Lista delle colonne da eliminare
cols_to_drop = []

for col in df.columns:
    num_null = df[col].isna().sum()
    
    # conta gli zeri solo se la colonna è numerica
    num_zero = 0
    if pd.api.types.is_numeric_dtype(df[col]):
        num_zero = (df[col] == 0).sum()
    
    # Se la somma di nulli + zeri supera un terzo, togli la colonna
    if (num_null + num_zero) >= limite:
        cols_to_drop.append(col)

# Crea il nuovo dataset pulito
df_clean = df.drop(columns=cols_to_drop)

# Salva come nuovo file
df_clean.to_csv("nuovo_dataset.csv", index=False)

# Mostra le colonne rimosse
cols_to_drop


['Issue',
 'Art. No.',
 'Page start',
 'Page end',
 'Page count',
 'Molecular Sequence Numbers',
 'Chemicals/CAS',
 'Tradenames',
 'Manufacturers',
 'Funding Details',
 'Funding Texts',
 'Editors',
 'Sponsors',
 'Conference name',
 'Conference date',
 'Conference location',
 'Conference code',
 'ISBN',
 'CODEN',
 'PubMed ID',
 'Open Access']

In [26]:
print(df.columns)
df=pd.read_csv("nuovo_dataset.csv")
df


Index(['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year',
       'Source title', 'Volume', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Author Keywords',
       'Index Keywords', 'References', 'Correspondence Address', 'Publisher',
       'ISSN', 'Language of Original Document', 'Abbreviated Source Title',
       'Document Type', 'Publication Stage', 'Source', 'EID'],
      dtype='object')


Unnamed: 0,Authors,Author full names,Author(s) ID,Title,Year,Source title,Volume,Cited by,DOI,Link,...,References,Correspondence Address,Publisher,ISSN,Language of Original Document,Abbreviated Source Title,Document Type,Publication Stage,Source,EID
0,"Lancia, G.; Dalpasso, M.","Lancia, Giuseppe G. (6701584197); Dalpasso, Ma...",6701584197; 6603897248,Speeding Up Floyd–Warshall’s Algorithm to Comp...,2025,Algorithms,18,0,10.3390/a18090560,https://www.scopus.com/inward/record.uri?eid=2...,...,"Floyd, Robert W., Algorithm 97: Shortest path,...","G. Lancia; Department of Mathematics, Computer...",Multidisciplinary Digital Publishing Institute...,19994893,English,Algorithms,Article,Final,Scopus,2-s2.0-105017372448
1,"Da Ros, F.; Di Gaspero, L.; Kletzander, L.; La...","Da Ros, Francesca (58134137500); Di Gaspero, L...",58134137500; 6505944235; 57194560339; 57188717...,Dynamic Temperature Control of Simulated Annea...,2025,,,0,10.1145/3712256.3726390,https://www.scopus.com/inward/record.uri?eid=2...,...,"Adriaensen, Steven, Fair-share ILS: A simple s...",,"Association for Computing Machinery, Inc",,English,GECCO - Proc. Genetic Evol. Comput. Conf.,Conference paper,Final,Scopus,2-s2.0-105013077196
2,"Bacchetti, E.; de Nardin, A.; Giannarini, G.; ...","Bacchetti, Emiliano (59995162800); de Nardin, ...",59995162800; 57259162600; 55883317500; 2448058...,A Deep Learning Model Integrating Clinical and...,2025,Cancers,17,0,10.3390/cancers17132257,https://www.scopus.com/inward/record.uri?eid=2...,...,"Cornford, Philip A., EAU-EANM-ESTRO-ESUR-ISUP-...","G. Giannarini; Urology Unit, University Hospit...",Multidisciplinary Digital Publishing Institute...,20726694,English,Cancers,Article,Final,Scopus,2-s2.0-105010643815
3,"Ozkilinc, O.; Soler, M.A.; Giannozzi, P.; Apar...","Ozkilinc, Ozge (58138235700); Soler, Miguel A....",58138235700; 8718855800; 7004488307; 572046647...,The Single-Parameter Bragg–Williams Model for ...,2025,International Journal of Molecular Sciences,26,0,10.3390/ijms26030997,https://www.scopus.com/inward/record.uri?eid=2...,...,"Abbott, Andrew P., Deep Eutectic Solvents form...",F. Fogolari; Dipartimento di Scienze Matematic...,Multidisciplinary Digital Publishing Institute...,14220067; 16616596,English,Int. J. Mol. Sci.,Article,Final,Scopus,2-s2.0-85217742685
4,"Faletič, S.; Micheli, M.; Pospiec, G.","Faletič, Sergej (56595108900); Micheli, Marisa...",56595108900; 59664609000; 59665606800,Teaching and learning quantum entanglement: a ...,2025,Journal of Physics: Conference Series,2950,0,10.1088/1742-6596/2950/1/012025,https://www.scopus.com/inward/record.uri?eid=2...,...,"J Phys Conf Ser, (2021); Qtedu Consortium 2021...","S. Faletič; University of Ljubljana, Faculty o...",Institute of Physics,17426588; 17426596,English,J. Phys. Conf. Ser.,Conference paper,Final,Scopus,2-s2.0-85219573912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,"Alpuente, M.; Ballis, D.; Sapiña, J.","Alpuente, María (6701736591); Ballis, Demis (8...",6701736591; 8672549500; 55608584000,Inferring Safe Maude Programs with ÁTAME,2018,Lecture Notes in Computer Science,10931 LNCS,2,10.1007/978-3-319-96418-8_1,https://www.scopus.com/inward/record.uri?eid=2...,...,"Alpuente, María, Exploring conditional rewriti...","D. Ballis; DMIF, University of Udine, Udine, V...",Springer Verlag service@springer.de,03029743; 16113349,English,Lect. Notes Comput. Sci.,Conference paper,Final,Scopus,2-s2.0-85050638409
139,"Dattolo, A.; de March, C.; Luccio, F.L.","Dattolo, Antonina (6602802183); de March, Chia...",6602802183; 57201154884; 7005244352,Usable and Accessible Tourism Websites for Chi...,2018,Lecture Notes of the Institute for Computer Sc...,233,2,10.1007/978-3-319-76111-4_20,https://www.scopus.com/inward/record.uri?eid=2...,...,I Leoni Del Tempo Tourism App for Children 201...,"A. Dattolo; SASWEB Lab, DMIF, University of Ud...",Springer Verlag service@springer.de,18678211; 1867822X,English,Lect. Notes Inst. Comput. Sci. Soc. Informatic...,Conference paper,Final,Scopus,2-s2.0-85043602058
140,"Altenkirch, T.; Chaudhuri, K.; Dowek, G.; Felt...","Altenkirch, Thorsten (22333517400); Chaudhuri,...",22333517400; 8984178700; 6602138388; 660248807...,Preface,2017,ACM International Conference Proceeding Series,Part F130531,0,,https://www.scopus.com/inward/record.uri?eid=2...,...,,,Association for Computing Machinery acmhelp@ac...,21531633,English,ACM Int. Conf. Proc. Ser.,Editorial,Final,Scopus,2-s2.0-85030486017
141,"Corbatto, M.","Corbatto, Marco (6507516644)",6507516644,Modeling and developing a learning design syst...,2017,,,3,10.1145/3099023.3099028,https://www.scopus.com/inward/record.uri?eid=2...,...,"Compendiumld Learning Design, (2017); Curricul...","M. Corbatto; SASWEB Lab DMIF, University of Ud...","Association for Computing Machinery, Inc acmhe...",,English,"UMAP - Adjun. Publ. Conf. User Model., Adapt. ...",Conference paper,Final,Scopus,2-s2.0-85026884064


In [15]:
#librerie
import pandas as pd
import networkx as nx
from pyvis.network import Network
import itertools
from collections import Counter
import matplotlib.pyplot as plt

In [27]:
df["authors_list"] = df["Authors"].str.split(",")
df["authors_list"] = df["authors_list"].apply(
    lambda x: [a.strip() for a in x] if isinstance(x, list) else x
)
df["authors_list"]



0                             [Lancia, G.; Dalpasso, M.]
1      [Da Ros, F.; Di Gaspero, L.; Kletzander, L.; L...
2      [Bacchetti, E.; de Nardin, A.; Giannarini, G.;...
3      [Ozkilinc, O.; Soler, M.A.; Giannozzi, P.; Apa...
4                [Faletič, S.; Micheli, M.; Pospiec, G.]
                             ...                        
138               [Alpuente, M.; Ballis, D.; Sapiña, J.]
139            [Dattolo, A.; de March, C.; Luccio, F.L.]
140    [Altenkirch, T.; Chaudhuri, K.; Dowek, G.; Fel...
141                                       [Corbatto, M.]
142         [Omodeo, E.G.; Policriti, A.; Tomescu, A.I.]
Name: authors_list, Length: 143, dtype: object

## Numero di articoli pubblicati dagli autori

In [28]:
all_authors = list(itertools.chain.from_iterable(df["authors_list"].dropna()))
pd.Series(all_authors).value_counts()


M.                    45
A.                    14
G.L.                  14
F.                     9
G.                     9
                      ..
R.; Foresti            1
A.; Girometti          1
C.S.; Sojakova         1
W.; Sacerdoti Coen     1
F.; Ricciotti          1
Name: count, Length: 580, dtype: int64

## Costruzione del grafo
Corretto con non dia in output nulla perchè è necessario vederlo come file html che si genera a fine documente

In [29]:
G = nx.Graph()

for authors in df["authors_list"].dropna():
    for a1, a2 in itertools.combinations(authors, 2):
        if G.has_edge(a1, a2):
            G[a1][a2]["weight"] += 1
        else:
            G.add_edge(a1, a2, weight=1)


## Calcolo delle misure di centralità
Spiaccicarci un po di teoria

In [30]:
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

pd.DataFrame({
    "degree": degree_centrality,
    "betweenness": betweenness_centrality,
    "closeness": closeness_centrality
}).sort_values("degree", ascending=False)


Unnamed: 0,degree,betweenness,closeness
M.,0.255613,0.362447,0.415752
M.; Gigli,0.186528,0.155681,0.370456
G.L.; Valente,0.162349,0.129741,0.369020
G.; Antelmi,0.145078,0.001093,0.294213
Z.,0.145078,0.001093,0.294213
...,...,...,...
Ganjali Koli,0.003454,0.000000,0.205011
M.; Lancia,0.003454,0.000000,0.232894
Stefanel,0.001727,0.000000,0.227878
T.,0.001727,0.000000,0.001727


## Creazione di grafi
Pezzo di codice usato per la visualizzazione

In [31]:
net = Network(height="750px", width="100%", notebook=True)
net.from_nx(G)
net.show("coauthors_graph.html")


coauthors_graph.html
