In [2]:
import pandas as pd
import json

In [3]:
# Article feature table, stored on disk as a JSON list of records.
df_wikidata_articles = pd.read_json("data/wikidata_ready4net.json", orient='records')

# Lookup whose keys are article titles and whose values are the IDs later used as graph nodes.
# The encoding is pinned to UTF-8 (the JSON interchange encoding, and what pd.read_json
# already assumes) so titles with non-ASCII characters do not depend on the OS locale.
with open("data/article_TitleID_dict.json", "r", encoding="utf-8") as f:
    article_TitleID_dict = json.load(f)

***
## Network

In [3]:
import networkx as nx
import igraph as ig

In [4]:
df_edges = pd.read_csv('data/wikipedia_edges_filtered.csv')

# Create the set of valid article names
valid_articles = set(article_TitleID_dict.keys())

# Flag which sources and targets are present in the valid set
source_is_valid = df_edges["source"].isin(valid_articles)
target_is_valid = df_edges["target"].isin(valid_articles)

# Rows with an unknown endpoint, kept around for inspection
invalid_sources = df_edges[~source_is_valid]
invalid_targets = df_edges[~target_is_valid]

# Drop rows with an unknown endpoint BEFORE mapping: Series.map() turns every
# title missing from the dictionary into NaN, and a NaN endpoint would end up
# as a bogus node in the graph (and force the ID columns to float).
df_edges = df_edges[source_is_valid & target_is_valid].reset_index(drop=True)

# Replace article titles by their IDs
df_edges["source"] = df_edges["source"].map(article_TitleID_dict)
df_edges["target"] = df_edges["target"].map(article_TitleID_dict)

# (source_id, target_id) pairs, one per remaining edge row
list_edges = list(zip(df_edges["source"], df_edges["target"]))

In [5]:
# Directed link graph: one node per article ID, one edge per (source, target) pair.
G_wiki = nx.DiGraph(list_edges)

In [6]:
# igraph copy of the link graph; the centrality metrics below are computed on it.
# NOTE(review): from_networkx is documented to keep the original NetworkX node
# labels in the "_nx_name" vertex attribute — confirm for the installed version.
ig_graph = ig.Graph.from_networkx(G_wiki)

def node_reciprocity(graph):
    """Return, for every vertex, the share of its out-links that are reciprocated.

    For a vertex ``v`` the value is
    ``(# successors of v that also link back to v) / (# successors of v)``.
    A vertex without any successor gets ``0``.

    Parameters
    ----------
    graph
        Graph object exposing ``vs`` (iterable of vertices with an ``index``
        attribute), ``successors(index)`` and ``are_adjacent(source, target)``.

    Returns
    -------
    list
        One reciprocity value per vertex, in the iteration order of ``graph.vs``.
    """
    def _reciprocated_share(vertex_id):
        targets = graph.successors(vertex_id)
        if not targets:
            # No outgoing edges: reciprocity is defined as 0 here.
            return 0
        # Keep only the targets that have an edge pointing back at this vertex.
        linked_back = [t for t in targets if graph.are_adjacent(t, vertex_id)]
        return len(linked_back) / len(targets)

    return [_reciprocated_share(vertex.index) for vertex in graph.vs]

# One row of network metrics per igraph vertex.
#
# Every igraph call below returns its values in igraph *vertex order*, which
# follows the node order of G_wiki (order of first appearance in the edge list),
# not the row order of df_wikidata_articles. The ID column therefore has to come
# from the graph itself: Graph.from_networkx stores each original NetworkX node
# label in the "_nx_name" vertex attribute. Taking 'pageid' from
# df_wikidata_articles instead would pair metrics with the wrong articles (and
# fail outright whenever the article count differs from the vertex count).
df_wikinetmetrics = pd.DataFrame({
    'pageid': ig_graph.vs["_nx_name"],
    # Raw total degree (in + out); the column name is kept for downstream code.
    'degree_centrality': ig_graph.degree(),
    # No reset vector is passed, so this is called with igraph's defaults.
    'pagerank': ig_graph.personalized_pagerank(),
    'hub' : ig_graph.hub_score(),
    'authority' : ig_graph.authority_score(),
    'eigen' : ig_graph.eigenvector_centrality(directed=True),
    'reciprocity': node_reciprocity(ig_graph),
})

In [7]:
# Snapshot the in- and out-degree of every node before anything is removed
indegree = dict(G_wiki.in_degree())
outdegree = dict(G_wiki.out_degree())

# Nodes with at most one incoming link, and nodes with at most one outgoing link
nodes_to_remove_1 = [node for node, deg in indegree.items() if deg <= 1]
nodes_to_remove_2 = [node for node, deg in outdegree.items() if deg <= 1]

# Single pruning pass based on the snapshot above (degrees are not recomputed
# after removal); nodes listed twice are simply skipped the second time.
G_wiki.remove_nodes_from(nodes_to_remove_1 + nodes_to_remove_2)

In [8]:
# Article features joined with their network metrics; rows without a match on pageid are dropped.
df_wiki_fullfeatures = df_wikidata_articles.merge(df_wikinetmetrics, on="pageid", how="inner")

***

In [9]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# Inner join of article features and network metrics on pageid.
df_wiki_fullfeatures = df_wikidata_articles.merge(df_wikinetmetrics, on="pageid", how="inner")

# Numeric columns to scale. The first three numeric columns are skipped by position.
# NOTE(review): presumably these are ID/label columns that must stay unscaled —
# confirm, since the slice silently depends on column order.
numeric_columns = df_wiki_fullfeatures.select_dtypes(include='number').columns[3:]

def Frame_Scaler(scaler_label, Scaler):
    """Scale the numeric feature columns, attach all features to a graph copy and save it.

    Reads the notebook-level ``df_wiki_fullfeatures``, ``numeric_columns`` and
    ``G_wiki``; none of them is modified.

    Parameters
    ----------
    scaler_label
        Label inserted into the output file name ``data/G_wiki_{scaler_label}.graphml``.
    Scaler
        Scaler *instance* (not a class) providing ``fit_transform``.

    Returns
    -------
    tuple
        ``(graph, frame)``: the copy of ``G_wiki`` carrying the feature values as
        node attributes, and the copy of ``df_wiki_fullfeatures`` with its
        ``numeric_columns`` replaced by the scaled values.
    """
    scaled_frame = df_wiki_fullfeatures.copy()
    scaled_values = Scaler.fit_transform(scaled_frame[numeric_columns])
    scaled_frame[numeric_columns] = scaled_values

    # {pageid: {column: value, ...}} — pageid is assumed to match the node labels of G_wiki.
    per_node_attrs = scaled_frame.set_index("pageid").to_dict(orient="index")

    annotated_graph = G_wiki.copy()
    nx.set_node_attributes(annotated_graph, per_node_attrs)

    # Persist the annotated graph next to the other data files.
    nx.write_graphml(annotated_graph, f"data/G_wiki_{scaler_label}.graphml")

    return annotated_graph, scaled_frame

In [10]:
# One annotated graph + scaled frame per scaling strategy; each call also writes a GraphML file.
G_wiki_standard, df_wiki_standard = Frame_Scaler(scaler_label="standard", Scaler=StandardScaler())
G_wiki_robust, df_wiki_robust = Frame_Scaler(scaler_label="robust", Scaler=RobustScaler())
G_wiki_minmax, df_wiki_minmax = Frame_Scaler(scaler_label="minmax", Scaler=MinMaxScaler())

In [11]:
# Assortativity of the quality labels on the min-max scaled graph:
# categorical for QC_cat / QC_aggcat, numeric for QC_num.
qc_cat_assortativity = nx.attribute_assortativity_coefficient(G_wiki_minmax, "QC_cat")
print("Wiki Quality Labels:", qc_cat_assortativity)

qc_aggcat_assortativity = nx.attribute_assortativity_coefficient(G_wiki_minmax, "QC_aggcat")
print("Wiki Aggregated Labels:", qc_aggcat_assortativity)

qc_num_assortativity = nx.numeric_assortativity_coefficient(G_wiki_minmax, "QC_num")
print("Wiki Numeric Labels", qc_num_assortativity)

Wiki Quality Labels: 0.08986329017222558
Wiki Aggregated Labels: 0.17759418898709142
Wiki Numeric Labels 0.19018533585048822
