In [None]:

import pandas as pd
import spacy
from collections import Counter
import networkx as nx
import matplotlib.pyplot as plt

# Load the news articles and run spaCy NER over the "Content" column.
df = pd.read_excel("318NewsDataSet.xlsx")
nlp = spacy.load("en_core_web_sm")

# Entity labels kept for the analysis; every other label is ignored.
TARGET_ENTITY_LABELS = frozenset({"PERSON", "ORG", "GPE"})

# Missing cells become empty strings. A plain astype(str) would turn NaN into
# the literal text "nan" and feed that to the model.
article_texts = df["Content"].fillna("").astype(str).tolist()

# Only entity spans are read below, so skip components whose annotations are
# never used. Filtering against nlp.pipe_names keeps the list valid for
# pipelines that do not ship some of these components.
UNUSED_COMPONENTS = {"tagger", "parser", "attribute_ruler", "lemmatizer"}
unused_pipes = [name for name in nlp.pipe_names if name in UNUSED_COMPONENTS]

entity_list = []   # one list of entity strings per article, in row order
entity_types = []  # flat list of entity labels across all articles

for doc in nlp.pipe(article_texts, disable=unused_pipes):
    # Filter once and split into texts/labels so both stay aligned.
    kept = [
        (ent.text, ent.label_)
        for ent in doc.ents
        if ent.label_ in TARGET_ENTITY_LABELS
    ]
    entity_list.append([text for text, _ in kept])
    entity_types.extend(label for _, label in kept)

df["Entities"] = entity_list


In [None]:

from itertools import combinations
from collections import Counter

# Count, for every unordered pair of distinct entities, the number of
# articles in which both appear.
co_occurrence = Counter()
for article_entities in df["Entities"]:
    # De-duplicate within the article, then sort so that each pair is always
    # keyed in the same (alphabetical) order.
    ordered_unique = sorted(set(article_entities))
    co_occurrence.update(combinations(ordered_unique, 2))


In [None]:

# Build an undirected graph whose edges are the co-occurring entity pairs;
# each edge stores its co-occurrence count in the "weight" attribute.
G = nx.Graph()
G.add_weighted_edges_from(
    (first, second, count) for (first, second), count in co_occurrence.items()
)


In [None]:

# Draw the entity co-occurrence graph, save it as a PNG, and export the
# graph itself as GraphML.

# spring_layout starts from random positions; a fixed seed makes the saved
# figure reproducible across runs.
LAYOUT_SEED = 42

fig, ax = plt.subplots(figsize=(12, 12))
pos = nx.spring_layout(G, k=0.15, seed=LAYOUT_SEED)
nx.draw_networkx_nodes(G, pos, node_size=50, alpha=0.7, ax=ax)
nx.draw_networkx_edges(G, pos, width=0.5, alpha=0.4, ax=ax)
nx.draw_networkx_labels(G, pos, font_size=8, font_color='black', ax=ax)
ax.set_axis_off()
ax.set_title("Entity Co-Occurrence Network")
fig.savefig("entity_network.png", dpi=300)
plt.show()

nx.write_graphml(G, "entity_network.graphml")


In [None]:

# Bar chart of how often each named-entity label was collected.
label_counts = Counter(entity_types)

# zip(*...) cannot be unpacked into two names when the counter is empty, so
# fall back to two empty tuples in that case.
labels, counts = zip(*label_counts.items()) if label_counts else ((), ())

if labels:
    # Drawn with matplotlib directly: seaborn is not imported in this
    # notebook. The "Set2" colours are cycled in case there are more labels
    # than palette entries.
    set2_palette = plt.get_cmap("Set2").colors
    bar_colors = [set2_palette[i % len(set2_palette)] for i in range(len(labels))]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(list(labels), list(counts), color=bar_colors)
    ax.set_title("Frequency of Named Entity Types")
    ax.set_xlabel("Entity Label")
    ax.set_ylabel("Frequency")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig("entity_labels.png")
    plt.show()
else:
    print("No named entities were collected; skipping the entity-type chart.")


In [None]:

# Compute two centrality measures for every node of G and report the ten
# highest-scoring nodes for each.
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)

top_degree = sorted(
    degree_centrality.items(), key=lambda item: item[1], reverse=True
)[:10]
top_betweenness = sorted(
    betweenness_centrality.items(), key=lambda item: item[1], reverse=True
)[:10]

# One printing loop serves both rankings; the second heading carries the
# blank line that separates the two lists.
for heading, ranking in (
    ("Top 10 Nodes by Degree Centrality", top_degree),
    ("\nTop 10 Nodes by Betweenness Centrality", top_betweenness),
):
    print(heading)
    for node, score in ranking:
        print(f"{node}: {score:.3f}")


In [None]:

import csv

# Export both centrality scores for every node of G to a CSV file.
# The encoding is set explicitly because entity names may contain non-ASCII
# characters, and the platform default encoding is not guaranteed to be able
# to represent them.
with open("centrality_scores.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Entity", "DegreeCentrality", "BetweennessCentrality"])
    # Nodes missing from either score dict are written with a score of 0.
    writer.writerows(
        (
            node,
            degree_centrality.get(node, 0),
            betweenness_centrality.get(node, 0),
        )
        for node in G.nodes()
    )
