In [1]:
import os
import re
import chardet
import pandas as pd
import jieba
from collections import defaultdict

############################
# Configuration variables
############################
SOURCE_FILE = "Data/posts.csv"                 # Input CSV; its encoding is detected at run time (may not be UTF-8)
UTF8_FILE = "Data/posts_utf8.csv"              # UTF-8 copy/transcode of SOURCE_FILE that pandas reads
PREPROCESSED_FILE = "Data/posts_preprocessed.csv"  # Output: input rows plus the tokenized-text column
EDGE_LIST_FILE = "Data/weibo_cooccurrence_edges.csv"  # Output: Source, Target, Weight edge list

# Stopword lists merged into a single set; files that do not exist are
# skipped with a warning.
STOPWORDS_FILES = [
    "Stopwords/baidu_stopwords.txt",
    "Stopwords/cn_stopwords.txt",
    "Stopwords/hit_stopwords.txt",
    "Stopwords/scu_stopwords.txt"
]

POST_COL_NAME = "post"                         # Column that stores raw text
POST_TOKENS_COL_NAME = "post_tokens"           # New column holding the space-joined tokens
WINDOW_SIZE = 5                                # Co-occurrence window, anchor token included:
                                               # each token pairs with up to WINDOW_SIZE - 1 following tokens

############################
# Function section
############################

def detect_encoding(file_path):
    """
    Guess the character encoding of a file with chardet.

    The whole file is read as bytes and passed to chardet.detect. The guess
    is printed and returned as an (encoding, confidence) tuple; encoding may
    be None when chardet cannot decide.

    Raises FileNotFoundError if file_path does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File does not exist: {file_path}")

    with open(file_path, 'rb') as raw_file:
        guess = chardet.detect(raw_file.read())

    encoding, confidence = guess["encoding"], guess["confidence"]
    print(f"[INFO] Detected encoding by chardet: {encoding}, confidence: {confidence}")
    return encoding, confidence


def convert_to_utf8(src_file, dst_file, src_encoding=None):
    """
    Decode src_file with src_encoding, then write it out as UTF-8 to dst_file.

    Parameters:
        src_file: path of the file to read (read as raw bytes).
        dst_file: path of the UTF-8 file to write (overwritten if present).
        src_encoding: codec used to decode src_file; gb18030 when None.

    Undecodable bytes are replaced with U+FFFD rather than aborting.
    Line endings are written exactly as they appear in the source.

    Returns True on success. On an I/O error or an unknown codec the error
    is printed and False is returned; no exception is raised for those.
    """
    if src_encoding is None:
        src_encoding = "gb18030"  # Broadest coverage for CJK encodings
    try:
        with open(src_file, "rb") as f_in:
            data = f_in.read()
        text = data.decode(src_encoding, errors="replace")
        # newline="" disables newline translation; without it a CRLF source
        # would be written as "\r\r\n" on Windows.
        with open(dst_file, "w", encoding="utf-8", newline="") as f_out:
            f_out.write(text)
    except (OSError, LookupError, UnicodeError) as e:
        # OSError: missing/unreadable/unwritable file; LookupError: unknown
        # codec name; UnicodeError: text that cannot be encoded as UTF-8.
        print(f"[ERROR] Transcoding failed: {e}")
        return False
    print(f"[INFO] Successfully transcoded '{src_file}' to UTF-8 => '{dst_file}'")
    return True


def load_stopwords(paths):
    """
    Merge multiple stopword files into one set and return it.

    Parameters:
        paths: iterable of file paths, one stopword per line.

    Each line is stripped of surrounding whitespace and blank lines are
    ignored. Paths that do not exist are skipped with a warning. The total
    number of distinct stopwords is printed before returning.

    Returns a set of stopword strings.
    """
    sw = set()
    for p in paths:
        if not os.path.exists(p):
            print(f"[WARNING] Stopword file not found: {p}")
            continue
        # utf-8-sig decodes plain UTF-8 too, but also drops a leading BOM;
        # otherwise the first stopword would be stored as "\ufeff<word>"
        # and never match any token.
        with open(p, "r", encoding="utf-8-sig", errors="replace") as f:
            sw.update(w for w in (line.strip() for line in f) if w)
    print(f"[INFO] Stopwords loaded. Total: {len(sw)}")
    return sw


def clean_text(text):
    """
    Clean one raw post and return the cleaned string.

    Steps, applied in order:
    1) Remove URLs. A URL is matched as http:// or https:// followed by
       printable ASCII only, so Chinese text written directly after a link
       (no space in between) is kept.
    2) Remove @username mentions. A username is matched as word characters
       (letters, digits, underscore, CJK) and hyphens, so the match stops at
       punctuation such as the colon in a repost chain like '//@name:text'.
    3) Remove the outer #...# symbols while keeping the inner text
       (If you want to delete the entire hashtag, use: text = re.sub(r'#[^#]*#', '', text))
    4) Keep only Chinese characters, English letters and digits;
       every run of other symbols is replaced with a single space.
    """
    # 1) Remove URLs (ASCII run only; a greedy non-whitespace match would
    #    also delete the Chinese text that follows the link)
    text = re.sub(r'https?://[\x21-\x7e]+', '', text)
    # 2) Remove @username (stop at the first non-username character)
    text = re.sub(r'@[\w\-]+', '', text)
    # 3) Strip hashtag shells but keep content
    text = re.sub(r'#([^#]+)#', r'\1', text)
    # 4) Keep CJK, alphabetic, and numeric chars; replace others with spaces
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]+', ' ', text)
    return text


def preprocess_text(text, stopwords):
    """
    Turn one raw post into a list of tokens.

    The post is passed through clean_text, segmented with jieba.lcut, and
    then filtered: whitespace-only tokens and tokens found in stopwords
    are dropped. Token order is preserved.
    """
    kept = []
    for token in jieba.lcut(clean_text(text)):
        if not token.strip():
            # Whitespace-only segment produced by the cleaning step
            continue
        if token in stopwords:
            continue
        kept.append(token)
    return kept


def build_cooccurrence_edge_list(list_of_token_lists, window_size, outfile):
    """
    Count windowed word co-occurrences and save them as a CSV edge list.

    Every token is paired with the tokens that follow it inside the window
    (the window includes the token itself, so at most window_size - 1
    followers). Tokens are stripped first; empty tokens and pairs of
    identical words are ignored. Counts are kept symmetrically, and only
    the Source < Target direction is written so each undirected edge
    appears once. Columns: Source, Target, Weight.
    """
    import csv

    adjacency = defaultdict(lambda: defaultdict(int))
    for post_tokens in list_of_token_lists:
        if not post_tokens:
            # Nothing to pair in an empty post
            continue
        n_tokens = len(post_tokens)
        for pos, raw_left in enumerate(post_tokens):
            left = raw_left.strip()
            if not left:
                continue
            for offset in range(1, window_size):
                other = pos + offset
                if other >= n_tokens:
                    break
                right = post_tokens[other].strip()
                if not right or right == left:
                    continue
                adjacency[left][right] += 1
                adjacency[right][left] += 1

    # Dump one row per undirected edge
    with open(outfile, "w", encoding="utf-8", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(["Source", "Target", "Weight"])
        out.writerows(
            [left, right, count]
            for left, linked in adjacency.items()
            for right, count in linked.items()
            if left < right
        )

    print(f"[INFO] Edge list saved to: {outfile}")


############################
# Main pipeline
############################
if __name__ == "__main__":
    # 1. Detect encoding of the original file
    if not os.path.exists(SOURCE_FILE):
        print(f"[ERROR] Source file not found: {SOURCE_FILE}")
        # SystemExit instead of exit(): exit() is an interactive helper
        # and is not guaranteed to exist outside the site/IPython setup.
        raise SystemExit(1)

    encoding, confidence = detect_encoding(SOURCE_FILE)
    normalized = encoding.lower() if encoding else None

    # 2. Transcode to UTF-8 (if not already UTF-8)
    if normalized is None:
        # Detection failed: still produce UTF8_FILE, using the gb18030 fallback.
        print("[WARNING] Encoding not detected. Defaulting to gb18030 for transcoding.")
        convert_to_utf8(SOURCE_FILE, UTF8_FILE, src_encoding="gb18030")
    elif normalized == "utf-8":
        print("[INFO] File is already UTF-8. Creating a direct copy.")
        import shutil
        shutil.copyfile(SOURCE_FILE, UTF8_FILE)
    elif normalized == "utf-8-sig":
        # UTF-8 with a byte-order mark: decoding it as gb18030 would garble
        # the text, so decode as utf-8-sig, which also drops the BOM.
        print("[INFO] File is UTF-8 with BOM. Rewriting without the BOM.")
        convert_to_utf8(SOURCE_FILE, UTF8_FILE, src_encoding="utf-8-sig")
    else:
        # NOTE(review): every other detected encoding is decoded as gb18030.
        # That is a superset of GB2312/GBK, but would be wrong for non-GB
        # encodings (e.g. Big5) — confirm the input is always GB-family.
        print(f"[INFO] File is likely {encoding}. Attempting gb18030 → UTF-8 transcoding...")
        convert_to_utf8(SOURCE_FILE, UTF8_FILE, src_encoding="gb18030")

    # 3. Load the UTF-8 file into pandas
    try:
        df = pd.read_csv(UTF8_FILE, encoding="utf-8")
    except Exception as e:
        print(f"[ERROR] Unable to read {UTF8_FILE} with UTF-8: {e}")
        raise SystemExit(1)

    # 4. Verify the target column exists
    if POST_COL_NAME not in df.columns:
        print(f"[ERROR] DataFrame does not contain column '{POST_COL_NAME}'. Please check column names.")
        print(f"Available columns: {df.columns.tolist()}")
        raise SystemExit(1)

    # 5. Load stopwords
    stopwords = load_stopwords(STOPWORDS_FILES)

    # 6. Preprocess and tokenize each post
    all_token_lists = []
    for text in df[POST_COL_NAME]:
        # Missing cells arrive as NaN; str(NaN) would inject a bogus "nan"
        # token into the vocabulary, so treat them as empty posts.
        text = "" if pd.isna(text) else str(text)
        all_token_lists.append(preprocess_text(text, stopwords))

    # 7. Store tokenization results back to DataFrame
    df[POST_TOKENS_COL_NAME] = [" ".join(toks) for toks in all_token_lists]
    df.to_csv(PREPROCESSED_FILE, index=False, encoding="utf-8")
    print(f"[INFO] Preprocessing complete. Saved to: {PREPROCESSED_FILE}")

    # 8. Build co-occurrence edge list
    build_cooccurrence_edge_list(all_token_lists, WINDOW_SIZE, EDGE_LIST_FILE)
    print("[INFO] Pipeline finished!")


Building prefix dict from the default dictionary ...


[INFO] Detected encoding by chardet: GB2312, confidence: 0.99
[INFO] File is likely GB2312. Attempting gb18030 → UTF-8 transcoding...
[INFO] Successfully transcoded 'Data/posts.csv' to UTF-8 => 'Data/posts_utf8.csv'
[INFO] Stopwords loaded. Total: 2323


Dumping model to file cache C:\Users\gyc\AppData\Local\Temp\jieba.cache
Loading model cost 0.405 seconds.
Prefix dict has been built successfully.


[INFO] Preprocessing complete. Saved to: Data/posts_preprocessed.csv
[INFO] Edge list saved to: Data/weibo_cooccurrence_edges.csv
[INFO] Pipeline finished!


In [2]:
"""
analyze_top100_edges.py
────────────────────────────────────────────────────────────
• Load an edge list   Source, Target, Weight   (CSV)
• Get the 100 nodes with the highest total Weight (frequency)
• Build the sub‑graph of those nodes
      – compute PMI for every edge
      – save the edge table sorted by PMI ↓
• Compute three node‑level metrics on that sub‑graph
      – degree_centrality     (un‑weighted)
      – strength_weight       (weighted degree = Σ Weight)
      – eigenvector_centrality   (edge weight = PMI)
      – save the node table sorted by eigenvector_centrality ↓
Outputs
───────
top100_freq.csv          word, freq
top100_edges_pmi.csv     Source, Target, Weight, PMI     (sorted by PMI ↓)
top100_centrality.csv    word, degree_centrality, strength_weight,
                         eigenvector_centrality          (sorted by eigencent ↓)
Dependencies :  pandas  networkx  numpy  (scipy optional but faster)
pip install pandas networkx numpy scipy
────────────────────────────────────────────────────────────
"""

import pandas as pd
import networkx as nx
import numpy as np
from math import log2

# ── configuration ─────────────────────────────────────────
EDGE_FILE = "Data/weibo_cooccurrence_edges.csv"   # input CSV with Source, Target, Weight columns
TOP_N     = 100                                   # number of highest-strength nodes kept for the sub-graph
OUT_FREQ  = "top100_freq.csv"                     # output: word, freq
OUT_EDGE  = "top100_edges_pmi.csv"                # output: Source, Target, Weight, PMI
OUT_CENT  = "top100_centrality.csv"               # output: per-word centrality metrics
# ──────────────────────────────────────────────────────────

print("\n[1] Loading edge list …")
# Source/Target hold arbitrary word tokens. With pandas defaults, tokens such
# as "nan", "null" or "NA" are parsed as missing values and purely numeric
# tokens may be parsed as numbers, which would merge or split graph nodes.
# Force both columns to str and disable the default NA string list.
df = pd.read_csv(
    EDGE_FILE,
    encoding="utf-8",
    dtype={"Source": str, "Target": str},
    keep_default_na=False,
)

# Fail early if the file is not a Source/Target/Weight edge list.
required_cols = {"Source", "Target", "Weight"}
if not required_cols.issubset(df.columns):
    raise KeyError(f"CSV must contain columns: {required_cols}")

# ── 2. frequency (node strength) ──────────────────────────
print("[2] Computing node frequencies …")
# Node strength = sum of the weights of every edge touching the node.
strength = {}
for src, dst, wt in zip(df["Source"], df["Target"], df["Weight"]):
    for node in (src, dst):
        strength[node] = strength.get(node, 0) + wt

# Rank nodes by strength (largest first) and keep the first TOP_N.
ranked = sorted(strength.items(), key=lambda pair: pair[1], reverse=True)
top_nodes = ranked[:TOP_N]

freq_table = pd.DataFrame(top_nodes, columns=["word", "freq"])
freq_table.to_csv(OUT_FREQ, index=False, encoding="utf-8")
print(f"    Top‑{TOP_N} freq table  →  {OUT_FREQ}")

# Set form of the selected words, used for membership/sub-graph selection.
top_set = {pair[0] for pair in top_nodes}

# ── 3. build sub‑graph and compute PMI ────────────────────
print(f"[3] Building Top‑{TOP_N} sub‑graph …")
# Build the full undirected graph first; repeated (Source, Target) rows are
# merged by summing their weights. Columns are iterated directly because
# DataFrame.iterrows builds a Series per row and is very slow on large
# edge lists.
G = nx.Graph()
for s, t, w in zip(df["Source"], df["Target"], df["Weight"]):
    if G.has_edge(s, t):
        G[s][t]["weight"] += w          # accumulate duplicates
    else:
        G.add_edge(s, t, weight=w)

# Induced sub-graph on the selected top nodes (copied so it can be annotated).
H = G.subgraph(top_set).copy()
print(f"    Sub‑graph: {H.number_of_nodes()} nodes, {H.number_of_edges()} edges")

# total edge weight in sub‑graph
T = sum(d["weight"] for _, _, d in H.edges(data=True))
# node frequencies restricted to sub‑graph
f = {n: sum(d["weight"] for _, _, d in H.edges(n, data=True)) for n in H.nodes}

# compute PMI per edge: log2( w_uv * T / (f_u * f_v) ); edges with a
# non-positive weight get PMI 0 instead of an undefined logarithm
for u, v, d in H.edges(data=True):
    w_uv = d["weight"]
    pmi  = log2((w_uv * T) / (f[u] * f[v])) if w_uv > 0 else 0.0
    d["pmi"] = pmi

# positive shift if negative PMI exists (for algorithms needing ≥0).
# default=0.0 keeps this from raising ValueError when the sub-graph has
# no edges at all (e.g. an empty edge list).
min_pmi = min((d["pmi"] for _, _, d in H.edges(data=True)), default=0.0)
if min_pmi < 0:
    shift = abs(min_pmi) + 1e-6         # smallest shifted value becomes 1e-6
    for _, _, d in H.edges(data=True):
        d["pmi_pos"] = d["pmi"] + shift
    pmi_key = "pmi_pos"
else:
    pmi_key = "pmi"

# save edge list sorted by PMI (the raw, unshifted value is what is exported)
edges_sorted = sorted(
    ((u, v, d["weight"], d["pmi"]) for u, v, d in H.edges(data=True)),
    key=lambda x: x[3], reverse=True
)
pd.DataFrame(edges_sorted,
             columns=["Source", "Target", "Weight", "PMI"])\
  .to_csv(OUT_EDGE, index=False, encoding="utf-8")
print(f"    Edge list + PMI  →  {OUT_EDGE}")

# ── 4. centrality metrics ─────────────────────────────────
print("[4] Computing centrality metrics …")
deg_cent = nx.degree_centrality(H)
# Weighted degree of each sub-graph node, taken from the per-node sums in f.
strength_weight = {}
for node in H.nodes:
    strength_weight[node] = f[node]

# eigenvector centrality : prefer SciPy backend if available
try:
    import scipy  # noqa: F401
    eig_cent = nx.eigenvector_centrality_numpy(H, weight=pmi_key)
except ImportError:
    # Power-iteration fallback when SciPy is not installed.
    eig_cent = nx.eigenvector_centrality(H, weight=pmi_key,
                                         max_iter=500, tol=1e-6)

# One record per selected word, combining the three metrics.
rows = []
for word in top_set:
    rows.append({
        "word": word,
        "degree_centrality": deg_cent[word],
        "strength_weight": strength_weight[word],
        "eigenvector_centrality": eig_cent[word],
    })

centrality_table = pd.DataFrame(rows)
centrality_table = centrality_table.sort_values("eigenvector_centrality",
                                                ascending=False)
centrality_table.to_csv(OUT_CENT, index=False, encoding="utf-8")
print(f"    Centrality table  →  {OUT_CENT}\n✓ Done.")



[1] Loading edge list …
[2] Computing node frequencies …
    Top‑100 freq table  →  top100_freq.csv
[3] Building Top‑100 sub‑graph …
    Sub‑graph: 100 nodes, 2599 edges
    Edge list + PMI  →  top100_edges_pmi.csv
[4] Computing centrality metrics …
    Centrality table  →  top100_centrality.csv
✓ Done.
