In [17]:
import xml.etree.ElementTree as ET  
from collections import defaultdict  
import re  
import json  

def parse_xml(xml_file, output_prefix):  
    meta = defaultdict(dict)  # Maps post IDs to metadata  
    text_file = open(f"{output_prefix}_text.tsv", "w")
    
    context = ET.iterparse(xml_file, events=("start",))  
    _, root = next(context)  # Skip root tag
    
    for event, elem in context:  
        if elem.tag == "row" and elem.attrib.get("PostTypeId") == "2":  # Filter answers  
            post_id = int(elem.attrib["Id"])  
            parent_id = int(elem.attrib.get("ParentId", -1))  
            score = int(elem.attrib.get("Score", 0))  
            body = elem.attrib["Body"]
            
            # Store metadata  
            meta[post_id] = {  
                "ParentId": parent_id,  
                "Score": score,  
                "Body": body  
            }  
            text_file.write(f"{post_id}\t{body}\n")  
            elem.clear()  # Free memory
    
    root.clear()  
    text_file.close()  
    with open(f"{output_prefix}_meta.json", "w") as f:  
        json.dump(meta, f)  

# Example usage:  
parse_xml("Posts.xml", "stack_data")
# 然后从 stack_data_meta.json 里重新加载 meta：
import json
with open("stack_data_meta.json") as f:
    meta = json.load(f)

In [18]:
from collections import defaultdict  
import numpy as np  

def create_balanced_labels(meta, num_questions=10000):  
    question_answers = defaultdict(list)  
    for aid, data in meta.items():  
        if data["ParentId"] != -1:  # Skip questions  
            question_answers[data["ParentId"]].append(aid)
    
    # Select top and bottom scoring answers per question  
    selected_aids = []  
    for qid, aids in list(question_answers.items())[:num_questions]:  
        if len(aids) < 2:  
            continue  
        scores = [meta[aid]["Score"] for aid in aids]  
        top_aid = aids[np.argmax(scores)]  
        bottom_aid = aids[np.argmin(scores)]  
        selected_aids.extend([top_aid, bottom_aid])
    
    # Create labels (Score > 0 as good)  
    X = [meta[aid]["Body"] for aid in selected_aids]  
    Y = np.array([meta[aid]["Score"] > 0 for aid in selected_aids])  
    return X, Y  

X, Y = create_balanced_labels(meta)  # meta from parse_xml  
print(f"Label distribution: {np.bincount(Y)}")  # Should be ~50% each

Label distribution: [ 34 182]


In [23]:
import re  
from nltk.tokenize import word_tokenize, sent_tokenize  
import nltk  
nltk.download("punkt")

def extract_features(text):  
    features = {}
    
    # HTML links (excluding those in code blocks)  
    link_re = re.compile(r'<a href="http://.*?">.*?</a>', re.IGNORECASE | re.DOTALL)  
    code_re = re.compile(r'<pre>(.*?)</pre>', re.DOTALL)  
    code_blocks = code_re.findall(text)  
    text_no_code = code_re.sub("", text)  
    links = link_re.findall(text_no_code)  
    features["link_count"] = len(links)
    
    # Code lines  
    code_lines = sum(len(block.split("\n")) for block in code_blocks)  
    features["code_lines"] = code_lines
    
    # Text complexity  
    text_clean = re.sub(r'<.*?>', "", text_no_code).strip()  # Remove HTML tags  
    tokens = word_tokenize(text_clean)  
    features["word_count"] = len(tokens)
    
    if tokens:  
        sentences = sent_tokenize(text_clean)  
        features["avg_sent_len"] = np.mean([len(word_tokenize(s)) for s in sentences])  
        features["avg_word_len"] = np.mean([len(w) for w in tokens])
    
    # Stylistic features  
    features["all_caps"] = sum(1 for w in tokens if w.isupper())  
    features["exclams"] = text_clean.count("!")
    
    return features  



[nltk_data] Downloading package punkt to /Users/muxiaohui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/muxiaohui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
from sklearn.feature_extraction import DictVectorizer  
from sklearn.preprocessing import StandardScaler  
from sklearn.pipeline import make_pipeline  

# Convert dict features to matrix  
features = [extract_features(text) for text in X] 
vec = DictVectorizer()  
X_matrix = vec.fit_transform(features)  

# Standardize features  
scaler = StandardScaler()  
X_standardized = scaler.fit_transform(X_matrix)  

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/muxiaohui/nltk_data'
    - '/opt/anaconda3/envs/.conda/nltk_data'
    - '/opt/anaconda3/envs/.conda/share/nltk_data'
    - '/opt/anaconda3/envs/.conda/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
