In [1]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Download the complete dataset (both train and test)
newsgroups = fetch_20newsgroups(subset='all', remove=())

# Create DataFrame
df = pd.DataFrame({
    'document_id': range(len(newsgroups.data)),
    'text': newsgroups.data,
    'true_label': [newsgroups.target_names[label] for label in newsgroups.target]
})

print(f"Loaded {len(df)} documents")
print(f"\nDataset shape: {df.shape}")
print(f"\nCategories ({len(df['true_label'].unique())}): {sorted(df['true_label'].unique())}")
print(f"\nClass distribution:")
print(df['true_label'].value_counts().sort_index())
print(f"\nFirst few rows:")
print(df.head())
print(f"\nSample text from first document:")
print(df['text'].iloc[0][:500])  # First 500 characters

Loaded 18846 documents

Dataset shape: (18846, 3)

Categories (20): ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

Class distribution:
true_label
alt.atheism                 799
comp.graphics               973
comp.os.ms-windows.misc     985
comp.sys.ibm.pc.hardware    982
comp.sys.mac.hardware       963
comp.windows.x              988
misc.forsale                975
rec.autos                   990
rec.motorcycles             996
rec.sport.baseball          994
rec.sport.hockey            999
sci.crypt                   991
sci.electronics             984
sci.med                     990
sci.space                   987
soc.religion.christian    

In [2]:
import re

print("="*70)
print("DATA EXPLORATION AND INCONSISTENCY CHECK")
print("="*70)

# 1. Basic statistics
print("\n1. BASIC STATISTICS:")
print(f"Total documents: {len(df)}")
print(f"Number of categories: {df['true_label'].nunique()}")
print(f"Categories: {sorted(df['true_label'].unique())}")

# 2. Check for null/empty values
print("\n2. NULL/EMPTY VALUES:")
print(df.isnull().sum())
print(f"Empty strings: {(df['text'].str.strip() == '').sum()}")

# 3. Text length distribution
print("\n3. TEXT LENGTH DISTRIBUTION:")
df['text_length'] = df['text'].str.len()
print(df['text_length'].describe())
print(f"Documents with < 50 characters: {(df['text_length'] < 50).sum()}")
print(f"Documents with < 100 characters: {(df['text_length'] < 100).sum()}")

# 4. Sample a few documents to see their structure
print("\n4. SAMPLE DOCUMENTS (First 3):")
for i in range(min(3, len(df))):
    print(f"\n--- Document {i} (Category: {df['true_label'].iloc[i]}) ---")
    print(f"Length: {df['text_length'].iloc[i]} characters")
    print(f"First 800 characters:")
    print(df['text'].iloc[i][:800])
    print("...")

# 5. Check for common patterns that need cleaning
print("\n5. PATTERNS DETECTED:")

# Check for email headers
has_headers = df['text'].str.contains(r'^(From|Subject|Organization):', case=False, regex=True, na=False)
print(f"Documents with email headers: {has_headers.sum()} ({has_headers.sum()/len(df)*100:.1f}%)")

# Check for quoted text
has_quotes = df['text'].str.contains(r'^>+', regex=True, flags=re.MULTILINE, na=False)
print(f"Documents with quoted text (>): {has_quotes.sum()} ({has_quotes.sum()/len(df)*100:.1f}%)")

# Check for email addresses
has_emails = df['text'].str.contains(r'\S+@\S+', regex=True, na=False)
print(f"Documents with email addresses: {has_emails.sum()} ({has_emails.sum()/len(df)*100:.1f}%)")

# Check for URLs
has_urls = df['text'].str.contains(r'http\S+|www\.\S+', regex=True, na=False)
print(f"Documents with URLs: {has_urls.sum()} ({has_urls.sum()/len(df)*100:.1f}%)")

# Check for excessive whitespace
has_excess_space = df['text'].str.contains(r'\s{3,}', regex=True, na=False)
print(f"Documents with excessive whitespace: {has_excess_space.sum()} ({has_excess_space.sum()/len(df)*100:.1f}%)")

# Check for special characters
has_special_chars = df['text'].str.contains(r'[^\w\s.,!?;:\'"()-]', regex=True, na=False)
print(f"Documents with special characters: {has_special_chars.sum()} ({has_special_chars.sum()/len(df)*100:.1f}%)")

# 6. Class balance
print("\n6. CLASS DISTRIBUTION:")
class_counts = df['true_label'].value_counts().sort_index()
print(class_counts)
print(f"\nMost common: {class_counts.max()} documents")
print(f"Least common: {class_counts.min()} documents")
print(f"Balance ratio (max/min): {class_counts.max()/class_counts.min():.2f}")

print("\n" + "="*70)
print("EXPLORATION COMPLETE")
print("="*70)

DATA EXPLORATION AND INCONSISTENCY CHECK

1. BASIC STATISTICS:
Total documents: 18846
Number of categories: 20
Categories: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

2. NULL/EMPTY VALUES:
document_id    0
text           0
true_label     0
dtype: int64
Empty strings: 0

3. TEXT LENGTH DISTRIBUTION:
count     18846.000000
mean       1902.525894
std        3984.970264
min         115.000000
25%         751.000000
50%        1175.000000
75%        1874.750000
max      160616.000000
Name: text_length, dtype: float64
Documents with < 50 characters: 0
Documents with < 100 characters: 0

4. SAMPLE DOCUMENTS (First 3):

--- Document 0 (Category: rec

  has_headers = df['text'].str.contains(r'^(From|Subject|Organization):', case=False, regex=True, na=False)


Documents with URLs: 3 (0.0%)
Documents with excessive whitespace: 15945 (84.6%)
Documents with special characters: 18843 (100.0%)

6. CLASS DISTRIBUTION:
true_label
alt.atheism                 799
comp.graphics               973
comp.os.ms-windows.misc     985
comp.sys.ibm.pc.hardware    982
comp.sys.mac.hardware       963
comp.windows.x              988
misc.forsale                975
rec.autos                   990
rec.motorcycles             996
rec.sport.baseball          994
rec.sport.hockey            999
sci.crypt                   991
sci.electronics             984
sci.med                     990
sci.space                   987
soc.religion.christian      997
talk.politics.guns          910
talk.politics.mideast       940
talk.politics.misc          775
talk.religion.misc          628
Name: count, dtype: int64

Most common: 999 documents
Least common: 628 documents
Balance ratio (max/min): 1.59

EXPLORATION COMPLETE


In [3]:
import re

def clean_newsgroup_text(text):
    """Clean newsgroup text based on identified patterns."""
    
    # 1. Remove email headers (lines starting with common patterns)
    # Common headers: From:, Subject:, Organization:, Lines:, NNTP-Posting-Host:, etc.
    lines = text.split('\n')
    cleaned_lines = []
    in_header = True
    
    for line in lines:
        # Skip header lines (lines with key: value format at the start)
        if in_header and re.match(r'^[\w-]+:', line):
            continue
        elif in_header and line.strip() == '':
            # Empty line often marks end of headers
            in_header = False
            continue
        else:
            in_header = False
            cleaned_lines.append(line)
    
    text = '\n'.join(cleaned_lines)
    
    # 2. Remove quoted text (lines starting with > or >>)
    text = re.sub(r'^>+.*$', '', text, flags=re.MULTILINE)
    
    # 3. Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # 4. Remove URLs (minimal presence but good to clean)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    
    # 5. Convert to lowercase
    text = text.lower()
    
    # 6. Normalize whitespace (replace multiple spaces/newlines with single space)
    text = re.sub(r'\s+', ' ', text)
    
    # 7. Strip leading/trailing whitespace
    text = text.strip()
    
    return text

# Apply cleaning
print("Applying cleaning to all documents...")
df['text_cleaned'] = df['text'].apply(clean_newsgroup_text)

# Check results
print("\n" + "="*70)
print("CLEANING RESULTS")
print("="*70)

# Calculate new text lengths
df['cleaned_length'] = df['text_cleaned'].str.len()

print("\nText length comparison:")
print("Original:")
print(df['text_length'].describe())
print("\nCleaned:")
print(df['cleaned_length'].describe())

# Check for very short documents after cleaning
print(f"\nDocuments with < 50 characters after cleaning: {(df['cleaned_length'] < 50).sum()}")
print(f"Documents with < 100 characters after cleaning: {(df['cleaned_length'] < 100).sum()}")

# Show before/after examples
print("\n" + "="*70)
print("BEFORE/AFTER EXAMPLES")
print("="*70)

for i in [0, 1, 2]:
    print(f"\n--- Document {i} (Category: {df['true_label'].iloc[i]}) ---")
    print(f"\nORIGINAL (first 400 chars):")
    print(df['text'].iloc[i][:400])
    print(f"\nCLEANED (first 400 chars):")
    print(df['text_cleaned'].iloc[i][:400])
    print(f"\nLength: {df['text_length'].iloc[i]} → {df['cleaned_length'].iloc[i]}")

# Filter out very short documents (optional - set threshold)
min_length = 50
df_final = df[df['cleaned_length'] >= min_length].copy()

# Reset document IDs and keep only necessary columns
df_final = df_final[['true_label', 'text_cleaned']].reset_index(drop=True)
df_final['document_id'] = range(len(df_final))
df_final = df_final[['document_id', 'text_cleaned', 'true_label']]
df_final.rename(columns={'text_cleaned': 'text'}, inplace=True)

print("\n" + "="*70)
print("FINAL DATASET")
print("="*70)
print(f"Original documents: {len(df)}")
print(f"After cleaning and filtering (min_length={min_length}): {len(df_final)}")
print(f"Removed: {len(df) - len(df_final)} documents")
print(f"\nFinal shape: {df_final.shape}")
print(f"\nSample of final dataset:")
print(df_final.head())

Applying cleaning to all documents...

CLEANING RESULTS

Text length comparison:
Original:
count     18846.000000
mean       1902.525894
std        3984.970264
min         115.000000
25%         751.000000
50%        1175.000000
75%        1874.750000
max      160616.000000
Name: text_length, dtype: float64

Cleaned:
count    18846.000000
mean      1262.492041
std       3367.285109
min          0.000000
25%        359.000000
50%        627.500000
75%       1143.750000
max      82312.000000
Name: cleaned_length, dtype: float64

Documents with < 50 characters after cleaning: 164
Documents with < 100 characters after cleaning: 533

BEFORE/AFTER EXAMPLES

--- Document 0 (Category: rec.sport.hockey) ---

ORIGINAL (first 400 chars):
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack


In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast and efficient
# Or: model = SentenceTransformer('all-mpnet-base-v2')  # More accurate but slower

# Generate embeddings for all documents
embeddings = model.encode(
    df['text'].tolist(),
    show_progress_bar=True,
    batch_size=32,
    convert_to_numpy=True,
    device='cpu'  
)

# Save embeddings
np.save('document_embeddings.npy', embeddings)

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

# Load your embeddings (if saved)
embeddings = np.load('document_embeddings.npy')

# Choose number of clusters (k)
# Common range: 200-500, or you could use k = number of categories × multiplier
k = 100  # Adjust based on your dataset size

# Run k-means clustering
print(f"Running k-means with k={k} clusters...")
kmeans = KMeans(
    n_clusters=k,
    random_state=42,
    n_init=10,
    max_iter=300
)
cluster_labels = kmeans.fit_predict(embeddings)

# Add cluster assignments to your dataframe
df['cluster'] = cluster_labels

# Select representative sample: document closest to each centroid
sampled_indices = []

for cluster_id in range(k):
    # Get all documents in this cluster
    cluster_mask = cluster_labels == cluster_id
    cluster_indices = np.where(cluster_mask)[0]
    
    if len(cluster_indices) == 0:
        continue
    
    # Get embeddings for this cluster
    cluster_embeddings = embeddings[cluster_indices]
    centroid = kmeans.cluster_centers_[cluster_id]
    
    # Find document closest to centroid
    distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
    closest_idx = cluster_indices[np.argmin(distances)]
    
    sampled_indices.append(closest_idx)

# Create sampled subset
sampled_df = df.iloc[sampled_indices].copy()

print(f"Original dataset size: {len(df)}")
print(f"Sampled dataset size: {len(sampled_df)}")
print(f"Reduction: {len(sampled_df)/len(df)*100:.1f}%")

# Save the sampled subset
sampled_df.to_csv('sampled_documents_lts.csv', index=False)

Running k-means with k=100 clusters...
Original dataset size: 18846
Sampled dataset size: 100
Reduction: 0.5%


: 

: 

: 

In [None]:
# LLM Labelling starts here
import pandas as pd
sampled_df = pd.read_csv('sampled_documents_lts.csv')
sampled_df.head()

Unnamed: 0,document_id,text,true_label,text_length,text_cleaned,cleaned_length,cluster
0,12800,From: 02106@ravel.udel.edu (Samuel Ross)\nSubj...,misc.forsale,948,someone please buy these books!!!!! i am not a...,696,0
1,10497,From: cdt@sw.stratus.com (C. D. Tavares)\nSubj...,talk.politics.guns,1831,"in article () writes: hey, joe -- assuming you...",578,1
2,9913,From: drozinst@db.erau.edu (Drozinski Tim)\nSu...,rec.sport.hockey,2303,(joseph b stiehm) writes: as if an aluminum st...,869,2
3,6094,From: sfp@lemur.cit.cornell.edu (Sheila Patter...,soc.religion.christian,2419,in article (anni dozier) writes: |> after read...,2155,3
4,10250,From: glang@slee01.srl.ford.com (Gordon Lang)\...,comp.sys.ibm.pc.hardware,1331,volker voecking wrote: : : hello : : i have pr...,1069,4


: 

: 

: 

In [None]:
# We chose OpenAI, a cloud LLM, to assist us with the LLM labelling
# We retrieved the API key from the link: https://platform.openai.com/api-keys
import sys

# Install/upgrade the OpenAI Python client in the SAME environment Jupyter is using
!{sys.executable} -m pip install --quiet "openai>=1.0.0"


: 

: 

: 

In [None]:
# OpenAI couldn't originally be founded, so I was trying to identify the path for the active environment

import sys
print(sys.executable)


/opt/conda/bin/python


: 

: 

: 

In [None]:
!pip show openai


Name: openai
Version: 2.9.0
Summary: The official Python library for the openai API
Home-page: https://github.com/openai/openai-python
Author: 
Author-email: OpenAI <support@openai.com>
License: Apache-2.0
Location: /home/kgt238_nyu_edu/.local/lib/python3.11/site-packages
Requires: anyio, distro, httpx, jiter, pydantic, sniffio, tqdm, typing-extensions
Required-by: 


: 

: 

: 

In [None]:
# Tryingt to get Jupyter-notebook to use the OpenAI LLM model
import site, sys

# Make sure Python can see packages installed in ~/.local/lib/python3.11/site-packages
user_site = site.getusersitepackages()
if user_site not in sys.path:
    sys.path.append(user_site)

print("User site-packages:", user_site)
print("openai in path?", any("openai" in p for p in sys.path))


User site-packages: /home/kgt238_nyu_edu/.local/lib/python3.11/site-packages
openai in path? False


: 

: 

: 

In [None]:
# Testing the import, since it outputs: OpenAI version: 2.9.0, it means that it works now

import openai
print("OpenAI version:", openai.__version__)


OpenAI version: 2.9.0


: 

: 

: 

In [None]:
import os
from openai import OpenAI

os.environ["OPENAI_API_KEY"] = "INSERT KEY HERE"

client = OpenAI()

api_key = os.getenv("OPENAI_API_KEY")
api_url = "https://api.openai.com/v1/chat/completions"


: 

: 

: 

In [None]:
import time
import json
import requests

def classify_text_openai(text: str, retries: int = 3, delay: float = 2.0) -> dict:
    # Clean and truncate extremely long docs
    text = text.replace("\r", " ").replace("\n", " ")
    text = text[:1000]  # optional: truncate to 1000 chars

    prompt = f"""You are a text annotator. Classify the document below.

DOCUMENT:
{text}

Return ONLY a single JSON object, no extra text, in this exact format:
{{
  "topic": "...",
  "intent": "...",
  "sentiment": "..."
}}"""

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    body = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
    }

    for attempt in range(1, retries + 1):
        resp = requests.post(api_url, headers=headers, json=body)

        # If HTTP-level error (e.g. quota, rate limit)
        if resp.status_code != 200:
            return {
                "topic": "Personal / Culture / Other",
                "intent": "Other",
                "sentiment": "Neutral",
                "error": f"HTTP {resp.status_code}: {resp.text[:200]}",
            }

        try:
            data = resp.json()
            content = data["choices"][0]["message"]["content"].strip()
        except Exception as e:
            # Couldn’t parse the API JSON wrapper
            return {
                "topic": "Personal / Culture / Other",
                "intent": "Other",
                "sentiment": "Neutral",
                "error": f"API JSON parse error: {e}; raw={resp.text[:200]}",
            }

        # Try to parse the model’s message as JSON
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            # Try to salvage JSON substring if the model wrapped it in text
            start = content.find("{")
            end = content.rfind("}")
            if start != -1 and end != -1:
                try:
                    parsed = json.loads(content[start:end+1])
                except json.JSONDecodeError:
                    if attempt < retries:
                        print(f"Bad JSON, retrying ({attempt}/{retries})...")
                        time.sleep(delay)
                        continue
                    return {
                        "topic": "Personal / Culture / Other",
                        "intent": "Other",
                        "sentiment": "Neutral",
                        "error": f"Could not parse JSON: {content[:200]}",
                    }
            else:
                if attempt < retries:
                    print(f"No JSON braces, retrying ({attempt}/{retries})...")
                    time.sleep(delay)
                    continue
                return {
                    "topic": "Personal / Culture / Other",
                    "intent": "Other",
                    "sentiment": "Neutral",
                    "error": f"No JSON object in: {content[:200]}",
                }

        # Checking to maake sure that the keys exist
        if all(k in parsed for k in ("topic", "intent", "sentiment")):
            return parsed
        else:
            if attempt < retries:
                print(f"Missing keys, retrying ({attempt}/{retries})...")
                time.sleep(delay)
                continue
            return {
                "topic": "Personal / Culture / Other",
                "intent": "Other",
                "sentiment": "Neutral",
                "error": f"Missing keys in parsed JSON: {parsed}",
            }


: 

: 

: 

In [None]:
labels_cloud = []
error_count_cloud = 0

texts = sampled_df["text"].astype(str).tolist()
# texts = sampled_df["text"].astype(str).tolist()[:5] # testing with 5 entries first

for i, text in enumerate(texts):
    print(f"Labelling {i+1}/{len(texts)}...", end="\r")
    result = classify_text_openai(text)
    if "error" in result:
        error_count_cloud += 1
        print(result["error"])
    labels_cloud.append(result)

print(f"\nDone! Labeled: {len(labels_cloud)}, Errors: {error_count_cloud}")


Labelling 100/100...
Done! Labeled: 100, Errors: 0


: 

: 

: 

In [None]:
# Save the labeled dataset as a separate csv file

sampled_df["topic"] = [r.get("topic", "Personal / Culture / Other") for r in labels_cloud]
sampled_df["intent"] = [r.get("intent", "Other") for r in labels_cloud]
sampled_df["sentiment"] = [r.get("sentiment", "Neutral") for r in labels_cloud]

# Adding the labels to the last columns
ordered_cols = [
    "document_id",
    "text",
    "true_label",
    "text_length",
    "text_cleaned",
    "cleaned_length",
    "cluster",
    "topic",
    "intent",
    "sentiment",
]

sampled_df = sampled_df[ordered_cols]

# Save the output file
output_path = "sampled_documents_lts_labeled_openai.csv"
sampled_df.to_csv(output_path, index=False)

print(f"Saved labeled data to: {output_path}")
sampled_df.head()

Saved labeled data to: sampled_documents_lts_labeled_openai.csv


Unnamed: 0,document_id,text,true_label,text_length,text_cleaned,cleaned_length,cluster,topic,intent,sentiment
0,12800,From: 02106@ravel.udel.edu (Samuel Ross)\nSubj...,misc.forsale,948,someone please buy these books!!!!! i am not a...,696,0,Book Sale,Sell used textbooks,Desperate
1,10497,From: cdt@sw.stratus.com (C. D. Tavares)\nSubj...,talk.politics.guns,1831,"in article () writes: hey, joe -- assuming you...",578,1,Political Accountability,Defend or criticize the actions of President C...,Mixed
2,9913,From: drozinst@db.erau.edu (Drozinski Tim)\nSu...,rec.sport.hockey,2303,(joseph b stiehm) writes: as if an aluminum st...,869,2,Hockey Violence and Player Conduct,Expressing disapproval of a player's behavior ...,Negative
3,6094,From: sfp@lemur.cit.cornell.edu (Sheila Patter...,soc.religion.christian,2419,in article (anni dozier) writes: |> after read...,2155,3,Discussion on the nature of a religious newsgroup,Express dissatisfaction with the group's dynam...,Frustrated
4,10250,From: glang@slee01.srl.ford.com (Gordon Lang)\...,comp.sys.ibm.pc.hardware,1331,volker voecking wrote: : : hello : : i have pr...,1069,4,IDE hard disk configuration,seeking technical assistance,neutral


: 

: 

: 

In [None]:
# Setup: load full data, labeled data, and embeddings

import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC

EMB_PATH = "document_embeddings.npy"
LABELED_CSV_PATH = "sampled_documents_lts_labeled_openai.csv"  # OpenAI-labeled subset
FULL_CSV_PATH = "sampled_documents_lts.csv"                    # full dataset

# Load data
embeddings = np.load(EMB_PATH)
labeled_df = pd.read_csv(LABELED_CSV_PATH)
full_df = pd.read_csv(FULL_CSV_PATH)

print("Embeddings shape:", embeddings.shape)
print("Labeled subset shape:", labeled_df.shape)
print("Full dataset shape:", full_df.shape)

# Training set = docs with OpenAI labels
train_ids = labeled_df["document_id"].values
X_train = embeddings[train_ids]

# Test set = entire data - labeled data
test_df = full_df[~full_df["document_id"].isin(train_ids)].copy()
test_ids = test_df["document_id"].values
X_test = embeddings[test_ids]

print("\n=== Dataset stats ===")
print("Training docs (OpenAI-labeled):", X_train.shape[0])
print("Test docs (unlabeled):        ", X_test.shape[0])

print("\nSample labeled rows (training data):")
display(labeled_df.head())

print("\nSample unlabeled rows (test data):")
display(test_df.head())


In [None]:
def train_and_predict_openai_label(target_column: str):
    """
    Train on ALL OpenAI-labeled rows (no split),
    then predict this label for ALL unlabeled rows.
    """
    print("\n" + "="*80)
    print(f"Processing target: {target_column}")
    print("="*80)

    if target_column not in labeled_df.columns:
        print(f"Column '{target_column}' not found in labeled_df, skipping.")
        return None

    # Training labels from OpenAI-labeled subset
    y_train = labeled_df[target_column].values

    print("Training size:", len(y_train))
    print("Unique labels:", labeled_df[target_column].nunique())
    print("\nLabel distribution (top 10):")
    print(labeled_df[target_column].value_counts().head(10))

    # Simple, fast classifier
    model = LinearSVC()

    print("\nFitting model on ALL OpenAI-labeled data...")
    model.fit(X_train, y_train)

    print("Predicting for ALL unlabeled documents...")
    test_preds = model.predict(X_test)

    preds_df = pd.DataFrame(
        {
            "document_id": test_ids,
            f"predicted_{target_column}": test_preds,
        }
    )

    print(f"\nPrediction distribution on test set (top 10) for {target_column}:")
    print(preds_df[f"predicted_{target_column}"].value_counts().head(10))

    print("\nSample predictions with text:")
    joined = (
        test_df.merge(preds_df, on="document_id", how="inner")
        .loc[:, ["document_id", "text", f"predicted_{target_column}"]]
        .head(5)
    )
    display(joined)

    return preds_df


In [None]:
# Run for topic, intent, sentiment and write final unified CSV 

targets = ["topic", "intent", "sentiment"]
prediction_frames = {}

print("\nRunning distillation for targets:", targets)

for col in targets:
    prediction_frames[col] = train_and_predict_openai_label(col)

# Start from the full dataset
final_df = full_df.copy()

# Attach OpenAI labels (topic/intent/sentiment) only for labeled/training subset
final_df = final_df.merge(
    labeled_df[["document_id", "topic", "intent", "sentiment"]],
    on="document_id",
    how="left"
)

# Attach predictions for each target (only defined for test docs)
for col in targets:
    preds_df = prediction_frames[col]
    final_df = final_df.merge(
        preds_df,
        on="document_id",
        how="left"
    )

print("\n=== Final dataset sample with OpenAI + predicted labels ===")
display(final_df.head(10))

# Save everything into a single CSV
output_path = "sampled_documents_lts_labeled_openai.csv"
final_df.to_csv(output_path, index=False)

print(f"\n Final unified CSV saved to: {output_path}")
