## Occupation Classification
  - Traditional: Logistic Regression (TF-IDF)
  - Neural: Fine-tuned BERT/RoBERTa
  - Metrics: Accuracy, Precision, Recall, F1

### Data Loading

In [14]:
import pandas as pd
data = pd.read_csv('../../data/dataset3/ted_talks_en.csv')
data = data[['occupations', 'transcript']].dropna().reset_index(drop=True)
data.head()

Unnamed: 0,occupations,transcript
0,{0: ['climate advocate']},"Thank you so much, Chris. And it's truly a gre..."
1,{0: ['global health expert; data visionary']},"About 10 years ago, I took on the task to teac..."
2,{0: ['technology columnist']},"(Music: ""The Sound of Silence,"" Simon & Garfun..."
3,{0: ['activist for environmental justice']},If you're here today — and I'm very happy that...
4,"{0: ['author', 'educator']}",Good morning. How are you? (Audience) Good. It...


### Data Preprocessing

In [74]:
from sklearn.model_selection import train_test_split
import ast
import re

clean_data = data.copy()

# extract only occupations of the main speaker
def extract_first_occupation(x):
    if pd.isna(x):
        return []
    if isinstance(x, (list, tuple)):
        return [str(x[0])] if x else []
    if not isinstance(x, str):
        return []
    s = x.strip()
    # try to parse as literal
    try:
        val = ast.literal_eval(s)
        if isinstance(val, dict):
            first_val = next(iter(val.values()))
            if isinstance(first_val, (list, tuple)):
                return [str(first_val[0])] if first_val else []
            return [str(first_val).split(',')[0].strip()] if first_val else []
        if isinstance(val, (list, tuple)):
            return [str(val[0])] if val else []
        if isinstance(val, str):
            return [val.split(',')[0].strip()]
    except Exception:
        pass
    # fallback: check for brackets
    m = re.search(r'\[(.*?)\]', s)
    if m:
        inside = m.group(1)
        parts = [p.strip().strip('\'"') for p in re.split(r',\s*', inside) if p.strip()]
        return [parts[0]] if parts else []
    # fallback: split by comma
    parts = [p.strip().strip('\'"') for p in re.split(r',\s*', s) if p.strip()]
    return [parts[0]] if parts else []

PATTERNS = {
    'author': ['author', 'writer', 'novelist', 'biographer', 'poet'],
    'researcher': ['researcher', 'scientist', 'research', 'physicist', 'chemist', 'biologist'],
    'academic': ['professor', 'lecturer', 'associate professor', 'assistant professor', 'academic'],
    'engineer': ['engineer', 'developer', 'programmer', 'software', 'architect'],
    'entrepreneur': ['entrepreneur', 'founder', 'ceo', 'co-founder', 'startup'],
    'artist': ['artist', 'painter', 'sculptor', 'illustrator', 'designer'],
    'musician': ['musician', 'composer', 'singer', 'songwriter'],
    'actor': ['actor', 'actress', 'performer'],
    'journalist': ['journalist', 'reporter', 'editor'],
    'politician': ['politician', 'minister', 'senator', 'mayor'],
}

def map_to_coarse(label: str) -> str:
    if not isinstance(label, str) or not label.strip():
        return 'other'
    s = label.lower()
    # remove common modifiers
    s = re.sub(r'(\baward[-\s]?winning\b|\bbest[-\s]?selling\b|\bformer\b|\bsenior\b|\bchief\b|\blead\b)', ' ', s)
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    # check patterns
    for coarse, kws in PATTERNS.items():
        for kw in kws:
            if kw in s:
                return coarse
    # fallback: use last word as coarse label
    parts = s.split()
    return parts[-1] if parts else 'other'

clean_data['occupations'] = clean_data['occupations'].apply(extract_first_occupation)

# Remove entries with empty occupations
clean_data = clean_data[clean_data['occupations'].map(lambda x: bool(x))].reset_index(drop=True)

clean_data['transcript'] = clean_data['transcript'].str.replace(r'\[.*?\]', '', regex=True)
clean_data['transcript'] = clean_data['transcript'].str.replace(r'\s+', ' ', regex=True).str.strip()
clean_data['occupations'] = clean_data['occupations'].map(
    lambda lst: [map_to_coarse(lst[0])] if lst else []
)
top_k = 30
counts = clean_data['occupations'].explode().value_counts()
top_labels = set(counts.nlargest(top_k).index.tolist())

def keep_top_or_other(lst):
    if not lst:
        return ['other']
    lab = lst[0]
    return [lab] if lab in top_labels else ['other']

clean_data['occupations'] = clean_data['occupations'].map(keep_top_or_other)
clean_data = clean_data[clean_data['occupations'].map(bool)].reset_index(drop=True)
print("Coarse occupation counts:\n", clean_data['occupations'].explode().value_counts())

train_data, test_data = train_test_split(clean_data, test_size=0.2, random_state=63)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
print(f"Train size: {len(train_data)}, Test size: {len(test_data)}")
print(clean_data.head())


Coarse occupation counts:
 occupations
other             1072
researcher         466
author             289
artist             255
entrepreneur       160
engineer           144
journalist         129
activist           120
expert              98
psychologist        71
economist           63
advocate            54
inventor            50
educator            48
musician            43
photographer        38
academic            33
technologist        31
philosopher         30
filmmaker           28
visionary           28
physician           25
scholar             25
politician          24
actor               24
consultant          24
roboticist          23
theorist            23
strategist          22
anthropologist      22
historian           21
Name: count, dtype: int64
Train size: 2786, Test size: 697
   occupations                                         transcript
0   [advocate]  Thank you so much, Chris. And it's truly a gre...
1  [visionary]  About 10 years ago, I took on the task to

### Traditional Model: Logistic Regression with TF-IDF

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
log_reg = LogisticRegression(max_iter=1000)
multi_target_log_reg = MultiOutputClassifier(log_reg)

pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', multi_target_log_reg)
])

# Prepare training data
mlb = MultiLabelBinarizer()
X_train = train_data['transcript']
y_train = mlb.fit_transform(train_data['occupations'])    # fit on train
X_test = test_data['transcript']
y_test = mlb.transform(test_data['occupations'])          # transform test to same columns
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

# Train the model
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Evaluation
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))

y_train shape: (2786, 31), y_test shape: (697, 31)
Logistic Regression Classification Report:
                precision    recall  f1-score   support

      academic       0.00      0.00      0.00         7
      activist       0.00      0.00      0.00        29
         actor       0.00      0.00      0.00         3
      advocate       0.00      0.00      0.00         6
anthropologist       0.00      0.00      0.00         4
        artist       0.00      0.00      0.00        42
        author       0.00      0.00      0.00        56
    consultant       0.00      0.00      0.00         7
     economist       0.00      0.00      0.00        14
      educator       0.00      0.00      0.00        10
      engineer       0.00      0.00      0.00        24
  entrepreneur       0.00      0.00      0.00        37
        expert       0.00      0.00      0.00        14
     filmmaker       0.00      0.00      0.00         5
     historian       0.00      0.00      0.00         6
      inv