# In [None]:
# -----------------------
# Imports and NLTK setup
# -----------------------

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from tqdm import tqdm
from collections import defaultdict
from scipy.spatial import distance
import re

# Download the NLTK resources the rest of the script relies on:
#   - 'stopwords'                       -> stopwords.words('english')
#   - 'punkt' / 'punkt_tab'             -> word_tokenize
#   - 'averaged_perceptron_tagger_eng'  -> pos_tag
nltk.download('stopwords')
nltk.download('punkt')
# NLTK releases that ship the tagger under the '_eng' name requested below load
# the tokenizer models from 'punkt_tab' instead of the legacy 'punkt' package;
# without it word_tokenize raises LookupError. 'punkt' is still fetched so that
# older installs keep working.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# English stopwords as a set for O(1) membership tests during phrase cleaning.
STOP_WORDS = set(stopwords.words('english'))
# Matches any character that is neither an ASCII letter nor whitespace.
NON_ALPHA_REGEX = re.compile(r'[^a-zA-Z\s]')

# -----------------------
# 1. Load & Filter Data
# -----------------------
# Read only the three columns this analysis uses.
df = pd.read_csv(
    'campaigns.csv',
    usecols=['description', 'category', 'token_counts'],
)

# Discard short campaigns: a row survives only if it has more than 50 tokens.
has_enough_tokens = df['token_counts'] > 50
df = df.loc[has_enough_tokens]

# Discard sparse categories: a category survives only if at least
# MIN_SAMPLES_PER_CATEGORY of the remaining campaigns belong to it.
MIN_SAMPLES_PER_CATEGORY = 500
category_counts = df['category'].value_counts()
is_large_enough = category_counts >= MIN_SAMPLES_PER_CATEGORY
valid_categories = category_counts.loc[is_large_enough].index

# The index is rebuilt as 0..n-1 so row labels and row positions coincide.
in_valid_category = df['category'].isin(valid_categories)
df = df.loc[in_valid_category].reset_index(drop=True)

# -----------------------
# 2. Noun Phrase Extraction
# -----------------------
# Default chunk grammar: an NP is either a run of two or more noun tags, or one
# or more adjective tags followed by a single noun tag.
_DEFAULT_NP_GRAMMAR = """
    NP: {<NN.?>+<NN.?>}
    NP: {<JJ.?>+<NN.?>}
"""

# Chunk parsers already built, keyed by grammar string, so that mapping
# extract_noun_phrases over a whole column does not recompile the same grammar
# for every row.
_CHUNK_PARSERS = {}


def _get_chunk_parser(grammar):
    """Return a RegexpParser for ``grammar``, reusing an earlier one if possible."""
    if not isinstance(grammar, str):
        # Non-string grammars may be unhashable, so they are built on demand.
        return nltk.RegexpParser(grammar)
    parser = _CHUNK_PARSERS.get(grammar)
    if parser is None:
        parser = nltk.RegexpParser(grammar)
        _CHUNK_PARSERS[grammar] = parser
    return parser


def extract_noun_phrases(text, grammar=None):
    """
    Extract noun phrases from text using POS tagging and chunking.

    The text is tokenized with ``word_tokenize``, tagged with ``pos_tag`` and
    chunked with an ``nltk.RegexpParser``; the words of every subtree labelled
    ``NP`` are joined with single spaces.

    Args:
        text: The text to analyse. Anything that is not a string (for example
            the NaN pandas stores for a missing cell) or that contains only
            whitespace yields an empty result instead of an error.
        grammar: Chunk grammar handed to ``nltk.RegexpParser``. When ``None``,
            the default grammar above is used.

    Returns:
        A list of noun-phrase strings in the order the chunker yields them.
    """
    # Missing or blank descriptions have no phrases; bail out before the
    # tokenizer sees a non-string value.
    if not isinstance(text, str) or not text.strip():
        return []

    if grammar is None:
        grammar = _DEFAULT_NP_GRAMMAR

    tagged = pos_tag(word_tokenize(text))
    chunked = _get_chunk_parser(grammar).parse(tagged)

    return [
        " ".join(word for word, _tag in subtree.leaves())
        for subtree in chunked.subtrees(lambda t: t.label() == 'NP')
    ]

# Run noun-phrase extraction on every campaign description; each row of the new
# 'noun_phrases' column holds the list returned by extract_noun_phrases.
df['noun_phrases'] = df['description'].map(extract_noun_phrases)

# -----------------------
# 3. Phrase Cleaning
# -----------------------
def clean_phrases(phrases):
    """
    Strip unwanted tokens from each noun phrase.

    Every phrase is re-tokenized with ``word_tokenize``. A token is dropped when
    its lowercase form is in ``STOP_WORDS`` or when ``NON_ALPHA_REGEX`` matches
    anywhere in it. The surviving tokens are joined with single spaces, and a
    phrase with no surviving tokens is left out of the result.

    Args:
        phrases: Iterable of phrase strings.

    Returns:
        A list of cleaned phrase strings, in input order.
    """
    def _keep(token):
        # Keep a token only if it is not a stopword and is purely alphabetic.
        return token.lower() not in STOP_WORDS and NON_ALPHA_REGEX.search(token) is None

    result = []
    for phrase in phrases:
        kept_tokens = [token for token in word_tokenize(phrase) if _keep(token)]
        if not kept_tokens:
            continue
        result.append(" ".join(kept_tokens))
    return result

# Overwrite each row's raw noun phrases with the cleaned list returned by
# clean_phrases.
df['noun_phrases'] = df['noun_phrases'].map(clean_phrases)

# -----------------------
# 4. Map phrases to campaigns
# -----------------------
# phrase -> list of campaign row positions in which the phrase occurs
# (each campaign listed at most once per phrase).
phrase_to_campaigns = defaultdict(list)

# enumerate() yields positional row numbers, which is what the category lookup
# later in the file expects because it indexes with .iloc. total= lets tqdm show
# real progress, since enumerate() has no length.
for idx, phrases in tqdm(
    enumerate(df['noun_phrases']),
    total=len(df),
    desc="Mapping phrases to campaigns",
):
    # Rows are visited one at a time, so a campaign can only be recorded twice
    # for the same phrase if the phrase repeats within that row. Collapsing
    # repeats here (dict.fromkeys keeps first-seen order) replaces the linear
    # `idx not in list` scan that made the loop quadratic for common phrases.
    for phrase in dict.fromkeys(phrases):
        phrase_to_campaigns[phrase].append(idx)

phrase_df = pd.DataFrame(
    list(phrase_to_campaigns.items()),
    columns=['noun_phrase', 'campaign_indexes'],
)

# Filter phrases that appear in at least 2 campaigns and at most 50% of all campaigns
MAX_CAMPAIGNS_PER_PHRASE = len(df) // 2
campaigns_per_phrase = phrase_df['campaign_indexes'].map(len)
mask = campaigns_per_phrase.between(2, MAX_CAMPAIGNS_PER_PHRASE)  # inclusive on both ends
# Re-number the surviving phrases 0..n-1; later steps use these numbers as
# positions into per-category count vectors.
phrase_df = phrase_df[mask].reset_index(drop=True)

# -----------------------
# 5. Map phrases to categories
# -----------------------
# Category of every campaign, looked up by row position below.
campaign_categories = df['category']


def get_categories_for_phrase(indexes):
    """
    Look up the category of each campaign position in ``indexes``.

    Returns a list with one entry per element of ``indexes``, in the same
    order, so a category is repeated once for every listed campaign that
    belongs to it.
    """
    by_position = campaign_categories.iloc
    return [by_position[pos] for pos in indexes]


# For every phrase, the categories of the campaigns that contain it.
phrase_df['categories'] = phrase_df['campaign_indexes'].map(get_categories_for_phrase)

# -----------------------
# 6. Map categories to phrases
# -----------------------
# category -> list of phrase row numbers. A phrase number is appended once per
# entry in that phrase's 'categories' list, so it can repeat within a category.
category_to_phrases = defaultdict(list)

for phrase_idx, phrase_categories in phrase_df['categories'].items():
    for category_name in phrase_categories:
        category_to_phrases[category_name].append(phrase_idx)

category_rows = list(category_to_phrases.items())
category_df = pd.DataFrame(category_rows, columns=['category', 'phrase_indexes'])


def get_phrases_for_category(indexes):
    """
    Translate phrase row numbers into the phrase strings stored in ``phrase_df``.

    Returns a list with one phrase per element of ``indexes``, in the same order.
    """
    phrase_column = phrase_df['noun_phrase']
    return [phrase_column.iloc[pos] for pos in indexes]


category_df['noun_phrases'] = category_df['phrase_indexes'].map(get_phrases_for_category)

# -----------------------
# 7. Compute distribution vectors for each category
# -----------------------
# Length of every distribution vector: one slot per retained phrase.
num_phrases = len(phrase_df)


def category_distribution(indexes):
    """
    Count how often each phrase number occurs in ``indexes``.

    Returns a list of ``num_phrases`` integers in which slot ``k`` holds the
    number of times ``k`` appears in ``indexes``.
    """
    histogram = [0 for _ in range(num_phrases)]
    for position in indexes:
        histogram[position] = histogram[position] + 1
    return histogram


category_df['distribution'] = category_df['phrase_indexes'].map(category_distribution)

# -----------------------
# 8. Compute Jensen–Shannon divergences (squared JS distances) between categories
# -----------------------
categories = category_df['category'].tolist()
distributions = category_df['distribution'].tolist()

# Pairwise matrix of squared Jensen–Shannon distances (the JS divergence)
# between the categories' phrase-count vectors.
#
# The measure is symmetric and is zero between a distribution and itself, so
# only the pairs above the diagonal are evaluated and each value is mirrored.
# That gives the same matrix as evaluating every (i, j) pair with roughly half
# the SciPy calls. The values are collected in a plain nested list and the
# DataFrame is built once, which avoids a per-cell DataFrame write.
num_categories = len(categories)
divergences = [[0.0] * num_categories for _ in range(num_categories)]

for i, distribution_i in enumerate(distributions):
    for j in range(i + 1, num_categories):
        dist = distance.jensenshannon(distribution_i, distributions[j]) ** 2
        divergences[i][j] = dist
        divergences[j][i] = dist

# Rows and columns are both labelled with the category names.
category_distinctiveness_df = pd.DataFrame(
    divergences,
    index=categories,
    columns=categories,
    dtype=float,
)

# -----------------------
# 9. Calculate final distinctiveness score per category
# -----------------------
# Number of categories being compared.
TOTAL_CATEGORIES = len(categories)

# Each category's score is its row total divided by the number of categories.
# The row includes the category's own diagonal cell.
row_totals = category_distinctiveness_df.sum(axis=1)
final_distinctiveness_scores = row_totals.div(TOTAL_CATEGORIES)

# Append the score as an extra column next to the pairwise matrix.
category_distinctiveness_df['final_distinctiveness_score'] = final_distinctiveness_scores

# -----------------------
# 10. Output Result
# -----------------------
# Write the pairwise matrix together with the final score column to disk;
# index=True keeps the category labels as the first CSV column.
category_distinctiveness_df.to_csv('distinctiveness_scores.csv', index=True)