# In [None]:
# -----------------------
# Imports and NLTK setup
# -----------------------

import pandas as pd
import nltk
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

# Fetch the NLTK resources used below: the stopword list, the tokenizer
# models behind word_tokenize, and the POS tagger behind pos_tag.
# NLTK releases that load the tagger from 'averaged_perceptron_tagger_eng'
# load the tokenizer from 'punkt_tab' instead of the older 'punkt' package,
# so both tokenizer packages are requested.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# English stopwords as a set for O(1) membership tests.
STOP_WORDS = set(stopwords.words('english'))
# Matches any character that is neither an ASCII letter nor whitespace.
NON_ALPHA_REGEX = re.compile(r'[^a-zA-Z\s]')

# -----------------------
# 1. Load & Filter Data
# -----------------------
# Columns read from the campaigns file; everything else is skipped.
CAMPAIGN_COLUMNS = [
    'description',
    'category',
    'token_counts',
    'launch_date',
    'success',
    'funding_goal',
    'parent_category',
    'failure_date',
    'success_date',
]
df = pd.read_csv('campaigns.csv', usecols=CAMPAIGN_COLUMNS)

# Drop campaigns whose token count does not exceed the minimum.
MIN_TOKEN_COUNT = 50
df = df.loc[df['token_counts'].gt(MIN_TOKEN_COUNT)]

# Retain only categories that still have at least the minimum number of
# campaigns after the token-count filter.
MIN_CAMPAIGNS_PER_CATEGORY = 500
category_sizes = df['category'].value_counts()
valid_categories = category_sizes[category_sizes.ge(MIN_CAMPAIGNS_PER_CATEGORY)].index
df = df.loc[df['category'].isin(valid_categories)].reset_index(drop=True)

# -----------------------
# 2. Extract Noun Phrases from Descriptions
# -----------------------
def extract_noun_phrases(text, grammar=None):
    """
    Extract noun phrases (NP) from text using POS tagging and chunking.

    Args:
        text: Raw text. It is lowercased before tokenization. Values that are
            not strings (for example NaN from an empty CSV cell) or that are
            blank yield an empty list.
        grammar: Optional ``nltk.RegexpParser`` grammar defining ``NP``
            chunks. Defaults to zero or more adjectives followed by one or
            more nouns, i.e. consecutive nouns or adjective + noun sequences.

    Returns:
        List of noun phrases in order of appearance, each with its tokens
        joined by single spaces.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    if grammar is None:
        # A single rule is used on purpose: chunk rules are applied in order
        # and only to tokens that are not already inside a chunk, so a
        # separate noun-only rule placed first would consume every noun and
        # leave nothing for an adjective + noun rule to match.
        grammar = r"""
            NP: {<JJ.*>*<NN.*>+}
        """
    tokens = word_tokenize(text.lower())  # lowercase for uniformity
    tagged_tokens = pos_tag(tokens)
    chunk_parser = nltk.RegexpParser(grammar)
    tree = chunk_parser.parse(tagged_tokens)

    # Each NP subtree's leaves are (word, tag) pairs; keep only the words.
    return [
        " ".join(word for word, _tag in subtree.leaves())
        for subtree in tree.subtrees()
        if subtree.label() == 'NP'
    ]

# One list of noun phrases per campaign description.
df['noun_phrases'] = df['description'].apply(extract_noun_phrases)

# -----------------------
# 3. Clean Noun Phrases
# -----------------------
def clean_phrases(phrases):
    """
    Strip stopwords and tokens containing non-letter characters from phrases.

    Each phrase is re-tokenized; a token is kept only if it is not in
    STOP_WORDS and NON_ALPHA_REGEX finds nothing in it. Phrases left with no
    tokens are dropped; the rest are re-joined with single spaces.
    """
    def _is_kept(token):
        return token not in STOP_WORDS and NON_ALPHA_REGEX.search(token) is None

    surviving_tokens = (
        [token for token in word_tokenize(phrase) if _is_kept(token)]
        for phrase in phrases
    )
    return [" ".join(tokens) for tokens in surviving_tokens if tokens]

# Cleaned counterpart of every campaign's noun-phrase list.
df['cleaned_noun_phrases'] = df['noun_phrases'].apply(clean_phrases)

# -----------------------
# 4. Merge Cleaned Noun Phrases for Vectorization
# -----------------------
def merge_phrases_for_vectorization(phrases):
    """
    Collapse a list of phrases into one space-separated string.

    Spaces inside each phrase become underscores, so a multi-word phrase
    appears as a single whitespace-delimited token in the result.
    """
    underscored = ("_".join(phrase.split(" ")) for phrase in phrases)
    return " ".join(underscored)

merged_phrases = df['cleaned_noun_phrases'].apply(merge_phrases_for_vectorization)
df['merged_noun_phrases'] = merged_phrases

# Discard campaigns whose merged phrase string is empty after stripping
has_phrases = merged_phrases.str.strip().ne('')
df = df.loc[has_phrases].reset_index(drop=True)

# -----------------------
# 5. Compute TF-IDF Matrix for Noun Phrases
# -----------------------
tfidf_vectorizer = TfidfVectorizer(norm=None, binary=False, smooth_idf=False)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['merged_noun_phrases'])
feature_names = tfidf_vectorizer.get_feature_names_out()

# -----------------------
# 6. Extract Non-zero TF-IDF Values per Document
# -----------------------
# Slice each document's stored values directly out of the CSR arrays, so
# every value is paired with its row by construction. Pairing the row indices
# from `nonzero()` with the raw `.data` array would go out of step if the
# matrix ever stored an explicit zero, because `nonzero()` skips such entries
# while `.data` keeps them.
tfidf_csr = tfidf_matrix.tocsr()
row_starts = tfidf_csr.indptr[:-1]
row_stops = tfidf_csr.indptr[1:]

# One list per document (empty when the row stores nothing). `.tolist()`
# yields plain Python floats, which keeps the column's text form in the CSV
# export free of NumPy scalar wrappers.
df['non_zero_tfidf_values'] = [
    row_values[row_values != 0].tolist()
    for row_values in (
        tfidf_csr.data[start:stop] for start, stop in zip(row_starts, row_stops)
    )
]

# -----------------------
# 7. Calculate Novelty Features Based on TF-IDF Scores
# -----------------------
# Compute description length in words
df['doc_length'] = [len(description.split()) for description in df['description']]

# Get top 10 TF-IDF values per document
def get_top_n_tfidf_scores(tfidf_list, top_n=10):
    """
    Return the `top_n` largest scores from `tfidf_list`, highest first.

    A falsy input (such as an empty list) gives an empty list. The input is
    not modified.
    """
    if tfidf_list:
        ranked = list(tfidf_list)
        ranked.sort(reverse=True)
        return ranked[:top_n]
    return []

# Compute top N TF-IDF scores per document
top_scores = df["non_zero_tfidf_values"].apply(get_top_n_tfidf_scores)
df["top_10_tfidf_scores"] = top_scores

# Average the retained scores; documents with no scores get 0.0
df["mean_top_10_tfidf"] = top_scores.map(
    lambda retained: np.mean(retained) if retained else 0.0
)

# Normalize mean TF-IDF by log10 of description length to adjust for length effects
def normalize_mean_tfidf(mean_tfidf, doc_length):
    """
    Divide `mean_tfidf` by log10 of `doc_length`.

    Returns 0.0 unless `doc_length` is greater than 1, which avoids dividing
    by log10(1) == 0 or taking the log of a non-positive length.
    """
    if not (doc_length > 1):
        return 0.0
    length_scale = np.log10(doc_length)
    return mean_tfidf / length_scale

# Length-adjusted score for each campaign, computed pairwise from the mean
# top-10 score and the description length.
df["normalized_mean_top_10_tfidf"] = [
    normalize_mean_tfidf(mean_score, length)
    for mean_score, length in zip(df["mean_top_10_tfidf"], df["doc_length"])
]

df.to_csv('novelty_scores.csv', index=False)