# In [None]:
# -----------------------
# Imports and NLTK setup
# -----------------------

import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
import matplotlib.pyplot as plt

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on
# NLTK >= 3.8.2; 'punkt' is kept so older NLTK releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords (as a set for O(1) membership tests) and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


# -----------------------
# 1. Load & Filter Data
# -----------------------

# Read only the columns this pipeline uses
df = pd.read_csv('campaigns.csv', usecols=['category', 'token_counts', 'description'])

# Keep descriptions with more than 50 tokens; renumber rows from 0
df = df.loc[df['token_counts'].gt(50)].reset_index(drop=True)

# Keep only categories that have at least MIN_SAMPLES_PER_CATEGORY rows
MIN_SAMPLES_PER_CATEGORY = 500
category_counts = df['category'].value_counts()
valid_categories = category_counts.loc[
    lambda counts: counts >= MIN_SAMPLES_PER_CATEGORY
].index
df = df.loc[df['category'].isin(valid_categories)].reset_index(drop=True)

# -----------------------
# 2. Category Label Encoding
# -----------------------

# Sorted list of the distinct category names remaining after filtering
categories = sorted(pd.unique(df['category']))

# Fit the encoder on the category names, then map each row to its integer label
label_encoder = LabelEncoder().fit(df['category'])
df['category_label'] = label_encoder.transform(df['category'])

# -----------------------
# 3. Balance Dataset
# -----------------------

def balance_dataset(dataframe, category_column):
    """
    Balance dataset by undersampling each category to the size of the smallest category.

    Each category is sampled independently with ``random_state=42``, so the
    result is reproducible. Groups are concatenated in sorted category order
    and the index is renumbered from 0.

    Args:
        dataframe: Input DataFrame containing ``category_column``.
        category_column: Name of the column holding the category of each row.

    Returns:
        A new DataFrame with the same columns as ``dataframe`` (including
        ``category_column``) and an equal number of rows per category. An
        empty frame with the same columns is returned when there is nothing
        to sample from.
    """
    counts = dataframe[category_column].value_counts()
    if counts.empty:
        # No rows (or no non-missing categories): nothing to balance.
        return dataframe.iloc[0:0].reset_index(drop=True)

    min_samples = int(counts.min())

    # Sample each group explicitly instead of using groupby(...).apply(...):
    # apply() on the grouping column is deprecated in pandas 2.2 and drops
    # that column in pandas 3, whereas iterating the groups always keeps it.
    sampled_groups = [
        group.sample(n=min_samples, random_state=42)
        for _, group in dataframe.groupby(category_column)
    ]
    return pd.concat(sampled_groups).reset_index(drop=True)

# Undersample every category down to the size of the smallest one
df_balanced = balance_dataset(dataframe=df, category_column='category')

# -----------------------
# 4. Text Preprocessing and Tagging for Doc2Vec
# -----------------------

def preprocess_text(text):
    """
    Tokenize, lowercase, remove stopwords and non-alpha tokens, lemmatize.

    Args:
        text: Raw document text. Non-string values (e.g. ``None`` or the
            float NaN that pandas uses for a missing CSV cell) are treated
            as an empty document.

    Returns:
        List of lemmatized, lowercase, alphabetic tokens with English
        stopwords removed; an empty list for non-string input.
    """
    # Missing descriptions arrive as NaN/None and have no .lower(); return
    # no tokens instead of raising AttributeError mid-pipeline.
    if not isinstance(text, str):
        return []

    tokens = word_tokenize(text.lower())
    clean_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]
    return clean_tokens

# Build the Doc2Vec training corpus: each document carries two tags,
# its category label (as a string) and a unique per-document id.
balanced_token_lists = [preprocess_text(desc) for desc in df_balanced['description']]
balanced_label_tags = [str(label) for label in df_balanced['category_label']]
tagged_docs_balanced = [
    TaggedDocument(words=tokens, tags=[label_tag, f"doc_{i}"])
    for i, (tokens, label_tag) in enumerate(zip(balanced_token_lists, balanced_label_tags))
]

# -----------------------
# 5. Train Doc2Vec Model
# -----------------------

# Create the model with its hyperparameters only; no corpus is passed here,
# so the vocabulary is built and training is run explicitly below.
doc2vec_model = Doc2Vec(vector_size=500, window=10, min_count=2, epochs=20, dm=1)  # dm=1 for distributed memory
# Build the vocabulary from the balanced, tagged corpus before training.
doc2vec_model.build_vocab(tagged_docs_balanced)
# Train on the same corpus, reusing the document count recorded on the model
# and the epoch count configured above.
doc2vec_model.train(tagged_docs_balanced, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# -----------------------
# 6. Generate Document Vectors
# -----------------------

# Infer one vector per balanced document from its token list
# (TaggedDocument unpacks as a (words, tags) pair).
balanced_doc_vectors = [
    doc2vec_model.infer_vector(words)
    for words, _tags in tagged_docs_balanced
]

# -----------------------
# 7. Classification: Logistic Regression on Balanced Dataset to Tune Doc2Vec Hyperparameters
# -----------------------

# Features are the inferred document vectors; targets are the integer labels
X_balanced = balanced_doc_vectors
y_balanced = df_balanced['category_label']

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    stratify=y_balanced,
    random_state=42,
)

# Fit the classifier and predict on the held-out split
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Balanced Dataset Classification Report:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# -----------------------
# 8. Generate Document Vectors and Classify on Full (Unbalanced) Dataset using Tuned Hyperparameters
# -----------------------

# Preprocess every description of the full (unbalanced) dataset once
full_token_lists = [preprocess_text(desc) for desc in df['description']]

# Tag each document with a unique id only (no category tag here)
tagged_docs_full = [
    TaggedDocument(words=tokens, tags=[f"doc_{i}"])
    for i, tokens in enumerate(full_token_lists)
]

# Infer a vector for every document with the already-trained model
full_doc_vectors = [doc2vec_model.infer_vector(tokens) for tokens in full_token_lists]

# Features and targets for the full (unbalanced) dataset
X_full = full_doc_vectors
y_full = df['category_label']

# Stratified 80/20 split with a fixed seed
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    stratify=y_full,
    random_state=42,
)

# class_weight='balanced' is set here because the categories are unbalanced
clf_balanced = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train_f, y_train_f)
y_pred_f = clf_balanced.predict(X_test_f)

print("Full Dataset Classification Report:")
print("Accuracy:", accuracy_score(y_test_f, y_pred_f))
print(classification_report(y_test_f, y_pred_f, target_names=label_encoder.classes_))

# -----------------------
# 9. Calculate Category Coherence Using Cosine Similarity
# -----------------------

# Convert full document vectors to DataFrame for easier indexing.
# Row i of vectors_df is the vector of row i of df (both use a 0..n-1 index).
vectors_df = pd.DataFrame(full_doc_vectors)

category_coherence_scores = []

for category in categories:
    # Select document vectors for this category
    indices = df.index[df['category'] == category]
    category_vectors = vectors_df.loc[indices].values
    n_docs = len(category_vectors)

    # Skip if fewer than 2 documents (cannot compute pairwise similarity)
    if n_docs < 2:
        category_coherence_scores.append(np.nan)
        continue

    # Compute the full (n_docs x n_docs) cosine similarity matrix
    cos_sim_matrix = cosine_similarity(category_vectors)

    # Extract each unique pair (i < j) once: the strict upper triangle.
    # Vectorised indexing replaces a Python-level loop over all
    # n_docs * (n_docs - 1) / 2 pairs, which is very slow for categories with
    # hundreds or thousands of documents. Elements come out in the same
    # row-major order as itertools.combinations would produce.
    upper_rows, upper_cols = np.triu_indices(n_docs, k=1)
    similarities = cos_sim_matrix[upper_rows, upper_cols]

    # Append average similarity for this category
    category_coherence_scores.append(np.mean(similarities))

# Create final DataFrame: one row per category, in the order of `categories`
category_coherence_df = pd.DataFrame({
    'category': categories,
    'coherence_score': category_coherence_scores
})

category_coherence_df.to_csv('coherence_scores.csv', index=False)
