# NLP Character Analysis: Collocates & PMI

## Description
This notebook analyzes character signatures in novels by identifying words that frequently co-occur with specific characters (Collocates) and calculating Pointwise Mutual Information (PMI) to find statistically significant associations.

## Requirements
1.  **Tokenization & Cleaning:** Sentence splitting, stopword removal, lemmatization.
2.  **Collocate Extraction:** Window-based context analysis (±5 words).
3.  **PMI Calculation:** Statistical measure of association strength.
4.  **Visualization:** Bar charts (Frequency vs. PMI) and Word Clouds.
5.  **Export:** Results saved as CSV and PNG files.

In [None]:
import os
import re
import math
import string
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# --- 1. SETUP & CONFIGURATION ---

# Ensure the NLTK resources used by the tokenizers, the stopword list and the
# lemmatizer are available locally (each call is a no-op download check once
# the resource is already installed).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Define paths based on your repository structure
DATA_DIR = 'data'
RESULTS_DIR = 'results'

# Create the results directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Configuration for analysis
WINDOW_SIZE = 5  # +/- 5 tokens (counted after stopword/punctuation removal)
PMI_FREQ_THRESHOLD = 5  # A collocate must occur at least this many times near the character to get a PMI score
TOP_N = 20  # Number of top words to visualize

# Define the books and the target characters for each
# Note: Filenames must match exactly what is in your data folder.
# NOTE(review): each character name is matched as a single lowercased token,
# so confirm every name is spelled as it appears in the text being analyzed.
BOOKS_CONFIG = {
    "Anna Karenina": {
        "filename": "The Project Gutenberg eBook of Anna Karenina, by Leo Tolstoy.txt",
        "characters": ["Anna", "Vronsky", "Levin", "Kitty", "Karenin"],
    },
    "War and Peace": {
        "filename": "The Project Gutenberg eBook of War and Peace, by Leo Tolstoy.txt",
        "characters": ["Pierre", "Natasha", "Andrei", "Rostov", "Mary"],
    },
}

## 2. Preprocessing Functions
Here we define functions to load text, clean it (lemmatization, stopword removal), and tokenize it into sentences and words.

In [None]:
def load_text(filepath):
    """
    Read a whole text file and return its contents as a string.

    filepath: Path to the text file.
    Returns: The file contents, or "" (after printing an error message) when
             the file does not exist.

    The file is decoded as 'utf-8-sig': plain UTF-8 is read exactly as before,
    but a leading byte-order mark, if present, is dropped instead of leaking
    into the text as a stray '\\ufeff' glued to the first token.
    """
    try:
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            return f.read()
    except FileNotFoundError:
        # Best-effort: report and let the caller skip this book.
        print(f"Error: File not found at {filepath}")
        return ""

def clean_and_tokenize(text):
    """
    Turn raw text into lemmatized, stopword-free tokens.

    Pipeline, applied sentence by sentence:
      1. Split the text into sentences, then each sentence into word tokens.
      2. Discard tokens for which str.isalnum() is false (punctuation and any
         token containing a non-alphanumeric character).
      3. Lowercase and discard English stopwords.
      4. Lemmatize what is left.

    Returns a tuple (all_tokens, tokenized_sentences):
      - all_tokens: flat list of every kept lemma, in document order
        (for frequency counts)
      - tokenized_sentences: list of per-sentence lemma lists, with sentences
        that end up empty left out (for context window analysis)
    """
    ignored = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    all_tokens = []
    tokenized_sentences = []

    for raw_sentence in sent_tokenize(text):
        lemmas = [
            lemmatize(token.lower())
            for token in word_tokenize(raw_sentence)
            if token.isalnum() and token.lower() not in ignored
        ]
        if not lemmas:
            # Nothing survived cleaning; do not record an empty sentence.
            continue
        tokenized_sentences.append(lemmas)
        all_tokens.extend(lemmas)

    return all_tokens, tokenized_sentences

## 3. Core Analytics (Collocates & PMI)
These functions handle the logic for finding context words and calculating the PMI statistics.

In [None]:
def get_collocates(target_char_names, tokenized_sentences, window=5):
    """
    Collect the words appearing within +/- `window` tokens of a character name.

    target_char_names: List of name variations (e.g., ["anna", "karenina"]).
                       A single string is also accepted and treated as one name.
                       Names are lowercased before matching.
    tokenized_sentences: List of token lists; windows never cross a sentence
                         boundary.
    window: Number of tokens taken on each side of a match.

    Returns: Flat list of context tokens, one entry per (match, neighbour)
             pair. Only the matched position itself is left out, so
             overlapping windows contribute a neighbour more than once and
             another target name inside the window is kept as a collocate.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(target_char_names, str):
        target_char_names = [target_char_names]

    # Set membership keeps the per-token test O(1).
    targets = {name.lower() for name in target_char_names}

    collocates = []
    for sentence in tokenized_sentences:
        for i, word in enumerate(sentence):
            if word not in targets:
                continue
            # Clamp the window to the sentence bounds.
            start = max(0, i - window)
            end = min(len(sentence), i + window + 1)
            collocates.extend(sentence[start:i])
            collocates.extend(sentence[i + 1:end])

    return collocates

def calculate_pmi(char_collocates, total_corpus_tokens, top_n=20, min_freq=None):
    """
    Rank a character's collocates by Pointwise Mutual Information.

    PMI(w, c) = log( P(w|c) / P(w) )      (natural logarithm)

    Interpretation: How much more likely is 'w' to appear near the character
    than it is to appear randomly in the book?

    char_collocates: List of context tokens found near the character.
    total_corpus_tokens: List of every token in the book.
    top_n: Maximum number of (word, pmi) pairs to return.
    min_freq: Minimum number of times a word must occur in `char_collocates`
              to be scored. Defaults to the module-level PMI_FREQ_THRESHOLD.

    Returns: List of (word, pmi) tuples sorted by descending PMI, at most
             `top_n` long. Empty when either input is empty.
    """
    if min_freq is None:
        min_freq = PMI_FREQ_THRESHOLD

    total_collocates = len(char_collocates)
    total_corpus = len(total_corpus_tokens)
    # No probabilities can be formed from an empty sample; this also avoids
    # dividing by zero below.
    if total_collocates == 0 or total_corpus == 0:
        return []

    collocate_counts = Counter(char_collocates)
    corpus_counts = Counter(total_corpus_tokens)

    pmi_scores = {}
    for word, count_in_context in collocate_counts.items():
        # Apply frequency threshold to avoid noise from rare words.
        if count_in_context < min_freq:
            continue

        count_in_corpus = corpus_counts[word]
        if count_in_corpus == 0:
            # Word never seen in the corpus: P(w) is 0 and PMI is undefined.
            continue

        # P(w|c) = count of w near c / total words near c
        p_w_given_c = count_in_context / total_collocates
        # P(w) = count of w in whole book / total words in book
        p_w = count_in_corpus / total_corpus

        pmi_scores[word] = math.log(p_w_given_c / p_w)

    # Sort by highest PMI
    ranked = sorted(pmi_scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

## 4. Visualization & Export
This section generates the graphs (Bar Charts, Word Clouds) and saves the data to CSV.

In [None]:
def generate_visualizations(char_name, collocate_counts, pmi_scores, book_title):
    """
    Render and save the charts for one character.

    char_name: Character name, used in titles and in the output file names.
    collocate_counts: Counter of collocate -> frequency.
    pmi_scores: List of (word, pmi) pairs, already sorted and truncated.
    book_title: Book title; its first word (lowercased) prefixes the file names.

    Writes `<book>_<character>_bars.png` (frequency and PMI bar charts side by
    side) into RESULTS_DIR, and `<book>_<character>_wordcloud.png` as well
    when `collocate_counts` is non-empty.
    """
    most_frequent = collocate_counts.most_common(TOP_N)

    # One figure, two horizontal bar charts: raw frequency on the left, PMI on the right.
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle(f"Character Signature: {char_name} ({book_title})", fontsize=16)

    panels = (
        (axes[0], most_frequent, 'skyblue', f"Top {TOP_N} Frequent Collocates", "Frequency"),
        (axes[1], pmi_scores, 'salmon', f"Top {TOP_N} PMI Collocates", "PMI Score"),
    )
    for ax, pairs, colour, title, xlabel in panels:
        labels, values = zip(*pairs) if pairs else ([], [])
        # barh draws bottom-up, so reverse to put the top-ranked word at the top.
        ax.barh(labels[::-1], values[::-1], color=colour)
        ax.set_title(title)
        ax.set_xlabel(xlabel)

    plt.tight_layout()
    # Leave headroom for the suptitle, which tight_layout does not account for here.
    plt.subplots_adjust(top=0.88)

    char_slug = char_name.lower().replace(" ", "_")
    book_slug = book_title.split()[0].lower()
    plt.savefig(f"{RESULTS_DIR}/{book_slug}_{char_slug}_bars.png")
    plt.close()

    # Word cloud (frequency based); skipped when there is nothing to draw.
    if not collocate_counts:
        return

    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud = cloud.generate_from_frequencies(collocate_counts)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Collocate Word Cloud: {char_name}")
    plt.savefig(f"{RESULTS_DIR}/{book_slug}_{char_slug}_wordcloud.png")
    plt.close()

def save_csv(char_name, collocate_counts, pmi_scores, book_title):
    """
    Write one character's collocate statistics to a CSV file.

    char_name: Character name, used in the output file name.
    collocate_counts: Counter of collocate -> frequency.
    pmi_scores: List of (word, pmi) pairs.
    book_title: Book title; its first word (lowercased) prefixes the file name.

    The file `<book>_<character>_stats.csv` in RESULTS_DIR has the columns
    word, frequency and pmi_score. The two tables are outer-joined on `word`,
    so a word present in only one of them is kept with the other column empty.
    """
    freq_table = pd.DataFrame(
        collocate_counts.most_common(), columns=['word', 'frequency']
    )
    pmi_table = pd.DataFrame(pmi_scores, columns=['word', 'pmi_score'])

    # Outer join keeps every word from either table.
    combined = freq_table.merge(pmi_table, on='word', how='outer')

    char_slug = char_name.lower().replace(" ", "_")
    book_slug = book_title.split()[0].lower()
    combined.to_csv(f"{RESULTS_DIR}/{book_slug}_{char_slug}_stats.csv", index=False)

## 5. Main Execution
Run this cell to process the books, generate all statistics, and save the results to the `results/` folder.

In [None]:
def run_analysis():
    """
    Run the full pipeline for every book in BOOKS_CONFIG.

    For each book: load the text from DATA_DIR, tokenize it once, then for
    each configured character collect collocates, compute PMI, and write the
    charts and the CSV. Books whose text comes back empty and characters with
    no collocates are skipped (the latter with a printed warning).
    """
    print("Starting Analysis...")

    for book_title, config in BOOKS_CONFIG.items():
        print(f"\n--- Processing Book: {book_title} ---")

        # Load; an empty result means there is nothing to analyze for this book.
        raw_text = load_text(os.path.join(DATA_DIR, config['filename']))
        if not raw_text:
            continue

        # Tokenize the whole book once and reuse the result for every character.
        print("Tokenizing and cleaning text (this may take a moment)...")
        full_tokens, processed_sentences = clean_and_tokenize(raw_text)
        print(f"Total tokens: {len(full_tokens)}")

        for char_name in config['characters']:
            print(f"Analyzing character: {char_name}...")

            # Only the exact configured name (e.g. "Anna") is searched for.
            collocates = get_collocates(
                [char_name], processed_sentences, window=WINDOW_SIZE
            )

            if collocates:
                collocate_counts = Counter(collocates)
                pmi_results = calculate_pmi(collocates, full_tokens, top_n=TOP_N)

                generate_visualizations(char_name, collocate_counts, pmi_results, book_title)
                save_csv(char_name, collocate_counts, pmi_results, book_title)
            else:
                print(f"Warning: No occurrences found for {char_name}")

    print(f"\nAnalysis Complete! Check the '{RESULTS_DIR}' folder.")

# Run the whole pipeline for every book in BOOKS_CONFIG; charts and CSVs are
# written under RESULTS_DIR.
run_analysis()