# NLP Character Analysis: Presentation Graphs

## Description
This notebook analyzes character signatures and generates **three distinct graphs** for a presentation:
1.  **Scatter Plot:** Frequency vs. PMI (pointwise mutual information), to show common vs. unique words.
2.  **Grouped Bar Chart:** Comparing specific themes between two characters.
3.  **Word Cloud:** A visual summary of the main character's context.

In [None]:
import os
import math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# --- SETUP ---
# NLTK resources needed by the imports above: tokenizer models for
# sent_tokenize/word_tokenize, the English stop-word list, and WordNet for the
# lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# PATHS (Adjusted for your folder structure)
DATA_DIR = '../data'
RESULTS_DIR = '../results'

# exist_ok=True makes this a no-op when the folder already exists and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(RESULTS_DIR, exist_ok=True)

# CONFIG
# Number of tokens taken on each side of a character mention.
WINDOW_SIZE = 5
# Minimum collocate count a word needs before it is scored.
PMI_FREQ_THRESHOLD = 5

# We will focus on ONE book for the presentation graphs to keep it clean
TARGET_BOOK = "The Project Gutenberg eBook of Anna Karenina, by Leo Tolstoy.txt"
MAIN_CHAR = "Anna"
COMPARE_CHAR = "Levin"

## 1. Processing Functions

In [None]:
def load_text(filepath):
    """Return the entire contents of a UTF-8 text file.

    A missing file is reported on stdout and produces an empty string instead
    of an exception, so callers can simply test the result for truthiness.
    """
    try:
        handle = open(filepath, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return ""

    # The context manager closes the file once the read completes.
    with handle:
        return handle.read()

def clean_and_tokenize(text):
    """Split text into sentences of lowercase, lemmatized content words.

    Tokens are kept only if they are purely alphanumeric and, once lowercased,
    are not English stop words; survivors are lemmatized.

    Returns:
        A pair ``(full_token_list, processed_sentences)``: the flat list of
        all kept lemmas in reading order, and the same lemmas grouped per
        sentence (sentences with no kept word are omitted).
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    full_token_list = []
    processed_sentences = []

    for sentence in sent_tokenize(text):
        lemmas = [
            lemmatizer.lemmatize(token.lower())
            for token in word_tokenize(sentence)
            if token.isalnum() and token.lower() not in stop_words
        ]
        if not lemmas:
            continue
        processed_sentences.append(lemmas)
        full_token_list.extend(lemmas)

    return full_token_list, processed_sentences

def get_collocates(target_name, sentences, window=5):
    """Gather the words surrounding every mention of a name.

    Args:
        target_name: Name to look for; it is lowercased before matching.
        sentences: Token lists, one per sentence.
        window: How many tokens to take on each side of a mention. The window
            never crosses a sentence boundary.

    Returns:
        A flat list of neighbouring tokens (the matched token itself is
        excluded), in the order the mentions are encountered.
    """
    needle = target_name.lower()
    neighbours = []
    for tokens in sentences:
        positions = (pos for pos, tok in enumerate(tokens) if tok == needle)
        for pos in positions:
            left = tokens[max(0, pos - window):pos]
            right = tokens[pos + 1:min(len(tokens), pos + window + 1)]
            neighbours.extend(left + right)
    return neighbours

def calculate_stats(collocates, full_tokens, min_freq=None):
    """Score each collocate by frequency and pointwise mutual information.

    PMI is computed as ``ln(P(word | context) / P(word))``, where
    ``P(word | context)`` is the word's share of ``collocates`` and ``P(word)``
    is its share of ``full_tokens``.

    Args:
        collocates: Words observed around the target (duplicates count).
        full_tokens: Every token of the corpus, used as the baseline.
        min_freq: Minimum collocate count for a word to be kept. ``None``
            (the default) falls back to the module-level
            ``PMI_FREQ_THRESHOLD``.

    Returns:
        A DataFrame with columns ``word``, ``freq`` and ``pmi``, one row per
        retained word. The columns are present even when no word qualifies,
        so callers can select them without first checking for emptiness.
    """
    columns = ['word', 'freq', 'pmi']
    if min_freq is None:
        min_freq = PMI_FREQ_THRESHOLD

    total_collocates = len(collocates)
    total_corpus = len(full_tokens)
    # Without both samples there is nothing to score, and the probability
    # ratios below would divide by zero.
    if not total_collocates or not total_corpus:
        return pd.DataFrame(columns=columns)

    collocate_counts = Counter(collocates)
    corpus_counts = Counter(full_tokens)

    stats = []
    for word, count in collocate_counts.items():
        if count < min_freq:
            continue
        corpus_count = corpus_counts[word]
        # A collocate that never occurs in the corpus sample has no baseline
        # probability, so its PMI is undefined; skip it.
        if not corpus_count:
            continue
        p_w_given_c = count / total_collocates
        p_w = corpus_count / total_corpus
        stats.append({'word': word, 'freq': count, 'pmi': math.log(p_w_given_c / p_w)})

    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(stats, columns=columns)

## 2. Generate Presentation Graphs
This section creates 3 specific graphs: **Scatter Plot, Grouped Bar Chart, and Word Cloud**.

In [None]:
# Thematic words compared in Graph 2 when the caller does not supply a list.
_DEFAULT_COMPARISON_WORDS = ('love', 'home', 'life', 'wife', 'eye', 'hand', 'smile', 'guilt')


def _plot_frequency_vs_pmi(df_main, main_name, top_n=40):
    """Graph 1: scatter the ``top_n`` most frequent words, frequency vs. PMI."""
    fig = plt.figure(figsize=(10, 6))

    # Filter for readability (top `top_n` by frequency)
    subset = df_main.sort_values(by='freq', ascending=False).head(top_n)

    plt.scatter(subset['freq'], subset['pmi'], color='purple', alpha=0.6, s=100)

    # Label points; the small x offset keeps the text clear of the marker.
    for word, freq, pmi in zip(subset['word'], subset['freq'], subset['pmi']):
        plt.text(freq + 0.5, pmi, word, fontsize=9)

    plt.title(f"Graph 1: Word Usage Distribution for {main_name}\n(Frequency vs. Uniqueness)", fontsize=14)
    plt.xlabel("Frequency (Count)")
    plt.ylabel("PMI Score (Uniqueness)")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.savefig(f"{RESULTS_DIR}/presentation_graph_1_scatter.png")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def _theme_counts(df, words):
    """Return the summed ``freq`` for each word; 0 where the word is absent."""
    # An empty frame may lack the 'word'/'freq' columns altogether.
    if df.empty:
        return [0] * len(words)
    return [df.loc[df['word'] == w, 'freq'].sum() for w in words]


def _plot_theme_comparison(df_main, df_compare, main_name, compare_name, comparison_words):
    """Graph 2: side-by-side bars of each theme word's frequency per character."""
    counts_main = _theme_counts(df_main, comparison_words)
    counts_compare = _theme_counts(df_compare, comparison_words)

    x = np.arange(len(comparison_words))
    width = 0.35

    fig = plt.figure(figsize=(10, 6))
    # Shift each series half a bar width so the pair straddles its tick.
    plt.bar(x - width/2, counts_main, width, label=main_name, color='skyblue')
    plt.bar(x + width/2, counts_compare, width, label=compare_name, color='salmon')

    plt.title(f"Graph 2: Thematic Comparison ({main_name} vs. {compare_name})", fontsize=14)
    plt.xticks(x, comparison_words, fontsize=11)
    plt.legend()
    plt.ylabel("Frequency")
    plt.savefig(f"{RESULTS_DIR}/presentation_graph_2_comparison.png")
    plt.show()
    plt.close(fig)


def _plot_word_cloud(df_main, main_name):
    """Graph 3: word cloud sized by each word's frequency."""
    # Convert dataframe to dict for wordcloud
    freq_dict = dict(zip(df_main['word'], df_main['freq']))

    wc = WordCloud(width=800, height=400, background_color='white', colormap='magma').generate_from_frequencies(freq_dict)

    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Graph 3: Context Cloud for {main_name}", fontsize=14)
    plt.savefig(f"{RESULTS_DIR}/presentation_graph_3_wordcloud.png")
    plt.show()
    plt.close(fig)


def create_presentation_graphs(df_main, df_compare, main_name, compare_name,
                               comparison_words=None):
    """Render, save and display the three presentation graphs.

    Each graph is written as a PNG into ``RESULTS_DIR`` and then shown:
    a frequency-vs-PMI scatter plot and a word cloud for the main character,
    and a grouped bar chart comparing selected words between both characters.

    Args:
        df_main: Stats for the main character with columns ``word``, ``freq``
            and ``pmi``.
        df_compare: Stats for the comparison character with columns ``word``
            and ``freq``. May be empty; its bars are then all zero.
        main_name: Display name of the main character.
        compare_name: Display name of the comparison character.
        comparison_words: Words shown in the bar chart. Defaults to a built-in
            list of thematic words. A word missing from a frame counts as 0.

    Raises:
        ValueError: If ``df_main`` is empty, since neither the scatter plot
            nor the word cloud can be drawn without data.
    """
    if df_main.empty:
        raise ValueError(
            f"No collocate statistics for {main_name!r}; nothing to plot."
        )

    if comparison_words is None:
        comparison_words = _DEFAULT_COMPARISON_WORDS
    comparison_words = list(comparison_words)

    # --- GRAPH 1: SCATTER PLOT (Frequency vs. PMI) ---
    # Topic: "Common vs. Unique Vocabulary"
    print("Generating Graph 1: Scatter Plot...")
    _plot_frequency_vs_pmi(df_main, main_name)

    # --- GRAPH 2: GROUPED BAR CHART ---
    # Topic: "Thematic Contrast between Characters"
    print("Generating Graph 2: Comparison Bar Chart...")
    _plot_theme_comparison(df_main, df_compare, main_name, compare_name, comparison_words)

    # --- GRAPH 3: WORD CLOUD ---
    # Topic: "Visual Summary"
    print("Generating Graph 3: Word Cloud...")
    _plot_word_cloud(df_main, main_name)

## 3. Run Analysis
Execute the analysis and generate the 3 presentation files.

In [None]:
# Load Data
print("Loading and processing text...")
text_path = os.path.join(DATA_DIR, TARGET_BOOK)
raw_text = load_text(text_path)

if not raw_text:
    # load_text yields "" for a missing file, so there is nothing to analyze.
    print("Could not load text. Check data path.")
else:
    full_tokens, processed_sentences = clean_and_tokenize(raw_text)

    # Main character: words near each mention, then frequency/PMI stats.
    print(f"Analyzing {MAIN_CHAR}...")
    col_main = get_collocates(MAIN_CHAR, processed_sentences, window=WINDOW_SIZE)
    df_main = calculate_stats(col_main, full_tokens)

    # Comparison character: same pipeline over the same sentences.
    print(f"Analyzing {COMPARE_CHAR}...")
    col_compare = get_collocates(COMPARE_CHAR, processed_sentences, window=WINDOW_SIZE)
    df_compare = calculate_stats(col_compare, full_tokens)

    # Draw and save all three graphs.
    create_presentation_graphs(df_main, df_compare, MAIN_CHAR, COMPARE_CHAR)

    print("\nDone! 3 presentation graphs saved to 'results' folder.")