In [None]:
# === Environment Setup ===
import os, sys, math, time, random, json, textwrap, warnings, re
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
try:
    import graphviz
    GRAPHVIZ_AVAILABLE = True
except ImportError:
    GRAPHVIZ_AVAILABLE = False
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from IPython.display import display, Markdown

try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    def download_nltk_data():
        """Download every NLTK resource this notebook needs that is not installed yet.

        Each resource is looked up under its nltk_data sub-folder first and is only
        downloaded when that lookup raises ``LookupError``. Returns ``None``.
        """
        # Resource id -> nltk_data sub-folder it is looked up under.
        # 'punkt_tab' is listed next to 'punkt' because recent NLTK releases load
        # the tokenizer tables from 'punkt_tab' instead of the pickled 'punkt' model.
        resources = {
            'punkt': 'tokenizers',
            'punkt_tab': 'tokenizers',
            'stopwords': 'corpora',
            'wordnet': 'corpora',
            'omw-1.4': 'corpora',
        }
        for resource, category in resources.items():
            try:
                nltk.data.find(f'{category}/{resource}')
            except LookupError:
                nltk.download(resource, quiet=True)

    download_nltk_data()

    # nltk.download() reports failure through its return value rather than by
    # raising, so exercise the three entry points the notebook relies on. A
    # missing resource surfaces here as LookupError and is handled below,
    # instead of crashing a later cell that trusted NLTK_AVAILABLE.
    word_tokenize("smoke test")
    stopwords.words('english')
    WordNetLemmatizer().lemmatize("rates")

    NLTK_AVAILABLE = True
except (ImportError, LookupError):
    # NLTK is not installed, or its data could not be located/downloaded.
    NLTK_AVAILABLE = False

try:
    import gensim
    import gensim.downloader as api
    GENSIM_AVAILABLE = True
except ImportError: GENSIM_AVAILABLE = False

# --- Configuration ---
# The seaborn style sheets bundled with Matplotlib carry a 'seaborn-v0_8-' prefix
# from Matplotlib 3.6 onwards and the bare 'seaborn-' prefix before that. Use
# whichever spelling this installation provides; if neither exists the default
# style is kept rather than aborting the setup cell.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rcParams.update({'font.size': 14, 'figure.figsize': (12, 8), 'figure.dpi': 150})
np.set_printoptions(suppress=True, linewidth=120, precision=4)
warnings.filterwarnings('ignore', category=FutureWarning)

# --- Utility Functions ---
def note(msg):
    """Display ``msg`` as a Markdown object wrapped in an ``alert-info`` div.

    ``msg`` is interpolated into the markup as-is (no escaping), so callers may
    pass Markdown or HTML fragments. Returns ``None``.
    """
    callout = f"<div class='alert alert-block alert-info'>📝 **Note:** {msg}</div>"
    display(Markdown(callout))
def sec(title):
    """Print a section banner: a blank line, then the upper-cased title between two 80-character '=' rules."""
    rule = '=' * 80
    banner_lines = ["", rule, f"| {title.upper()} |", rule]
    print("\n".join(banner_lines))

# Last statement of the setup cell: shows a callout once everything above has run.
note("Environment initialized for Advanced Natural Language Processing.")

# Part 7: Advanced Deep Learning & Frontier Topics
## Chapter 7.10: Natural Language Processing for Economics

### Introduction: Text as Data

This chapter provides a PhD-level introduction to **Natural Language Processing (NLP)**. A vast amount of economic information is stored as unstructured text—news articles, central bank statements, corporate 10-K filings, political speeches, and social media posts. NLP provides a powerful suite of tools for converting this text into quantitative data suitable for rigorous economic analysis.

This chapter covers the entire pipeline, from classical methods to the state-of-the-art:
1.  **Text Preprocessing:** The essential first step of cleaning and standardizing text.
2.  **Vector Space Models:** Representing documents as numerical vectors using TF-IDF.
3.  **Topic Modeling:** Unsupervised discovery of latent themes using Latent Dirichlet Allocation (LDA).
4.  **Word Embeddings:** Learning dense, semantic vector representations of words with Word2Vec.
5.  **Contextualized Embeddings:** A brief introduction to the Transformer architecture and its role in modern NLP.

## 1. The NLP Pipeline: From Raw Text to Numerical Data

### 1.1 Preprocessing
Before any analysis, raw text must be cleaned and standardized. The goal is to reduce the vocabulary to only the most meaningful terms.

1.  **Normalization:** Convert text to a consistent case (e.g., lowercase) and remove punctuation, numbers, and special characters.
2.  **Tokenization:** Split the text into individual words or "tokens".
3.  **Stop-Word Removal:** Remove extremely common words (e.g., "the", "a", "is") that carry little semantic weight.
4.  **Lemmatization:** Reduce words to their dictionary root form, or "lemma" (e.g., "rates", "rating", "rated" all become "rate" when each is lemmatized with its correct part of speech). This is more linguistically informed than **stemming**, which simply strips suffixes by rule.

In [None]:
sec("Text Preprocessing Demonstration")

if NLTK_AVAILABLE:
    raw_text = "The U.S. economy showed robust growth in the 3rd quarter, with GDP increasing by 4.9%."
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        """Normalize, tokenize, filter and lemmatize ``text``.

        Steps: lowercase; drop every character that is neither an ASCII letter
        nor whitespace; tokenize with ``word_tokenize``; discard stop words and
        tokens of two characters or fewer; lemmatize the rest (``lemmatize`` is
        called without a part-of-speech argument).

        Returns the surviving lemmas as a list of strings.
        """
        # In a raw string a single backslash is required: r'\s' is the whitespace
        # class, whereas r'\\s' would match a literal backslash and delete every
        # space, gluing the whole sentence into one token.
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        tokens = word_tokenize(text)
        return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]

    # The message is placed inside an HTML <div> by note(), so <br> is used for
    # the line break between the two parts.
    note(f"**Original:** {raw_text}<br>**Processed:** {preprocess_text(raw_text)}")
else:
    note("NLTK not available. Skipping preprocessing demonstration.")

### 1.2 Vectorization: Term Frequency-Inverse Document Frequency (TF-IDF)

**Intellectual Provenance:** The concept of TF-IDF was developed in the early 1970s by Karen Spärck Jones, a British computer scientist. Her work laid the foundation for modern information retrieval and search engines. The core idea was to find a way to automatically determine which terms in a document are most representative of its content, allowing for more relevant search results.

**TF-IDF** represents each document as a vector where each entry reflects the importance of a word in that document relative to the entire corpus.
$$ \text{tfidf}(t, d) = \underbrace{\text{tf}(t, d)}_{\text{Term Frequency}} \times \underbrace{\text{idf}(t)}_{\text{Inverse Document Frequency}} $$
*   **Term Frequency (TF):** How often a term appears in a document.
*   **Inverse Document Frequency (IDF):** A measure of a term's informativeness. A word that appears in many documents (like "economy" in financial news) gets a low IDF score, while a rare word gets a high score. The product gives higher weight to words that are frequent in a specific document but rare across the corpus.

## 2. Unsupervised Methods: Topic Modeling

**Intellectual Provenance:** Latent Dirichlet Allocation (LDA) was introduced in a seminal 2003 paper by David Blei, Andrew Ng, and Michael I. Jordan. It revolutionized the field of topic modeling by providing a probabilistic framework for discovering the thematic structure of large text corpora. It generalizes earlier models such as probabilistic latent semantic analysis (pLSA) and has become a standard tool in the digital humanities, social sciences, and economics for analyzing large collections of documents.

**Latent Dirichlet Allocation (LDA)** is an unsupervised generative statistical model that treats documents as mixtures of topics, and topics as distributions over words. The model uses the observed documents to infer the latent topic structure that most likely generated them. The number of topics, $k$, is a hyperparameter that must be chosen by the researcher, often by finding the $k$ that maximizes a **coherence score**.

In [None]:
# NOTE(review): this cell repeats the preprocessing demonstration from section 1.1
# even though it sits under the topic-modeling section. The scikit-learn
# vectorizer/LDA imports in the setup cell are not used by any visible cell —
# presumably an LDA example was intended here; confirm with the author.
sec("Text Preprocessing Demonstration")

if NLTK_AVAILABLE:
    raw_text = "The U.S. economy showed robust growth in the 3rd quarter, with GDP increasing by 4.9%."
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        """Normalize, tokenize, filter and lemmatize ``text``.

        Steps: lowercase; drop every character that is neither an ASCII letter
        nor whitespace; tokenize with ``word_tokenize``; discard stop words and
        tokens of two characters or fewer; lemmatize the rest (``lemmatize`` is
        called without a part-of-speech argument).

        Returns the surviving lemmas as a list of strings.
        """
        # In a raw string a single backslash is required: r'\s' is the whitespace
        # class, whereas r'\\s' would match a literal backslash and delete every
        # space, gluing the whole sentence into one token.
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        tokens = word_tokenize(text)
        return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]

    # The message is placed inside an HTML <div> by note(), so <br> is used for
    # the line break between the two parts.
    note(f"**Original:** {raw_text}<br>**Processed:** {preprocess_text(raw_text)}")
else:
    note("NLTK not available. Skipping preprocessing demonstration.")

## 3. Modern NLP: Word Embeddings

While TF-IDF is powerful, it treats words as independent entities, missing semantic relationships. Modern NLP uses **word embeddings** to represent words as dense, low-dimensional vectors in a way that captures these relationships.

The core idea of algorithms like **Word2Vec** and **GloVe** is based on the **distributional hypothesis**: words that frequently appear in similar linguistic contexts tend to have similar meanings. These algorithms learn a vector for each word such that the geometric relationships between vectors mirror the semantic relationships between words.

### 3.1 Word2Vec Architectures: CBOW and Skip-gram

Word2Vec learns embeddings by training a shallow neural network on a "fake" prediction task. 
1.  **Continuous Bag-of-Words (CBOW):** Predicts a target word from its context words.
2.  **Skip-gram:** Predicts context words from a single target word. Skip-gram is typically slower but performs better for infrequent words.

![CBOW Architecture](../images/07-Machine-Learning/cbow_architecture.png)

![Skip-Gram Architecture](../images/07-Machine-Learning/skip_gram_architecture.png)

In [None]:
sec("Training and Visualizing Word Embeddings")

# Gate on gensim only: the vectors are fetched through gensim.downloader and
# nothing in this cell uses NLTK.
if GENSIM_AVAILABLE:
    note("Loading pre-trained GloVe embeddings... (This may take a moment on first run)")
    try:
        # 1. Load a pre-trained model.
        glove_vectors = api.load('glove-wiki-gigaword-100')
    except Exception as e:
        # Only the load is guarded, so this message is shown for load failures
        # alone and never mislabels an error raised by the queries below.
        note(f"Could not download pre-trained model. Error: {e}")
    else:
        note("GloVe model loaded successfully.")

        # 2. Explore Semantic Similarities
        note("Words most similar to 'finance':")
        display(pd.DataFrame(glove_vectors.most_similar('finance'), columns=['Word', 'Similarity']))

        # 3. Perform Semantic Arithmetic
        note("Semantic Arithmetic: king - man + woman = ?")
        display(pd.DataFrame(glove_vectors.most_similar(positive=['king', 'woman'], negative=['man']), columns=['Word', 'Similarity']))
else:
    note("Gensim not available. Skipping word-embedding example.")

## 4. The Frontier: Transformers and Contextualized Embeddings

Word embeddings like Word2Vec are **static**: the vector for "bank" is the same regardless of whether it appears in "river bank" or "central bank". The state-of-the-art in NLP is dominated by **contextualized embeddings** generated by models with a **Transformer architecture**, such as BERT, RoBERTa, and the GPT family.

These **Large Language Models (LLMs)** are pre-trained on a massive corpus of text using self-supervised objectives. The key innovation is the **self-attention mechanism**, which allows the model to "look" at all other words in the sentence and assign a weight to each one, representing its importance for understanding the target word *in its specific context*. This allows the model to generate a different vector for "bank" depending on the surrounding words.

These pre-trained models can then be **fine-tuned** for specific downstream tasks like sentiment analysis, document classification, or question answering, achieving state-of-the-art performance with relatively little task-specific data.