In [None]:
# Import required libraries
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import re
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
import json
import urllib.parse
import webbrowser

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases load for
# sent_tokenize/word_tokenize; 'punkt' is kept so older releases still work.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
def process_text(text):
    """Strip HTML-like tags from ``text`` and tokenize it.

    Args:
        text: Raw input string; may contain ``<...>`` markup.

    Returns:
        A ``(tokens, sentences)`` tuple where ``tokens`` is the list of
        lower-cased alphanumeric word tokens with English stop words removed,
        and ``sentences`` is the list of sentences (original casing) produced
        by ``sent_tokenize`` on the tag-stripped text.
    """
    # Replace each tag with a space instead of deleting it: deleting fuses
    # words that were separated only by markup ("cat<br>sat" -> "catsat").
    text = re.sub(r'<.*?>', ' ', text)

    # Sentences are kept (un-lowercased) for the trend analysis
    sentences = sent_tokenize(text)

    # Keep purely alphanumeric tokens that are not English stop words;
    # this also drops punctuation tokens.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

    return tokens, sentences

In [None]:
def calculate_word_trends(sentences, words, num_segments=10):
    """Count how often each word occurs in consecutive segments of a document.

    The sentence list is cut into ``min(num_segments, len(sentences))``
    contiguous, near-equal segments, so every segment holds at least one
    sentence and there is no undersized remainder segment at the end.

    Args:
        sentences: Ordered list of sentence strings.
        words: Words to track. Segments are lower-cased before counting, so
            words should be given in lower case to match.
        num_segments: Maximum number of segments to split the document into.

    Returns:
        Dict mapping each word to a list with one count per segment, in
        document order. The lists are empty when ``sentences`` is empty.

    Raises:
        ValueError: If ``num_segments`` is smaller than 1.
    """
    if num_segments < 1:
        raise ValueError("num_segments must be at least 1")

    trends = {word: [] for word in words}
    total = len(sentences)
    if total == 0:
        return trends

    segments = min(num_segments, total)
    for index in range(segments):
        # Proportional integer boundaries give exactly `segments` chunks
        # whose sizes differ by at most one sentence.
        start = index * total // segments
        stop = (index + 1) * total // segments
        chunk = ' '.join(sentences[start:stop])
        chunk_freq = Counter(word_tokenize(chunk.lower()))

        # Iterate the dict keys so a word repeated in `words` is recorded
        # once per segment rather than once per repetition.
        for word in trends:
            trends[word].append(chunk_freq[word])

    return trends

In [None]:
def get_collocations(tokens, window_size=5, top_n=50):
    """Find the most frequent ordered word pairs that co-occur in a window.

    A window of ``window_size`` tokens slides over ``tokens`` one position at
    a time; within each window every pair ``(earlier, later)`` is counted.
    A pair that falls inside several overlapping windows is counted once per
    window, so closer pairs accumulate higher counts.

    Args:
        tokens: Sequence of tokens (must support slicing).
        window_size: Number of tokens per window. Values below 2 cannot form
            a pair and yield an empty result.
        top_n: Number of most common pairs to return.

    Returns:
        List of ``((word_a, word_b), count)`` tuples, most frequent first.
    """
    if window_size < 2:
        return []

    # "+ 1" includes the final full window, which a plain
    # len(tokens) - window_size bound would skip; max(1, ...) lets input
    # shorter than the window still be treated as a single window.
    num_windows = max(1, len(tokens) - window_size + 1)

    pair_counts = Counter()
    for start in range(num_windows):
        window = tokens[start:start + window_size]
        # Stream pairs into the counter instead of building one large list
        pair_counts.update(
            (window[j], window[k])
            for j in range(len(window))
            for k in range(j + 1, len(window))
        )
    return pair_counts.most_common(top_n)

In [None]:
def analyze_text(text):
    """Analyze text, display visualizations and return summary statistics.

    Shows three figures: a bar chart of the 20 most frequent words, a word
    cloud, and per-segment frequency trends for the 10 most frequent words.
    When the text yields no tokens, no figures are shown.

    Args:
        text: Raw input string (HTML-like tags are stripped by
            ``process_text``).

    Returns:
        Dict with keys ``'word_frequencies'`` (the 100 most common words
        mapped to their counts), ``'total_words'`` and ``'unique_words'``.
    """
    # Process text
    tokens, sentences = process_text(text)

    # Word frequency
    word_freq = Counter(tokens)
    top_words = dict(word_freq.most_common(100))
    summary = {
        'word_frequencies': top_words,
        'total_words': len(tokens),
        'unique_words': len(word_freq),
    }

    if not tokens:
        # Nothing to plot; the word cloud in particular cannot be built from
        # empty input, so return the (all-zero) statistics instead of failing.
        return summary

    # dict preserves the most_common() ordering, so this is ranked by count
    ranked_words = list(top_words)

    # Bar chart of the most frequent words
    bar_words = ranked_words[:20]
    fig_freq = px.bar(x=bar_words,
                      y=[top_words[word] for word in bar_words],
                      title='Top 20 Word Frequencies')
    fig_freq.show()

    # Word cloud. Rendered through plotly express: this function previously
    # referenced `plt`, which is never imported in the notebook, and raised
    # NameError at this point.
    wordcloud = WordCloud(width=800, height=400).generate(' '.join(tokens))
    fig_cloud = px.imshow(wordcloud.to_array(), title='Word Cloud')
    fig_cloud.update_xaxes(visible=False)
    fig_cloud.update_yaxes(visible=False)
    fig_cloud.show()

    # Trend analysis
    trends = calculate_word_trends(sentences, ranked_words[:10])
    fig_trends = go.Figure()
    for word, counts in trends.items():
        fig_trends.add_trace(go.Scatter(y=counts, name=word, mode='lines+markers'))
    fig_trends.update_layout(title='Word Frequency Trends')
    fig_trends.show()

    return summary

In [None]:
def send_to_voyant(text, base_url='http://localhost:8888/'):
    """Open a Voyant Tools server in the browser with ``text`` as its input.

    The text is percent-encoded and passed in the ``input`` query parameter.

    Args:
        text: Text to hand to Voyant.
        base_url: URL of the Voyant server; defaults to a local instance on
            port 8888.

    Returns:
        ``"Opened in Voyant Tools"`` when the browser reported that the URL
        was opened, otherwise a message saying it could not be opened.
    """
    # NOTE(review): the whole text travels inside the URL, so a large corpus
    # may exceed what the browser or server accepts - confirm for big inputs.
    encoded_text = urllib.parse.quote(text)
    voyant_url = f'{base_url}?input={encoded_text}'

    # webbrowser.open reports failure through its return value rather than an
    # exception, so success must not be assumed.
    if not webbrowser.open(voyant_url):
        return "Could not open Voyant Tools in a browser"
    return "Opened in Voyant Tools"

In [None]:
# Example usage
file_path = '../data/inputs/enron/enron_subset.csv'

# Load the corpus as one string: CSV input is flattened from its 'message'
# column (missing values skipped); any other file is read as UTF-8 text
if not file_path.endswith('.csv'):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
else:
    df = pd.read_csv(file_path)
    messages = df['message'].dropna().tolist()
    text = ' '.join(messages)

# Option 1: run the local analysis and report the headline counts
results = analyze_text(text)
print(f"Total words: {results['total_words']}")
print(f"Unique words: {results['unique_words']}")

# Option 2: hand the same text to Voyant
send_to_voyant(text)