In [16]:
# Import required libraries
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import re
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
import json
import urllib.parse
import webbrowser
from email.parser import Parser
import string
import matplotlib.pyplot as plt
import tempfile
import os
import requests

# Download the NLTK resources used below: the Punkt tokenizer data (for
# sent_tokenize / word_tokenize) and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource
# rather than the legacy pickled 'punkt' models; fetch it too so the
# tokenizers also work on a fresh, up-to-date install.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/itschris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/itschris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
def preprocessing(text):
    """Strip markup and email headers from ``text`` and tokenize the body.

    The input is treated as one raw email message: HTML-style tags are
    removed, the message is parsed so that only the body (payload) is
    kept, and the body is split into sentences and lowercase word tokens.

    Parameters
    ----------
    text : str
        Raw message text (headers followed by a body).

    Returns
    -------
    tuple
        ``(filtered_words, sentences, joined_words)`` where
        ``filtered_words`` is the list of lowercase alphanumeric tokens
        that are not English stop words, ``sentences`` is the list of
        sentences found in the body, and ``joined_words`` is
        ``filtered_words`` joined with single spaces.
    """
    # Remove HTML tags: anything between '<' and '>' on a single line.
    text = re.sub(r'<.*?>', '', text)

    # Parse as an email so the header block can be discarded.
    message = Parser().parsestr(text)
    if message.is_multipart():
        # For multipart mail get_payload() returns a list of sub-messages,
        # not a string; gather the text of the leaf parts instead.
        leaf_payloads = (
            part.get_payload()
            for part in message.walk()
            if not part.is_multipart()
        )
        email_body = '\n'.join(p for p in leaf_payloads if isinstance(p, str))
    else:
        email_body = message.get_payload()

    # Split into sentences (original casing) and words (lowercased).
    sen_tokens = sent_tokenize(email_body)
    wrd_tokens = word_tokenize(email_body.lower())

    # Keep alphanumeric, non-stop-word tokens. isalnum() already rejects
    # pure-punctuation tokens, so no separate punctuation test is needed.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    filtered_wrds_token = [
        word for word in wrd_tokens
        if word.isalnum() and word not in stop_words
    ]

    return filtered_wrds_token, sen_tokens, ' '.join(filtered_wrds_token)

In [3]:
def calculate_word_trends(sentences, words, n_segments=10):
    """Count how often each word occurs in consecutive document segments.

    The sentence list is divided into ``n_segments`` contiguous,
    near-equal slices (fewer when there are fewer sentences than
    segments). Each slice is joined, lowercased and tokenized, and the
    occurrences of every requested word are recorded.

    Parameters
    ----------
    sentences : list of str
        Sentences in document order.
    words : iterable of str
        Words to track. They are looked up as given against lowercased
        tokens, so they should already be lowercase. Duplicates are
        tracked once.
    n_segments : int, optional
        Maximum number of segments to split the document into.

    Returns
    -------
    dict
        Maps each word to a list with one count per segment. The lists
        are empty when ``sentences`` is empty.

    Raises
    ------
    ValueError
        If ``n_segments`` is smaller than 1.
    """
    if n_segments < 1:
        raise ValueError("n_segments must be at least 1")

    trends = {word: [] for word in words}
    total = len(sentences)
    if total == 0:
        return trends

    # Proportional boundaries give exactly `segments` slices whose sizes
    # differ by at most one sentence; a fixed floor-divided chunk size would
    # produce extra segments whenever `total` is not a multiple of it.
    segments = min(n_segments, total)
    for index in range(segments):
        start = index * total // segments
        end = (index + 1) * total // segments
        chunk = ' '.join(sentences[start:end])
        chunk_freq = Counter(word_tokenize(chunk.lower()))

        # Iterate the dict keys (not `words`) so a duplicated word does not
        # append twice to the same list.
        for word in trends:
            trends[word].append(chunk_freq[word])

    return trends

In [4]:
# Find the most common word pairs (collocations) that occur within a
# sliding window of `window_size` tokens.

def get_collocations(tokens, window_size=5, top_n=50):
    """Return the most frequent ordered token pairs within a window.

    Two tokens form a pair when the second follows the first by fewer
    than ``window_size`` positions. Each such pair of positions is counted
    exactly once, including pairs near the end of ``tokens`` and inputs
    shorter than the window.

    Parameters
    ----------
    tokens : list of str
        Tokens in document order.
    window_size : int, optional
        Width of the co-occurrence window, in tokens. Values below 2
        cannot contain a pair and yield an empty result.
    top_n : int, optional
        Number of pairs to return.

    Returns
    -------
    list of tuple
        ``((first, second), count)`` entries, most frequent first.
    """
    if window_size < 2:
        return []

    pair_counts = Counter()
    for position, first in enumerate(tokens):
        # Partners are the next (window_size - 1) tokens; slicing clips
        # safely at the end of the list.
        for second in tokens[position + 1:position + window_size]:
            pair_counts[(first, second)] += 1
    return pair_counts.most_common(top_n)

In [5]:
def analyze_text(text):
    """Analyze ``text`` and display frequency, word-cloud and trend plots.

    The text is run through ``preprocessing``; the resulting tokens feed a
    bar chart of the 20 most frequent words, a word cloud, and a line
    chart of how the 10 most frequent words are distributed across the
    document.

    Parameters
    ----------
    text : str
        Raw text to analyze.

    Returns
    -------
    dict
        ``'word_frequencies'`` (the 100 most frequent words mapped to
        their counts), ``'total_words'`` and ``'unique_words'``.
    """
    # preprocessing() returns three values (words, sentences, joined
    # words); only the first two are needed here.
    tokens, sentences, _ = preprocessing(text)

    # Word frequency
    word_freq = Counter(tokens)
    top_words = dict(word_freq.most_common(100))
    summary = {
        'word_frequencies': top_words,
        'total_words': len(tokens),
        'unique_words': len(set(tokens))
    }

    # Nothing to draw when no tokens survive preprocessing; skip the plots.
    if not tokens:
        return summary

    # Bar chart of the 20 most frequent words (top_words is ordered by count).
    ranked_words = list(top_words.keys())
    fig_freq = px.bar(x=ranked_words[:20],
                      y=list(top_words.values())[:20],
                      title='Top 20 Word Frequencies')
    fig_freq.show()

    # Word cloud
    wordcloud = WordCloud(width=800, height=400).generate(' '.join(tokens))
    plt.figure(figsize=(10,5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    # Trend analysis for the 10 most frequent words
    trends = calculate_word_trends(sentences, ranked_words[:10])
    fig_trends = go.Figure()
    for word, counts in trends.items():
        fig_trends.add_trace(go.Scatter(y=counts, name=word, mode='lines+markers'))
    fig_trends.update_layout(title='Word Frequency Trends')
    fig_trends.show()

    return summary

In [6]:
def send_to_voyant(text):
    """Open ``text`` in a Voyant instance running on localhost:8888.

    The text is percent-encoded and passed as the ``input`` query
    parameter of the URL handed to the default web browser.

    Returns the fixed status string "Opened in Voyant Tools".
    """
    query_value = urllib.parse.quote(text)
    webbrowser.open('http://localhost:8888/?input=' + query_value)
    return "Opened in Voyant Tools"

In [12]:
def send_to_voyant_large_text(text, return_url=False):
    """Upload ``text`` to the public Voyant Tools server as a file.

    The text is written to a temporary UTF-8 ``.txt`` file, posted as a
    multipart form upload, and the temporary file is removed afterwards
    even if the request fails.

    Parameters
    ----------
    text : str
        Text to upload.
    return_url : bool, optional
        When true, return the corpus URL instead of opening it in the
        default web browser.

    Returns
    -------
    str
        The corpus URL (``return_url=True``), "Opened in Voyant Tools"
        after opening the browser, or ``"Error: <status>"`` when the
        server responds with a status other than 200.
    """
    # Use the public Voyant Tools server
    base_url = "https://voyant-tools.org/upload"

    # Write the text to a temporary file. The encoding is fixed to UTF-8 so
    # the uploaded bytes do not depend on the platform's default encoding.
    temp_file = tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', delete=False, encoding='utf-8'
    )
    temp_filename = temp_file.name
    try:
        with temp_file:
            temp_file.write(text)

        # Keep the upload handle in a `with` block so it is closed before
        # the file is deleted (required on Windows, tidy everywhere).
        with open(temp_filename, 'rb') as upload:
            files = {
                'upload': (os.path.basename(temp_filename), upload, 'text/plain')
            }
            response = requests.post(base_url, files=files)
    finally:
        # Clean up the temporary file whether or not the upload succeeded.
        os.unlink(temp_filename)

    if response.status_code != 200:
        return f"Error: {response.status_code}"

    # The response body is used as the corpus id.
    # NOTE(review): assumes the upload endpoint replies with a bare corpus
    # id -- confirm against the Voyant server behaviour.
    corpus_url = f"https://voyant-tools.org/?corpus={response.text.strip()}"

    if return_url:
        return corpus_url
    webbrowser.open(corpus_url)
    return "Opened in Voyant Tools"

In [20]:
def send_to_official_voyant(text, return_url=False):
    """Post ``text`` to the public Voyant Tools site as form data.

    On an HTTP 200 reply the final response URL is either returned
    (``return_url=True``) or opened in the default browser, in which case
    "Opened in Voyant Tools" is returned. Any other status, or any
    exception raised along the way, is reported as an ``"Error: ..."``
    string rather than raised.
    """
    endpoint = "https://voyant-tools.org/"
    form_fields = {'inputFormat': 'text', 'input': text}

    try:
        reply = requests.post(endpoint, data=form_fields)

        # Bail out early on anything but a plain success.
        if reply.status_code != 200:
            return f"Error: Server returned status code {reply.status_code}"

        # The URL of the final response is used as the corpus link.
        landing_url = reply.url
        if return_url:
            return landing_url

        webbrowser.open(landing_url)
        return "Opened in Voyant Tools"
    except Exception as e:
        return f"Error: {str(e)}"

In [None]:
# Enron email dataset (path is relative to the notebook's working directory)
file_path = '../data/inputs/enron/enron_subset.csv'

# Load the corpus into a single string `text`.
# Plain-text inputs are read as-is; CSV inputs contribute every non-null
# value of their 'message' column, joined with single spaces.
# NOTE(review): preprocessing() parses its input as ONE email message, so
# after this concatenation only the leading header block is treated as
# headers -- confirm that is intended for multi-message CSVs.
if not file_path.endswith('.csv'):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
else:
    df = pd.read_csv(file_path)
    text = ' '.join(
        df['message'].dropna().tolist()
    )

In [23]:
# Option 1: Analyze locally
# results = analyze_text(text)
# print(f"Total words: {results['total_words']}")
# print(f"Unique words: {results['unique_words']}")

# Option 2: Send to Voyant
# preprocessing() returns (filtered words, sentences, filtered words joined
# into one string).
words, sentence, email_body = preprocessing(text)

# Request the URL back: with the default return_url=False the function
# returns the status string "Opened in Voyant Tools", so `url` would not
# hold a link. Note that the raw `text` (not the cleaned `email_body`) is
# what gets uploaded.
url = send_to_official_voyant(text, return_url=True)

In [22]:
# Display the value returned by send_to_official_voyant (assigned to `url` in the cell above).
url

'https://voyant-tools.org/?corpus=f7087499a373975c14fc89c568b3dbe0'