In [54]:
import re
from collections import Counter
import pandas as pd

import json

In [55]:
def get_feature_counts(df, text_column, n_char=4, remove_punct_first=True):
    """
    Compute global feature counts from the text in a pandas DataFrame.

    For each document, two types of tokens are generated:
      1. Character n-grams of exactly ``n_char`` characters, taken from the
         lower-cased text with spaces replaced by underscores. Only n-grams
         consisting entirely of ``[a-z0-9]`` (or of the form ``_xx_``, which
         is reduced to ``xx``) are kept, so n-grams that straddle a word
         boundary or contain any other character are discarded.
      2. Word tokens for words with length <= n_char, which are padded with
         underscores on both sides (e.g. "bye" becomes "_bye_").

    Within a single document, an unpadded token is dropped whenever its padded
    form is present as a word token, so only the padded version is retained.
    Note that this check is per document, not per occurrence: an n-gram found
    inside a longer word is also dropped if the same string occurs as a
    standalone short word in that document.

    Rows whose text is missing (None/NaN) are skipped rather than being
    counted as the literal words "none"/"nan".

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing text data.
    text_column : str
        Name of the column with text.
    n_char : int, optional
        Number of characters in the n-grams (default is 4). Must be >= 1.
    remove_punct_first : bool, optional
        If True, remove punctuation from the text (default is True).

    Returns
    -------
    feature_counts : dict
        A dictionary mapping feature tokens to their global counts.

    Raises
    ------
    ValueError
        If ``n_char`` is smaller than 1.
    """
    if n_char < 1:
        raise ValueError(f"n_char must be >= 1, got {n_char}")

    # Compile regex patterns once, outside the per-document loop.
    # token_pattern is applied with fullmatch(): a '$' anchor used with match()
    # also succeeds just before a trailing newline, which let n-grams such as
    # "llo\n" through as features.
    token_pattern = re.compile(r'(?:[a-z0-9]+|_[a-z0-9]+_)')
    word_pattern = re.compile(r'\b[a-z0-9]+\b')
    punct_pattern = re.compile(r'[^\w\s]')

    global_counts = Counter()

    # Process each document; missing values are dropped before casting to str.
    for text in df[text_column].dropna().astype(str):
        if remove_punct_first:
            text = punct_pattern.sub('', text)
        text_lower = text.lower()

        # --- Character n-gram extraction ---
        text_mod = text_lower.replace(" ", "_")
        char_counts = Counter()
        for i in range(len(text_mod) - n_char + 1):
            gram = text_mod[i:i + n_char]
            if not token_pattern.fullmatch(gram):
                continue
            # A matching gram that starts with "_" necessarily has the form
            # _xx_ (the other alternative contains no underscore); keep only
            # the inner characters, which are guaranteed to be non-empty.
            if gram.startswith("_"):
                gram = gram[1:-1]
            char_counts[gram] += 1

        # --- Word token extraction ---
        # Pad words with length <= n_char.
        word_counts = Counter(
            f"_{word}_"
            for word in word_pattern.findall(text_lower)
            if len(word) <= n_char
        )

        # --- Combine counts for the document ---
        # Skip unpadded tokens whose padded version exists in this document.
        # Every char token has at most n_char characters, so no extra length
        # check is needed.
        for token, count in char_counts.items():
            if f"_{token}_" not in word_counts:
                global_counts[token] += count
        global_counts.update(word_counts)

    return dict(global_counts)

In [56]:
def read_jsonl(file_path):
    """
    Load a JSON Lines file into a pandas DataFrame.

    Each non-blank line is parsed as one JSON value. A line holding a
    single-element list is unwrapped to that element; any other value is kept
    as parsed. Blank or whitespace-only lines (e.g. a trailing newline at the
    end of the file) are skipped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.jsonl`` file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue
            parsed_line = json.loads(line)
            # If the line is a single-element list, extract the first element.
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                records.append(parsed_line[0])
            else:
                records.append(parsed_line)

    return pd.DataFrame(records)

def create_temp_doc_id(input_text):
    """
    Derive a lower-case, underscore-separated id from bracketed text.

    The text inside the first ``[...]`` pair of ``input_text`` is taken, every
    non-word character in it is turned into an underscore, runs of two or more
    underscores are collapsed to one, and the result is lower-cased.

    Returns None when ``input_text`` contains no ``[...]`` pair.
    """
    bracketed = re.search(r'\[(.*?)\]', input_text)
    if bracketed is None:
        return None

    # Punctuation and whitespace -> "_", then squeeze repeated underscores.
    underscored = re.sub(r'[^\w]', '_', bracketed.group(1))
    return re.sub(r'_{2,}', '_', underscored).lower()

In [57]:
# Which dataset folder and corpus to load.
data_type = "training"
corpus = "Enron"

# Root directory of the author-verification datasets. This is a
# machine-specific absolute path: change it here, in one place, when running
# the notebook somewhere else.
DATA_ROOT = "/Volumes/BCross/datasets/author_verification"

# Full path of the raw JSONL file for the selected data type and corpus.
raw_data_loc = f"{DATA_ROOT}/{data_type}/{corpus}/known_raw.jsonl"

In [66]:
# Echo the resolved input path as a sanity check before reading the file.
print(raw_data_loc)

/Volumes/BCross/datasets/author_verification/training/Enron/known_raw.jsonl


In [58]:
# Parse the JSONL file at raw_data_loc into a DataFrame (one row per line).
df = read_jsonl(raw_data_loc)

In [59]:
# Preview the first five rows to check the columns that were loaded.
df.head(5)

Unnamed: 0,doc_id,text,corpus,author,texttype
0,known [Andy.zipper - Mail_1].txt,And I guess we simply weren't prepared for thi...,Enron,Andy.zipper,known
1,known [Andy.zipper - Mail_3].txt,Does that mean yes to tax increases as long as...,Enron,Andy.zipper,known
2,known [Andy.zipper - Mail_4].txt,Go ahead and get set up and coordinate documen...,Enron,Andy.zipper,known
3,known [Andy.zipper - Mail_5].txt,In addition he will be pursuing the channel pa...,Enron,Andy.zipper,known
4,known [Barry.tycholiz - Mail_1].txt,the fact it may be the only thing I get out of...,Enron,Barry.tycholiz,known


In [70]:
# Count character 4-gram and padded short-word features over every row of the
# 'text' column, with punctuation removed first.
feature_counts = get_feature_counts(df, 'text', n_char=4, remove_punct_first=True)

In [72]:
# Display the feature -> count dictionary.
# NOTE(review): this renders every entry of the dict as cell output; consider
# showing only a small slice (e.g. the most frequent features) instead.
feature_counts

{'gues': 20,
 'uess': 16,
 'simp': 10,
 'impl': 16,
 'mply': 7,
 'eren': 80,
 'rent': 118,
 'prep': 25,
 'repa': 30,
 'epar': 54,
 'pare': 43,
 'ared': 32,
 'leve': 24,
 'evel': 43,
 'hyst': 1,
 'yste': 52,
 'ster': 91,
 'teri': 22,
 'eria': 15,
 'befo': 73,
 'efor': 82,
 'fore': 96,
 'mili': 10,
 'ilit': 99,
 'lita': 5,
 'itar': 14,
 'tary': 9,
 'poli': 18,
 'olic': 24,
 'lice': 6,
 'rest': 84,
 'esto': 19,
 'stor': 53,
 'tore': 15,
 'ored': 7,
 'orde': 45,
 'rder': 43,
 'thou': 94,
 'hous': 131,
 'ousa': 1,
 'usan': 10,
 'sand': 13,
 'ands': 26,
 'fran': 13,
 'rant': 57,
 'anti': 36,
 'ntic': 11,
 'tica': 27,
 'ical': 87,
 'ally': 208,
 'spec': 90,
 'pecu': 2,
 'ecul': 2,
 'cula': 28,
 'ulat': 60,
 'lati': 78,
 'atin': 67,
 'ting': 417,
 'yout': 10,
 'outh': 15,
 'uths': 1,
 'drov': 2,
 'rove': 33,
 'nikk': 1,
 'ikke': 1,
 'kkei': 1,
 'aver': 14,
 'vera': 53,
 'erag': 17,
 'rage': 39,
 '1600': 1,
 '6000': 2,
 'erro': 9,
 'rrol': 4,
 '4000': 4,
 '000d': 1,
 'apro': 2,
 'proc': 63,
 'r

In [75]:
# Keep only the features whose token contains the substring "the"
# (this includes padded word tokens such as "_the_").
filtered_feature_counts = dict(
    filter(lambda item: "the" in item[0], feature_counts.items())
)

print(filtered_feature_counts)


{'othe': 204, 'ther': 591, 'thei': 140, 'dthe': 1, 'theh': 2, '_the_': 3810, '_them_': 136, '_they_': 210, 'ethe': 70, 'thes': 122, '_then_': 65, 'athe': 56, 'rthe': 41, 'ithe': 23, 'nthe': 5, 'thet': 1, 'they': 2, 'tthe': 1, 'thea': 1, 'thel': 6, 'uthe': 8, 'thew': 1, 'thed': 7, 'them': 1, 'theo': 1}


In [52]:
# Length of the first row of test['dfm'] after converting it with .toarray().
# NOTE(review): `test` is not defined in any cell visible in this notebook, and
# this cell's execution count (52) precedes the import cell (54) -- confirm it
# still runs after Restart Kernel -> Run All.
len(test['dfm'].toarray()[0])

1306

In [53]:
# Number of entries in test['feature_names'].
# NOTE(review): presumably this should equal the row length of test['dfm'];
# `test` is not defined in any cell visible in this notebook -- confirm where
# it comes from.
len(test['feature_names'])

1306