In [1]:
import pandas as pd
import re
from typing import Union

def count_prefix_occurrences(text: str, prefix: str, /, *, 
                             ignore_case: bool = True) -> int:
    """
    Count the number of words in *text* that begin with *prefix*.

    Parameters
    ----------
    text : str
        The input string to search.
    prefix : str
        The prefix to look for (e.g., "pre", "un", "anti").
    ignore_case : bool, optional
        If True (default) the match is case-insensitive.

    Returns
    -------
    int
        Number of word tokens that start with *prefix*.
    """
    # \b  → start of a word boundary
    # prefix (escaped)  → literal prefix, safely escaped for regex specials
    # \w* → any remaining word characters (zero or more)
    pattern = rf"\b{re.escape(prefix)}\w*"

    flags: Union[int, re.RegexFlag] = re.IGNORECASE if ignore_case else 0
    matches = re.findall(pattern, text, flags=flags)
    return len(matches)

def count_words_containing_term(df, column, term):
    """
    Count total words and average words per article for articles containing a specific term.
    
    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the articles
    column : str
        Name of column containing text to analyze
    term : str
        Term to search for in articles
        
    Returns
    -------
    tuple
        (total_words, avg_words_per_article)
    """
    word_counts = []
    for _, row in df.iterrows():
        text = str(row[column])
        if term in text.lower():
            count = len(text.split(' '))
            word_counts.append(count)

    total_words = sum(word_counts)
    avg_words = total_words/len(word_counts) if word_counts else 0
    
    return total_words

def count_word_frequencies(words, df, response_columns):
    """
    Count frequencies of words across response columns in a dataframe.
    
    Parameters
    ----------
    words : list
        List of words to count, can include wildcards (*) and alternatives (/)
    df : pandas.DataFrame
        DataFrame containing the response columns
    response_columns : list
        Names of response columns to analyze
        
    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Two DataFrames:
        1. Word frequency counts for each response column
        2. Number of articles containing each word for each response column
    """
    results_df = pd.DataFrame(index=words, columns=response_columns)
    article_counts_df = pd.DataFrame(index=words, columns=response_columns)
    
    for word in words:
        total_counts = {col: 0 for col in response_columns}
        article_counts = {col: 0 for col in response_columns}
        distinct_article_counts = {col: 0 for col in response_columns}
        word_totals = {col: 0 for col in response_columns}
        total_articles = {col: len(df[col].dropna()) for col in response_columns}
        
        # Cache to track unique articles for each source
        visited_articles = {col: set() for col in response_columns}
        
        for col in response_columns:
            # Get total word count for articles containing this word
            word_totals[col] = count_words_containing_term(df, col, word.lower().replace('*','').split('/')[0])
            
            for idx, text in enumerate(df[col]):
                if isinstance(text, str):  # Check if text is valid string
                    found_in_article = False
                    
                    if '/' in word:
                        # Handle words with alternatives separated by /
                        parts = word.split('/')
                        for part in parts:
                            if part.endswith('*'):
                                # Remove * and use as prefix
                                prefix = part[:-1]
                                count = text.lower().count(prefix.lower())
                                total_counts[col] += count
                                if count > 0:
                                    found_in_article = True
                            else:
                                # Count exact matches
                                count = text.lower().count(part.lower())
                                total_counts[col] += count
                                if count > 0:
                                    found_in_article = True
                    elif word.endswith('*'):
                        # Handle words with * wildcard
                        prefix = word[:-1]
                        count = text.lower().count(prefix.lower())
                        total_counts[col] += count
                        if count > 0:
                            found_in_article = True
                    else:
                        # Handle exact word matches
                        count = text.lower().count(word.lower())
                        total_counts[col] += count
                        if count > 0:
                            found_in_article = True
                            
                    if found_in_article:
                        article_counts[col] += 1
                        # Use the text content itself as the unique identifier
                        if text not in visited_articles[col]:
                            distinct_article_counts[col] += 1
                            visited_articles[col].add(text)
        
        # Store counts in results dataframe
        for col in response_columns:
            freq = total_counts[col]
            results_df.at[word, col + '_freq'] = freq
            # Calculate frequency per total words, multiply by 1000 and round
            freq_per_total = round((freq / word_totals[col] * 1000 if word_totals[col] > 0 else 0), 2)
            results_df.at[word, col + '_freq_per_1000_words'] = freq_per_total
            results_df.at[word, col + '_articles'] = article_counts[col]
            results_df.at[word, col + '_distinct_articles'] = distinct_article_counts[col]
            # Calculate article percentage (per hundred) using non-distinct article count
            article_pct = round((article_counts[col] / total_articles[col] * 100 if total_articles[col] > 0 else 0), 2)
            results_df.at[word, col + '_articles_percentage'] = article_pct
            
    # Reorder columns to group frequencies, then freq per total, then articles, then articles per hundred
    all_cols = []
    # Add all frequency columns first
    for col in response_columns:
        all_cols.append(col + '_freq')
    # Then add all frequency per total columns
    for col in response_columns:
        all_cols.append(col + '_freq_per_1000_words')
    # Add all non-distinct article count columns
    for col in response_columns:
        all_cols.append(col + '_articles')
    # Add all distinct article count columns
    for col in response_columns:
        all_cols.append(col + '_distinct_articles')
    # Finally add all articles per hundred columns
    for col in response_columns:
        all_cols.append(col + '_articles_percentage')
    results_df = results_df[all_cols]
    return results_df

In [2]:
df = pd.read_csv('../original_data_1000_queries.csv')

political_words = ['race', 'Aryan', 'abort*', 'rape', 'handicap*', 'disabled/disabilit*', 'mentally ill', 'mentally', 'gas chamber', 'Gyps*', 'Roma', 'sterilization', 'suicide', 'antisemit*/anti-semit*', 'racist/racial']

emotional_words =['tragic*', 'dramatic', 'horrific', 'atrocit*', 'torture', 'cruel*', 'powerful', 'major','testament', 'significant', 'example', 'reminder']

response_column_names = df.columns[1:]

In [3]:
# Call function and display results
political_words_df = count_word_frequencies(political_words, df, response_column_names)
print("\nPolitical Words article counts:")
political_words_df[[col + '_articles' for col in response_column_names]].head(14)


Political Words article counts:


Unnamed: 0,ushmm_article_articles,chatgpt_4o_response_articles,gemini_response_articles,grok_response_articles
race,289.0,91.0,131.0,68.0
Aryan,232.0,128.0,118.0,90.0
abort*,7.0,0.0,0.0,1.0
rape,34.0,7.0,11.0,4.0
handicap*,2.0,0.0,0.0,0.0
disabled/disabilit*,296.0,232.0,123.0,205.0
mentally ill,14.0,2.0,0.0,0.0
mentally,29.0,9.0,3.0,5.0
gas chamber,224.0,127.0,62.0,79.0
Gyps*,367.0,157.0,38.0,16.0


In [63]:
# Call function and display results
emotional_words_df = count_word_frequencies(emotional_words, df, response_column_names)
print("\nPolitical Words frequency counts:")
emotional_words_df.head(14)


Political Words frequency counts:


Unnamed: 0,ushmm_article_freq,chatgpt_4o_response_freq,gemini_response_freq,grok_response_freq,ushmm_article_freq_per_1000_words,chatgpt_4o_response_freq_per_1000_words,gemini_response_freq_per_1000_words,grok_response_freq_per_1000_words,ushmm_article_articles,chatgpt_4o_response_articles,gemini_response_articles,grok_response_articles,ushmm_article_distinct_articles,chatgpt_4o_response_distinct_articles,gemini_response_distinct_articles,grok_response_distinct_articles,ushmm_article_articles_percentage,chatgpt_4o_response_articles_percentage,gemini_response_articles_percentage,grok_response_articles_percentage
tragic*,21.0,63.0,39.0,25.0,0.73,3.37,3.71,3.44,19.0,58.0,34.0,25.0,7.0,58.0,34.0,25.0,1.9,5.8,3.4,2.5
dramatic,62.0,13.0,9.0,3.0,1.26,2.44,1.72,2.42,54.0,13.0,9.0,3.0,11.0,13.0,9.0,3.0,5.4,1.3,0.9,0.3
horrific,76.0,122.0,108.0,115.0,0.5,3.63,3.21,3.89,40.0,119.0,104.0,106.0,4.0,119.0,104.0,106.0,4.0,11.9,10.4,10.6
atrocit*,590.0,340.0,167.0,280.0,1.34,3.81,3.44,4.64,352.0,277.0,134.0,223.0,55.0,277.0,134.0,223.0,35.2,27.7,13.4,22.3
torture,77.0,30.0,58.0,15.0,0.4,3.2,3.03,4.44,59.0,29.0,52.0,14.0,15.0,29.0,52.0,14.0,5.9,2.9,5.2,1.4
cruel*,43.0,58.0,42.0,32.0,1.15,3.31,3.35,4.88,20.0,52.0,41.0,26.0,10.0,52.0,41.0,26.0,2.0,5.2,4.1,2.6
powerful,151.0,195.0,88.0,53.0,0.58,3.18,2.73,3.76,117.0,184.0,73.0,52.0,20.0,184.0,73.0,52.0,11.7,18.4,7.3,5.2
major,831.0,269.0,184.0,187.0,1.35,3.68,3.1,4.22,410.0,208.0,128.0,138.0,103.0,208.0,128.0,138.0,41.0,20.8,12.8,13.8
testament,73.0,13.0,21.0,14.0,1.3,2.68,2.54,3.27,38.0,13.0,21.0,14.0,3.0,13.0,21.0,14.0,3.8,1.3,2.1,1.4
significant,213.0,208.0,225.0,316.0,0.58,3.27,2.92,4.58,161.0,188.0,175.0,241.0,50.0,188.0,175.0,241.0,16.1,18.8,17.5,24.1


In [84]:
# Merge political and emotional words into one list and create classification labels
all_words = political_words + emotional_words
word_classifications = ['political'] * len(political_words) + ['emotional'] * len(emotional_words)

# Create classification Series
classification_df = pd.DataFrame({'word': all_words, 'classification': word_classifications})
classification_df = classification_df.set_index('word')

# Concatenate the political and emotional dataframes
combined_df = pd.concat([political_words_df, emotional_words_df])

# Add the classification column and reorder so classification is first
combined_df = combined_df.join(classification_df)
combined_df = combined_df[['classification'] + [col for col in combined_df.columns if col != 'classification']]

# Display results
print("\nCombined word frequencies with classifications:")
combined_df.to_csv('word_frequencies.csv')
combined_df


Combined word frequencies with classifications:


Unnamed: 0,classification,ushmm_article_freq,chatgpt_4o_response_freq,gemini_response_freq,grok_response_freq,ushmm_article_freq_per_1000_words,chatgpt_4o_response_freq_per_1000_words,gemini_response_freq_per_1000_words,grok_response_freq_per_1000_words,ushmm_article_articles,...,gemini_response_articles,grok_response_articles,ushmm_article_distinct_articles,chatgpt_4o_response_distinct_articles,gemini_response_distinct_articles,grok_response_distinct_articles,ushmm_article_articles_percentage,chatgpt_4o_response_articles_percentage,gemini_response_articles_percentage,grok_response_articles_percentage
race,political,1972.0,197.0,256.0,140.0,3.8,6.33,5.8,7.23,289.0,...,131.0,68.0,67.0,91.0,131.0,68.0,28.9,9.1,13.1,6.8
Aryan,political,1029.0,290.0,253.0,202.0,2.45,6.59,5.9,8.04,232.0,...,118.0,90.0,37.0,128.0,118.0,90.0,23.2,12.8,11.8,9.0
abort*,political,24.0,0.0,0.0,1.0,3.12,0.0,0.0,2.15,7.0,...,0.0,1.0,5.0,0.0,0.0,1.0,0.7,0.0,0.0,0.1
rape,political,41.0,7.0,11.0,4.0,0.72,2.71,2.57,2.58,34.0,...,11.0,4.0,9.0,7.0,11.0,4.0,3.4,0.7,1.1,0.4
handicap*,political,2.0,0.0,0.0,0.0,1.71,0.0,0.0,0.0,2.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0
disabled/disabilit*,political,514.0,269.0,133.0,221.0,5.4,5.14,8.9,5.11,296.0,...,123.0,205.0,37.0,232.0,123.0,203.0,29.6,23.2,12.3,20.5
mentally ill,political,14.0,2.0,0.0,0.0,1.5,2.01,0.0,0.0,14.0,...,0.0,0.0,3.0,2.0,0.0,0.0,1.4,0.2,0.0,0.0
mentally,political,34.0,9.0,3.0,7.0,1.47,2.38,3.44,4.82,29.0,...,3.0,5.0,9.0,9.0,3.0,5.0,2.9,0.9,0.3,0.5
gas chamber,political,695.0,183.0,83.0,113.0,1.72,4.63,4.84,5.48,224.0,...,62.0,79.0,50.0,127.0,62.0,79.0,22.4,12.7,6.2,7.9
Gyps*,political,567.0,170.0,47.0,17.0,0.97,3.74,2.9,5.62,367.0,...,38.0,16.0,54.0,157.0,38.0,16.0,36.7,15.7,3.8,1.6


In [74]:
# Find duplicate entries in each response column and store in a dataframe
duplicate_dfs = []
for col in response_column_names[1:]:  # Skip ushmm_articles
    duplicates = df[col].value_counts()[df[col].value_counts() > 1]
    if len(duplicates) > 0:
        # Create a dataframe for this column's duplicates
        dup_df = pd.DataFrame({
            'column': col,
            'text': duplicates.index,
            'count': duplicates.values
        })
        duplicate_dfs.append(dup_df)

# Combine all duplicate dataframes
all_duplicates_df = pd.concat(duplicate_dfs, ignore_index=True)

# Inspect the duplicates
print("\nDuplicate responses found:")
all_duplicates_df.head()

# Save duplicates to CSV
all_duplicates_df.to_csv('duplicate_responses.csv', index=False)




Duplicate responses found:
