In [1]:
import json
import pandas as pd
from collections import defaultdict
import re
from nltk import word_tokenize
from tqdm import tqdm

## Load Data

In [2]:
# Load debates data
print("Loading debates.json...")
with open('../../../data/DebateDotOrg/01_rawdata/debates.json', 'r') as f:
    debates = json.load(f)
print(f"Loaded {len(debates)} debates")

# Load users data
print("\nLoading users.json...")
with open('../../../data/DebateDotOrg/01_rawdata/users.json', 'r') as f:
    users = json.load(f)
print(f"Loaded {len(users)} users")

Loading debates.json...
Loaded 78376 debates

Loading users.json...
Loaded 45348 users


## Helper Functions

In [3]:
def count_words(text):
    """Count words in a text string using a regex tokenizer (no NLTK data required)."""
    if not text or text.strip().lower() == 'forfeit':
        return 0
    # Use NLTK word tokenizer for more accurate word counting
    words = word_tokenize(text)
    return len(words)

# def has_vote_info(user_name, debate):
#     """Check if we have voting/agreement information for a user in a debate.
#     For debaters, this means checking if there are votes that reference them.
#     Returns True if we have vote information about this user."""
#     if not debate.get('votes'):
#         return False
    
#     # Check if any vote includes information about this user
#     for vote in debate['votes']:
#         votes_map = vote.get('votes_map', {})
#         if user_name in votes_map:
#             # Check if there's actual agreement information
#             user_votes = votes_map[user_name]
#             if ('Agreed with before the debate' in user_votes or 
#                 'Agreed with after the debate' in user_votes):
#                 return True
#     return False

# def commenter_has_stance(comment):
#     """Check if a comment has associated stance/vote information.
#     For commenters, we consider them to have stance if they voted on the debate."""
#     # We'll check this later when matching commenters with voters
#     return True  # We'll filter this in the main analysis

# def commenter_has_stance(comment, debate_voters):
#     """Return True if commenter appears among debate voters (simple check)."""
#     commenter = comment.get('user_name')
#     return bool(commenter and commenter in debate_voters)

## Analysis Part 1: Collect Debater and Commenter Information

In [4]:
# from tqdm import tqdm

# # Track debaters and their word counts
# # Structure: {user_name: {'debates': set(), 'word_counts': []}}
# debaters_info = defaultdict(lambda: {'debates': set(), 'word_counts': []})

# # Track commenters and their word counts
# # Structure: {user_name: {'debates': set(), 'word_counts': []}}
# commenters_info = defaultdict(lambda: {'debates': set(), 'word_counts': []})

# # Track voters per debate
# voters_per_debate = defaultdict(set)

# print("Processing debates...")
# for debate_id, debate in tqdm(debates.items(), desc="Processing debates"):
#     # Process debaters
#     participant_1 = debate.get('participant_1_name')
#     participant_2 = debate.get('participant_2_name')
    
#     # Collect voters for this debate
#     if debate.get('votes'):
#         for vote in debate['votes']:
#             voter_name = vote.get('user_name')
#             if voter_name:
#                 voters_per_debate[debate_id].add(voter_name)
    
#     # Process rounds to get debater texts
#     if debate.get('rounds'):
#         for participant in [participant_1, participant_2]:
#             if not participant:
#                 continue
            
#             # Check if we have vote information for this debater
#             if has_vote_info(participant, debate):
#                 debaters_info[participant]['debates'].add(debate_id)
                
#                 # Get all text from this debater across all rounds
#                 for round_list in debate['rounds']:
#                     for utterance in round_list:
#                         # Match participant by position
#                         p1_pos = debate.get('participant_1_position')
#                         p2_pos = debate.get('participant_2_position')
                        
#                         utterance_side = utterance.get('side')
#                         utterance_text = utterance.get('text', '')
                        
#                         # Check if this utterance belongs to current participant
#                         if ((participant == participant_1 and utterance_side == p1_pos) or
#                             (participant == participant_2 and utterance_side == p2_pos)):
#                             word_count = count_words(utterance_text)
#                             if word_count > 0:  # Don't count forfeits
#                                 debaters_info[participant]['word_counts'].append(word_count)
    
#     # Process comments
#     if debate.get('comments'):
#         for comment in debate['comments']:
#             commenter = comment.get('user_name')
#             comment_text = comment.get('comment_text', '')
            
#             if commenter and comment_text:
#                 # Only include commenters who also voted on this debate (have stance)
#                 if commenter in voters_per_debate[debate_id]:
#                     commenters_info[commenter]['debates'].add(debate_id)
#                     word_count = count_words(comment_text)
#                     if word_count > 0:
#                         commenters_info[commenter]['word_counts'].append(word_count)

# print(f"\nProcessed {len(debates)} debates")
# print(f"Found {len(debaters_info)} unique debaters with vote information")
# print(f"Found {len(commenters_info)} unique commenters with voting information")

In [5]:
# Track debaters and their word counts
# Structure: {user_name: {'debates': set(), 'word_counts': []}}
debaters_info = defaultdict(lambda: {'debates': set(), 'word_counts': []})

# Track commenters and their word counts
# Structure: {user_name: {'debates': set(), 'word_counts': []}}
commenters_info = defaultdict(lambda: {'debates': set(), 'word_counts': []})

# Track voters per debate
voters_per_debate = defaultdict(set)

print("Processing debates...")
for debate_id, debate in tqdm(debates.items(), desc="Processing debates"):
    # Process debaters
    participant_1 = debate.get('participant_1_name')
    participant_2 = debate.get('participant_2_name')
    
    # Collect voters for this debate
    if debate.get('votes'):
        for vote in debate['votes']:
            voter_name = vote.get('user_name')
            if voter_name:
                voters_per_debate[debate_id].add(voter_name)
    
    # Process rounds to get debater texts
    if debate.get('rounds'):
        for participant in [participant_1, participant_2]:
            if not participant:
                continue
            
            # All debaters have stances (Pro/Con) - no vote check needed
            debaters_info[participant]['debates'].add(debate_id)
            
            # Get all text from this debater across all rounds
            for round_list in debate['rounds']:
                for utterance in round_list:
                    # Match participant by position
                    p1_pos = debate.get('participant_1_position')
                    p2_pos = debate.get('participant_2_position')
                    
                    utterance_side = utterance.get('side')
                    utterance_text = utterance.get('text', '')
                    
                    # Check if this utterance belongs to current participant
                    if ((participant == participant_1 and utterance_side == p1_pos) or
                        (participant == participant_2 and utterance_side == p2_pos)):
                        word_count = count_words(utterance_text)
                        if word_count > 0:  # Don't count forfeits
                            debaters_info[participant]['word_counts'].append(word_count)
    
    # Process comments
    if debate.get('comments'):
        for comment in debate['comments']:
            commenter = comment.get('user_name')
            comment_text = comment.get('comment_text', '')
            
            if commenter and comment_text:
                # Only include commenters who also voted on this debate (have stance)
                if commenter in voters_per_debate[debate_id]:
                    commenters_info[commenter]['debates'].add(debate_id)
                    word_count = count_words(comment_text)
                    if word_count > 0:
                        commenters_info[commenter]['word_counts'].append(word_count)

print(f"\nProcessed {len(debates)} debates")
print(f"Found {len(debaters_info)} unique debaters")
print(f"Found {len(commenters_info)} unique commenters with voting information")

Processing debates...


Processing debates: 100%|██████████| 78376/78376 [04:39<00:00, 280.18it/s]


Processed 78376 debates
Found 42906 unique debaters
Found 4443 unique commenters with voting information





## Analysis A: Users who commented on >1 debate OR participated in >1 debate

In [6]:
# Find users with >1 debate as debater
debaters_multiple = {user for user, info in debaters_info.items() if len(info['debates']) > 1}

# Find users with >1 debate as commenter
commenters_multiple = {user for user, info in commenters_info.items() if len(info['debates']) > 1}

# Union: users who did either (or both)
users_active_multiple = debaters_multiple | commenters_multiple

print("="*60)
print("A. Users who commented on >1 debate OR participated in >1 debate")
print("="*60)
print(f"Number of unique users: {len(users_active_multiple)}")
print(f"\nBreakdown:")
print(f"  - Debaters in >1 debate: {len(debaters_multiple)}")
print(f"  - Commenters in >1 debate: {len(commenters_multiple)}")
print(f"  - Overlap (both): {len(debaters_multiple & commenters_multiple)}")

A. Users who commented on >1 debate OR participated in >1 debate
Number of unique users: 20314

Breakdown:
  - Debaters in >1 debate: 19741
  - Commenters in >1 debate: 2807
  - Overlap (both): 2234


## Analysis B: Users who were debaters in >1 debate

In [7]:
print("="*60)
print("B. Users who were debaters in >1 debate")
print("="*60)
print(f"Number of unique users: {len(debaters_multiple)}")

# Show distribution
debate_counts = [len(info['debates']) for info in debaters_info.values() if len(info['debates']) > 1]
if debate_counts:
    print(f"\nStatistics:")
    print(f"  - Mean debates per user: {sum(debate_counts)/len(debate_counts):.2f}")
    print(f"  - Max debates by a single user: {max(debate_counts)}")
    print(f"  - Median: {sorted(debate_counts)[len(debate_counts)//2]}")

B. Users who were debaters in >1 debate
Number of unique users: 19741

Statistics:
  - Mean debates per user: 6.77
  - Max debates by a single user: 1018
  - Median: 3


## Analysis C: Users who were commenters in >1 debate

In [8]:
print("="*60)
print("C. Users who were commenters in >1 debate")
print("="*60)
print(f"Number of unique users: {len(commenters_multiple)}")

# Show distribution
comment_counts = [len(info['debates']) for info in commenters_info.values() if len(info['debates']) > 1]
if comment_counts:
    print(f"\nStatistics:")
    print(f"  - Mean debates commented on per user: {sum(comment_counts)/len(comment_counts):.2f}")
    print(f"  - Max debates commented on by a single user: {max(comment_counts)}")
    print(f"  - Median: {sorted(comment_counts)[len(comment_counts)//2]}")

C. Users who were commenters in >1 debate
Number of unique users: 2807

Statistics:
  - Mean debates commented on per user: 14.47
  - Max debates commented on by a single user: 1086
  - Median: 4


## Analysis D: Average word count for each debater

In [9]:
# Calculate average word count per debater
debater_avg_words = {}
for user, info in debaters_info.items():
    if info['word_counts']:  # Only if they have text
        debater_avg_words[user] = sum(info['word_counts']) / len(info['word_counts'])

print("="*60)
print("D. Average word count for each debater")
print("="*60)
print(f"Number of debaters with text: {len(debater_avg_words)}")

if debater_avg_words:
    overall_avg = sum(debater_avg_words.values()) / len(debater_avg_words)
    print(f"\nOverall average word count per debater utterance: {overall_avg:.2f}")
    print(f"Min average: {min(debater_avg_words.values()):.2f}")
    print(f"Max average: {max(debater_avg_words.values()):.2f}")
    
    # Show top 10 most verbose debaters
    print("\nTop 10 most verbose debaters (by average words per utterance):")
    top_10 = sorted(debater_avg_words.items(), key=lambda x: x[1], reverse=True)[:10]
    for i, (user, avg_words) in enumerate(top_10, 1):
        num_debates = len(debaters_info[user]['debates'])
        num_utterances = len(debaters_info[user]['word_counts'])
        print(f"  {i}. {user}: {avg_words:.2f} words/utterance ({num_debates} debates, {num_utterances} utterances)")

D. Average word count for each debater
Number of debaters with text: 42876

Overall average word count per debater utterance: 223.37
Min average: 1.00
Max average: 23646.40

Top 10 most verbose debaters (by average words per utterance):
  1. numba1_person: 23646.40 words/utterance (1 debates, 5 utterances)
  2. PimpDaddy-1: 9548.40 words/utterance (1 debates, 5 utterances)
  3. thiskidthou: 9319.00 words/utterance (1 debates, 5 utterances)
  4. PreciousC: 8845.00 words/utterance (1 debates, 4 utterances)
  5. hc: 7700.50 words/utterance (1 debates, 2 utterances)
  6. Munchin_Mitch: 6600.60 words/utterance (1 debates, 5 utterances)
  7. Im_Chance: 5748.00 words/utterance (1 debates, 1 utterances)
  8. noelleybelley: 5715.00 words/utterance (1 debates, 1 utterances)
  9. theman27: 5516.60 words/utterance (1 debates, 5 utterances)
  10. usmanwarlord21: 4190.67 words/utterance (1 debates, 3 utterances)


## Analysis E: Average word count for commenters

In [10]:
# Calculate average word count per commenter
commenter_avg_words = {}
for user, info in commenters_info.items():
    if info['word_counts']:  # Only if they have text
        commenter_avg_words[user] = sum(info['word_counts']) / len(info['word_counts'])

print("="*60)
print("E. Average word count for commenters")
print("="*60)
print(f"Number of commenters with text and vote info: {len(commenter_avg_words)}")

if commenter_avg_words:
    commenter_overall_avg = sum(commenter_avg_words.values()) / len(commenter_avg_words)
    print(f"\nOverall average word count per comment: {commenter_overall_avg:.2f}")
    print(f"Min average: {min(commenter_avg_words.values()):.2f}")
    print(f"Max average: {max(commenter_avg_words.values()):.2f}")
    
    # Show top 10 most verbose commenters
    print("\nTop 10 most verbose commenters (by average words per comment):")
    top_10 = sorted(commenter_avg_words.items(), key=lambda x: x[1], reverse=True)[:10]
    for i, (user, avg_words) in enumerate(top_10, 1):
        num_debates = len(commenters_info[user]['debates'])
        num_comments = len(commenters_info[user]['word_counts'])
        print(f"  {i}. {user}: {avg_words:.2f} words/comment ({num_debates} debates, {num_comments} comments)")

E. Average word count for commenters
Number of commenters with text and vote info: 4443

Overall average word count per comment: 73.20
Min average: 1.00
Max average: 447.00

Top 10 most verbose commenters (by average words per comment):
  1. Supernova: 447.00 words/comment (1 debates, 1 comments)
  2. Knologistprime: 420.50 words/comment (1 debates, 2 comments)
  3. Ambassador95: 420.00 words/comment (1 debates, 1 comments)
  4. LBJMVP6: 419.00 words/comment (1 debates, 1 comments)
  5. Inductivelogic: 410.00 words/comment (1 debates, 1 comments)
  6. Sivispacem08: 409.00 words/comment (1 debates, 1 comments)
  7. Yariiii: 406.00 words/comment (1 debates, 2 comments)
  8. Davisc09: 404.00 words/comment (1 debates, 1 comments)
  9. Everett_Tsosie: 402.00 words/comment (1 debates, 1 comments)
  10. Thaumaturgy: 400.00 words/comment (1 debates, 1 comments)


## Summary: All Results

In [11]:
print("="*60)
print("SUMMARY OF ALL ANALYSES")
print("="*60)
print(f"\nA. Users active in >1 debate (commented OR debated): {len(users_active_multiple)}")
print(f"\nB. Users who debated in >1 debate: {len(debaters_multiple)}")
print(f"\nC. Users who commented in >1 debate: {len(commenters_multiple)}")

# Calculate averages for summary
debater_summary_avg = sum(debater_avg_words.values()) / len(debater_avg_words) if debater_avg_words else 0
commenter_summary_avg = sum(commenter_avg_words.values()) / len(commenter_avg_words) if commenter_avg_words else 0

print(f"\nD. Average word count for debaters: {debater_summary_avg:.2f}" if debater_avg_words else "\nD. No debater data available")
print(f"\nE. Average word count for commenters: {commenter_summary_avg:.2f}" if commenter_avg_words else "\nE. No commenter data available")
print("\n" + "="*60)
print("Note: Debaters have stances from their Pro/Con positions. Commenters are filtered to those who also voted (have stance information).")
print("="*60)

SUMMARY OF ALL ANALYSES

A. Users active in >1 debate (commented OR debated): 20314

B. Users who debated in >1 debate: 19741

C. Users who commented in >1 debate: 2807

D. Average word count for debaters: 223.37

E. Average word count for commenters: 73.20

Note: Debaters have stances from their Pro/Con positions. Commenters are filtered to those who also voted (have stance information).


In [12]:
# Create a comprehensive user text and stance dataframe
user_records = []

for user_id in tqdm(users_active_multiple, desc="Building user records"):
    text_by_debate = {}
    stances = {}
    all_text_parts = []
    debate_text = {}
    comment_text = {}
    
    # Check if user qualifies (debated in >1 debate OR commented on >1 debate)
    num_debates_as_debater = len(debaters_info.get(user_id, {}).get('debates', []))
    num_debates_as_commenter = len(commenters_info.get(user_id, {}).get('debates', []))
    
    # Skip if user doesn't meet criteria
    if num_debates_as_debater <= 1 and num_debates_as_commenter <= 1:
        continue
    
    # Determine if user is commenter only
    commenter_only = num_debates_as_debater == 0
    
    # Process debates where user was a debater
    if user_id in debaters_info:
        for debate_id in debaters_info[user_id]['debates']:
            debate = debates[debate_id]
            participant_1 = debate.get('participant_1_name')
            participant_2 = debate.get('participant_2_name')
            p1_pos = debate.get('participant_1_position')
            p2_pos = debate.get('participant_2_position')
            
            debate_text_parts = []
            
            # Collect debater's text from rounds
            if debate.get('rounds'):
                for round_list in debate['rounds']:
                    for utterance in round_list:
                        utterance_text = utterance.get('text', '')
                        utterance_side = utterance.get('side')
                        
                        # Check if this utterance belongs to current user
                        if ((user_id == participant_1 and utterance_side == p1_pos) or
                            (user_id == participant_2 and utterance_side == p2_pos)):
                            if utterance_text.strip().lower() != 'forfeit':
                                debate_text_parts.append(utterance_text)
            
            if debate_text_parts:
                debate_text[debate_id] = ' '.join(debate_text_parts)
                text_by_debate[debate_id] = debate_text[debate_id]
                all_text_parts.extend(debate_text_parts)
                
                # Set stance based on debater position
                if user_id == participant_1:
                    stances[debate_id] = p1_pos
                else:
                    stances[debate_id] = p2_pos
    
    # Process comments and votes where user was a commenter
    if user_id in commenters_info:
        for debate_id in commenters_info[user_id]['debates']:
            debate = debates[debate_id]
            comment_text_parts = []
            user_stance = None
            
            # Get user's stance from votes
            if debate.get('votes'):
                for vote in debate['votes']:
                    if vote.get('user_name') == user_id:
                        votes_map = vote.get('votes_map', {})
                        # Determine stance from votes_map
                        # Check if voter agreed with "Tied" after the debate
                        if votes_map.get('Tied', {}).get('Agreed with after the debate'):
                            user_stance = 'Tie'
                        else:
                            # Check which debater they agreed with
                            for debater, stance_info in votes_map.items():
                                if debater != 'Tied' and stance_info.get('Agreed with after the debate'):
                                    # Map debater to their position
                                    p1_name = debate.get('participant_1_name')
                                    p2_name = debate.get('participant_2_name')
                                    p1_pos = debate.get('participant_1_position')
                                    p2_pos = debate.get('participant_2_position')
                                    
                                    if debater == p1_name:
                                        user_stance = p1_pos
                                        break
                                    elif debater == p2_name:
                                        user_stance = p2_pos
                                        break
                        break
            
            # Collect comment text
            if debate.get('comments'):
                for comment in debate['comments']:
                    if comment.get('user_name') == user_id:
                        comment_text_value = comment.get('comment_text', '')
                        if comment_text_value.strip():
                            comment_text_parts.append(comment_text_value)
            
            if comment_text_parts:
                comment_full_text = ' '.join(comment_text_parts)
                comment_text[debate_id] = comment_full_text
                
                # If user also debated in this debate, append comments to debate text
                if debate_id in text_by_debate:
                    text_by_debate[debate_id] = text_by_debate[debate_id] + ' ' + comment_full_text
                else:
                    text_by_debate[debate_id] = comment_full_text
                
                all_text_parts.extend(comment_text_parts)
                
                # Only set stance if not already set as debater (debater stance takes precedence)
                if debate_id not in stances and user_stance:
                    stances[debate_id] = user_stance
    
    # Create record if user has data
    if text_by_debate and stances:
        user_records.append({
            'user_id': user_id,
            'text_by_debate': text_by_debate,
            'stances': stances,
            'all_text': ' '.join(all_text_parts),
            'commenter_only': commenter_only,
            'debate_text': debate_text,
            'comment_text': comment_text
        })

# Create dataframe
user_text_df = pd.DataFrame(user_records)

print(f"Created dataframe with {len(user_text_df)} users")
print(f"Columns: {list(user_text_df.columns)}")
print(f"\nCommenter only users: {user_text_df['commenter_only'].sum()}")
print("\nSample record:")
print(user_text_df.iloc[0])

# Save to CSV
user_text_df.to_csv('user_text_and_stances.csv', index=False)
print("\nSaved to user_text_and_stances.csv")
user_text_df.head()

Building user records: 100%|██████████| 20314/20314 [00:01<00:00, 12564.42it/s]


Created dataframe with 20314 users
Columns: ['user_id', 'text_by_debate', 'stances', 'all_text', 'commenter_only', 'debate_text', 'comment_text']

Commenter only users: 311

Sample record:
user_id                                             LachlanSmithson
text_by_debate    {'Extracurricular-activities-being-bad-for-you...
stances           {'Extracurricular-activities-being-bad-for-you...
all_text          \n  \r\nI accept.  \n  \n  \n   BOP   \r\nPro ...
commenter_only                                                False
debate_text       {'Extracurricular-activities-being-bad-for-you...
comment_text                                                     {}
Name: 0, dtype: object

Saved to user_text_and_stances.csv


Unnamed: 0,user_id,text_by_debate,stances,all_text,commenter_only,debate_text,comment_text
0,LachlanSmithson,{'Extracurricular-activities-being-bad-for-you...,{'Extracurricular-activities-being-bad-for-you...,\n \r\nI accept. \n \n \n BOP \r\nPro ...,False,{'Extracurricular-activities-being-bad-for-you...,{}
1,dorky,{'Rey-Mysterio-is-the-best-WWE-wrestler/1/': '...,{'Rey-Mysterio-is-the-best-WWE-wrestler/1/': '...,\n \r\nRey Mysterio is the best WWE wrestler ...,False,{'Rey-Mysterio-is-the-best-WWE-wrestler/1/': '...,{}
2,abbitha,{'Gun-Control-in-America-would-help-problems-w...,{'Gun-Control-in-America-would-help-problems-w...,\n \r\nIt is an obvious fact that the United ...,False,{'Gun-Control-in-America-would-help-problems-w...,{}
3,awoutas,{'Should-the-electoral-college-be-here/1/': ' ...,{'Should-the-electoral-college-be-here/1/': 'P...,\n \r\nMany say the electoral college should ...,False,{'Should-the-electoral-college-be-here/1/': ' ...,{}
4,NonIdentifiable,{'Tomato-is-a-fruit/1/': '  Rules 1) No ...,"{'Tomato-is-a-fruit/1/': 'Pro', 'Border-fence-...",\n \r\nRules \r\n1) No forfeiting \r\n2) Ke...,False,{'Tomato-is-a-fruit/1/': '  Rules 1) No ...,{}


In [13]:
user_text_df

Unnamed: 0,user_id,text_by_debate,stances,all_text,commenter_only,debate_text,comment_text
0,LachlanSmithson,{'Extracurricular-activities-being-bad-for-you...,{'Extracurricular-activities-being-bad-for-you...,\n \r\nI accept. \n \n \n BOP \r\nPro ...,False,{'Extracurricular-activities-being-bad-for-you...,{}
1,dorky,{'Rey-Mysterio-is-the-best-WWE-wrestler/1/': '...,{'Rey-Mysterio-is-the-best-WWE-wrestler/1/': '...,\n \r\nRey Mysterio is the best WWE wrestler ...,False,{'Rey-Mysterio-is-the-best-WWE-wrestler/1/': '...,{}
2,abbitha,{'Gun-Control-in-America-would-help-problems-w...,{'Gun-Control-in-America-would-help-problems-w...,\n \r\nIt is an obvious fact that the United ...,False,{'Gun-Control-in-America-would-help-problems-w...,{}
3,awoutas,{'Should-the-electoral-college-be-here/1/': ' ...,{'Should-the-electoral-college-be-here/1/': 'P...,\n \r\nMany say the electoral college should ...,False,{'Should-the-electoral-college-be-here/1/': ' ...,{}
4,NonIdentifiable,{'Tomato-is-a-fruit/1/': '  Rules 1) No ...,"{'Tomato-is-a-fruit/1/': 'Pro', 'Border-fence-...",\n \r\nRules \r\n1) No forfeiting \r\n2) Ke...,False,{'Tomato-is-a-fruit/1/': '  Rules 1) No ...,{}
...,...,...,...,...,...,...,...
20309,benko12345678,{'On-balance-the-Ribbentrop-Molotov-pact-was-a...,{'On-balance-the-Ribbentrop-Molotov-pact-was-a...,"\n \n Introduction In this debate, I will s...",False,{'On-balance-the-Ribbentrop-Molotov-pact-was-a...,{}
20310,Deav0n,{'Flaw-in-the-main-argument-of-gun-ownership-i...,{'Flaw-in-the-main-argument-of-gun-ownership-i...,\n \r\nIf allowing gun ownership to protect o...,False,{'Flaw-in-the-main-argument-of-gun-ownership-i...,{}
20311,Lazy_Lipids,{'Poetry-Debate-No.-2/1/': '  Sometimes we ...,"{'Poetry-Debate-No.-2/1/': 'Con', 'Chuck-Norri...",\n \r\nSometimes we see something beautiful a...,False,{'Poetry-Debate-No.-2/1/': '  Sometimes we ...,{}
20312,Dirty-Morgs,{'Is-paper-a-figment-of-our-imagination/1/': '...,{'Is-paper-a-figment-of-our-imagination/1/': '...,\n \n https://en.wikipedia.org... \n \r\nFi...,False,{'Is-paper-a-figment-of-our-imagination/1/': '...,{}


In [14]:
import ast

# let's make sure loading works
loaded_df = pd.read_csv('user_text_and_stances.csv')

# Convert string representations back to dictionaries
for col in ['text_by_debate', 'stances', 'debate_text', 'comment_text']:
    loaded_df[col] = loaded_df[col].apply(ast.literal_eval)

assert len(loaded_df) == len(user_text_df)
assert all(loaded_df['user_id'] == user_text_df['user_id'])
assert all(loaded_df['all_text'] == user_text_df['all_text'])
assert all(loaded_df['text_by_debate'] == user_text_df['text_by_debate'])
assert all(loaded_df['stances'] == user_text_df['stances'])
assert all(loaded_df['debate_text'] == user_text_df['debate_text'])
assert all(loaded_df['comment_text'] == user_text_df['comment_text'])
print("✓ All assertions passed!")

✓ All assertions passed!


## Optional: Export Results to CSV

In [15]:
# Create DataFrames for export
debaters_df = pd.DataFrame([
    {
        'username': user,
        'num_debates': len(info['debates']),
        'num_utterances': len(info['word_counts']),
        'avg_words_per_utterance': debater_avg_words.get(user, 0),
        'total_words': sum(info['word_counts'])
    }
    for user, info in debaters_info.items()
    if len(info['debates']) > 1
]).sort_values('num_debates', ascending=False)

commenters_df = pd.DataFrame([
    {
        'username': user,
        'num_debates': len(info['debates']),
        'num_comments': len(info['word_counts']),
        'avg_words_per_comment': commenter_avg_words.get(user, 0),
        'total_words': sum(info['word_counts'])
    }
    for user, info in commenters_info.items()
    if len(info['debates']) > 1
]).sort_values('num_debates', ascending=False)

# Save to CSV
debaters_df.to_csv('debaters_analysis.csv', index=False)
commenters_df.to_csv('commenters_analysis.csv', index=False)

print("Results exported to:")
print("  - debaters_analysis.csv")
print("  - commenters_analysis.csv")

# Display first few rows
print("\nDebaters sample:")
print(debaters_df.head(10))

print("\nCommenters sample:")
print(commenters_df.head(10))

Results exported to:
  - debaters_analysis.csv
  - commenters_analysis.csv

Debaters sample:
            username  num_debates  num_utterances  avg_words_per_utterance  \
21           vi_spex         1018            3633                39.056702   
657    dairygirl4u2c          674            1842               280.932682   
208         imabench          600            2026               415.741362   
286         lannan13          546            2007               340.367713   
219         Danielle          525            1674               712.952210   
523  brian_eggleston          420             944               441.515890   
429         16kadams          356            1343               581.547282   
119       9spaceking          325            1165               244.154506   
122        Stupidape          315            1071               293.440710   
121           Wylted          313            1010               248.631683   

     total_words  
21        141893  
657       

In [16]:
# Check if usernames are unique in the users.json file
# According to the readme, users.json is keyed by username, not numeric ID
# We want to verify that each username appears exactly once as a key

print("Checking username uniqueness in users.json:")
print(f"Total user entries: {len(users)}")

# Extract all usernames from the user keys
usernames_as_keys = list(users.keys())

# Check for any duplicate keys (should be 0 since dict keys are unique by definition)
print(f"Unique usernames (dict keys): {len(set(usernames_as_keys))}")

# The dict itself enforces uniqueness, so this should always be equal
if len(usernames_as_keys) == len(set(usernames_as_keys)):
    print("✓ All usernames in users.json are unique (1:1 mapping guaranteed by dict structure)")
else:
    print("✗ ERROR: Duplicate keys found (this shouldn't be possible in valid JSON)")

# Now let's check if there are any usernames in debates that don't exist in users
print("\n" + "="*60)
print("Checking if all debate participants/voters/commenters exist in users.json:")

# Collect all usernames referenced in debates
debate_usernames = set()

for debate_id, debate in list(debates.items())[:100]:  # Sample first 100 for quick check
    # Add participants
    if debate.get('participant_1_name'):
        debate_usernames.add(debate.get('participant_1_name'))
    if debate.get('participant_2_name'):
        debate_usernames.add(debate.get('participant_2_name'))
    
    # Add voters
    if debate.get('votes'):
        for vote in debate['votes']:
            if vote.get('user_name'):
                debate_usernames.add(vote.get('user_name'))
    
    # Add commenters
    if debate.get('comments'):
        for comment in debate['comments']:
            if comment.get('user_name'):
                debate_usernames.add(comment.get('user_name'))

users_set = set(users.keys())
missing_from_users = debate_usernames - users_set
extra_in_users = users_set - debate_usernames

print(f"Unique usernames in debates (sample of 100): {len(debate_usernames)}")
print(f"Usernames in debates NOT in users.json: {len(missing_from_users)}")
if missing_from_users and len(missing_from_users) <= 10:
    print(f"  Examples: {list(missing_from_users)[:10]}")
print(f"Usernames in users.json NOT in debates (sample): {len(extra_in_users)}")
print("  (This is expected - users may exist without participating in sampled debates)")

Checking username uniqueness in users.json:
Total user entries: 45348
Unique usernames (dict keys): 45348
✓ All usernames in users.json are unique (1:1 mapping guaranteed by dict structure)

Checking if all debate participants/voters/commenters exist in users.json:
Unique usernames in debates (sample of 100): 727
Usernames in debates NOT in users.json: 144
Usernames in users.json NOT in debates (sample): 44765
  (This is expected - users may exist without participating in sampled debates)
