# Cell 1: Setup & Mount Google Drive

In [1]:
# Mount Google Drive so the project folder is reachable under /content/drive.
# This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Reached only if drive.mount() returned without raising
print("Google Drive mounted successfully!")

Mounted at /content/drive
Google Drive mounted successfully!


# Cell 2: Define Paths

In [2]:
# Project directory layout (keep this block identical across notebooks)
import os

BASE_PATH = '/content/drive/MyDrive/same_words_different_worlds'

# Logical folder name -> full path underneath BASE_PATH
PATHS = {
    label: os.path.join(BASE_PATH, relative)
    for label, relative in (
        ('raw', 'data/raw'),
        ('processed', 'data/processed'),
        ('outputs', 'data/outputs'),
        ('models', 'models'),
        ('figures', 'figures'),
    )
}

# exist_ok=True makes this a no-op for folders that are already present
for name, path in PATHS.items():
    os.makedirs(path, exist_ok=True)
    print(f"✓ {name}: {path}")

# Report whether the raw CSV this notebook reads is where we expect it
raw_file = os.path.join(PATHS['raw'], 'congress_tweets_full_2018_2024.csv')
if not os.path.exists(raw_file):
    print(f"\n✗ ERROR: Raw data file not found at {raw_file}")
else:
    print("\n✓ Raw data file found!")

✓ raw: /content/drive/MyDrive/same_words_different_worlds/data/raw
✓ processed: /content/drive/MyDrive/same_words_different_worlds/data/processed
✓ outputs: /content/drive/MyDrive/same_words_different_worlds/data/outputs
✓ models: /content/drive/MyDrive/same_words_different_worlds/models
✓ figures: /content/drive/MyDrive/same_words_different_worlds/figures

✓ Raw data file found!


# Cell 3: Load Raw Data & Initial Inspection

In [3]:
import pandas as pd

# Read the full raw CSV; low_memory=False parses each column in one pass
# instead of chunk by chunk.
raw_file = os.path.join(PATHS['raw'], 'congress_tweets_full_2018_2024.csv')
df = pd.read_csv(raw_file, low_memory=False)

# Quick structural overview of what was loaded
print("=" * 60, "DATASET OVERVIEW", "=" * 60, sep="\n")
print(f"Shape: {len(df):,} rows × {len(df.columns)} columns")
print(f"\nColumn names:\n{df.columns.tolist()}")
print(f"\nData types:\n{df.dtypes}")
print("\nFirst 3 rows:")

# Last expression of the cell, so the notebook renders it as a table
df.head(3)

DATASET OVERVIEW
Shape: 1,893,564 rows × 7 columns

Column names:
['years', 'chamber', 'party', 'name', 'postedAt', 'text', 'tweetId']

Data types:
years       object
chamber     object
party       object
name        object
postedAt     int64
text        object
tweetId      int64
dtype: object

First 3 rows:


Unnamed: 0,years,chamber,party,name,postedAt,text,tweetId
0,2017_2018,House,R,Ken Buck R-CO,1542901786338,Happy #Thanksgiving! I hope everyone has a cha...,1065633444502224896
1,2021_2022,House,R,Kat Cammack R-FL,1641481912624,"RT @MorningsMaria: .@RepKatCammack: ""The Democ...",1479108462504136704
2,2021_2022,House,R,Chip Roy R-TX,1631195457715,RT @chiproytx: No. https://t.co/STZW0DdDz1,1435963943533568000


# Cell 4: Explore Key Columns

In [4]:
# Value distributions and missing-value counts for the key columns
print("=" * 60, "KEY COLUMN EXPLORATION", "=" * 60, sep="\n")

# Tweets per party
print("\n1. PARTY DISTRIBUTION:", df['party'].value_counts(), sep="\n")

# Tweets per label in the `years` column, in label order
print("\n2. YEARS COVERED:", df['years'].value_counts().sort_index(), sep="\n")

# Tweets per chamber
print("\n3. CHAMBER DISTRIBUTION:", df['chamber'].value_counts(), sep="\n")

# Number of missing entries in every column
print("\n4. MISSING VALUES:", df.isna().sum(), sep="\n")

# Two raw tweets (positions 0 and 100) as a sanity check of the text column
print("\n5. SAMPLE TWEETS:")
print(df['text'].iloc[0], "---", df['text'].iloc[100], sep="\n")

KEY COLUMN EXPLORATION

1. PARTY DISTRIBUTION:
party
D    1140842
R     744702
I       8020
Name: count, dtype: int64

2. YEARS COVERED:
years
2017_2018     23079
2019_2020    761494
2021_2022    641354
Name: count, dtype: int64

3. CHAMBER DISTRIBUTION:
chamber
House     1482006
Senate     411558
Name: count, dtype: int64

4. MISSING VALUES:
years       467637
chamber          0
party            0
name             0
postedAt         0
text          1034
tweetId          0
dtype: int64

5. SAMPLE TWEETS:
Happy #Thanksgiving! I hope everyone has a chance today to reflect on the ways they're thankful. I'm grateful today for a loving family, for a caring community, and for a country that believes in freedom.
---
My thoughts are with the sailors and civilians injured in the fire aboard the USS Bonhomme Richard in San Diego. I’m grateful for the sailors and firefighters who continue to battle this blaze. Please stay safe. @LHD6BHR @NavBaseSD


The years column has 467K missing values, but postedAt is complete. Let's use that to extract accurate dates.

# Cell 5: Convert Timestamp & Extract Year

In [5]:
# postedAt holds Unix epoch milliseconds; turn it into a datetime column.
# NOTE(review): the resulting timestamps are timezone-naive, so `year` is
# presumably the UTC calendar year — confirm that is acceptable for tweets
# posted close to New Year.
df['posted_date'] = pd.to_datetime(df['postedAt'], unit='ms')

# Calendar year taken from the converted timestamp
df['year'] = df['posted_date'].dt.year

# Summarise the conversion: overall range, per-year counts, party-by-year table
print("=" * 60, "TIMESTAMP CONVERSION RESULTS", "=" * 60, sep="\n")

print(f"\nDate range: {df['posted_date'].min()} to {df['posted_date'].max()}")

print("\nYear distribution (from postedAt):", df['year'].value_counts().sort_index(), sep="\n")

print("\nTotal tweets per party per year:", pd.crosstab(df['year'], df['party']), sep="\n")

TIMESTAMP CONVERSION RESULTS

Date range: 2018-11-18 20:18:34.266000 to 2024-11-21 01:03:48

Year distribution (from postedAt):
year
2018     23116
2019    353044
2020    412609
2021    415836
2022    261979
2023    170847
2024    256133
Name: count, dtype: int64

Total tweets per party per year:
party       D     I       R
year                       
2018    15002   253    7861
2019   228813  1896  122335
2020   273283  1724  137602
2021   248648  1586  165602
2022   154092  1069  106818
2023    89647   700   80500
2024   131357   792  123984


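We now have a clean year for every tweet, spanning 2018–2024, with good coverage across both parties. Note that the data runs from 2018-11-18 to 2024-11-21, so 2018 and 2024 are partial years and their counts are not directly comparable with the full years in between. Let's proceed to filter for AI-related tweets.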

# Cell 6: Define AI Keywords & Filter

In [6]:
import re

# Lowercase vocabulary that marks a tweet as AI-related
AI_KEYWORDS = [
    # General AI terminology
    "ai", "artificial intelligence", "machine learning", "deep learning",
    "neural network", "generative ai", "large language model", "llm",
    "natural language processing", "nlp",

    # Named models, products and vendors
    "chatgpt", "gpt-3", "gpt-4", "gpt4", "openai", "midjourney",
    "dall-e", "dall e", "bard", "gemini", "anthropic", "claude ai",
    "stable diffusion", "copilot",

    # Applications, risks and policy vocabulary
    "deepfake", "facial recognition", "biometric", "predictive policing",
    "autonomous vehicle", "self-driving", "robotics", "algorithm bias",
    "algorithmic bias", "algorithmic discrimination",
    "automated decision", "ai regulation", "ai safety", "ai act",
    "ai ethics", "responsible ai", "explainable ai"
]

# One alternative per keyword, each shaped as \b<escaped keyword>s?\b:
#   - \b on both sides requires a word boundary, so "ai" does not fire inside "said"
#   - the trailing s? also accepts a simple plural ("deepfakes", "llms")
pattern = '|'.join(rf'\b{re.escape(kw)}s?\b' for kw in AI_KEYWORDS)

print("Sample pattern fragments:", f"{pattern[:200]}...", sep="\n")

# Match against a lowercased copy of the text. astype(str) turns missing
# text into the literal string "nan", which none of the keywords match.
print("\nFiltering for AI-related tweets...")
df['text_lower'] = df['text'].astype(str).str.lower()
ai_mask = df['text_lower'].str.contains(pattern, case=False, regex=True, na=False)

# How much of the corpus survives the filter
n_total = len(df)
n_ai = ai_mask.sum()

print("\nResults:")
print(f"  Total tweets:     {n_total:,}")
print(f"  AI-related:       {n_ai:,}")
print(f"  Retention rate:   {n_ai/n_total*100:.4f}%")

Sample pattern fragments:
\bais?\b|\bartificial\ intelligences?\b|\bmachine\ learnings?\b|\bdeep\ learnings?\b|\bneural\ networks?\b|\bgenerative\ ais?\b|\blarge\ language\ models?\b|\bllms?\b|\bnatural\ language\ processings?...

Filtering for AI-related tweets...

Results:
  Total tweets:     1,893,564
  AI-related:       3,209
  Retention rate:   0.1695%


3,209 AI-related tweets—a healthy sample size for this analysis. Now let's apply the filter and remove Independents.

# Cell 7: Apply Filter & Remove Independents

In [7]:
# Keep only the rows flagged by the keyword mask (independent copy of df)
df_ai = df.loc[ai_mask].copy()

# Party split while Independents are still included
print("=" * 60, "AI TWEETS BY PARTY (Before Filtering)", "=" * 60, sep="\n")
print(df_ai['party'].value_counts())

# Restrict to the two major parties for a clean partisan comparison
is_major_party = df_ai['party'].isin(['D', 'R'])
df_ai = df_ai.loc[is_major_party].copy()

print("\nAfter removing Independents:")
print(df_ai['party'].value_counts())

# Discard rows without any text and report how many were removed
n_before = len(df_ai)
df_ai = df_ai.loc[df_ai['text'].notna()].copy()
n_after = len(df_ai)
print(f"\nDropped {n_before - n_after} rows with missing text")

# Size of the AI subset after all filters in this cell
print(f"\nFinal AI tweet count: {len(df_ai):,}")

AI TWEETS BY PARTY (Before Filtering)
party
D    2022
R    1180
I       7
Name: count, dtype: int64

After removing Independents:
party
D    2022
R    1180
Name: count, dtype: int64

Dropped 0 rows with missing text

Final AI tweet count: 3,202


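Good! 3,202 tweets remain. Democrats posted more AI-related tweets in absolute terms (2,022 vs 1,180), but they also posted more tweets overall (1,140,842 vs 744,702), so the raw counts overstate the difference in how often each party discusses AI. Both parties have enough data. Let's now clean the text for modeling.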

# Cell 8: Text Cleaning for RoBERTa

In [8]:
def clean_text_for_roberta(text):
    """
    Normalise a raw tweet for a case-sensitive model such as RoBERTa.

    Steps, applied in this order:
      1. Anything that is not a str (e.g. NaN) becomes "".
      2. Every run of non-whitespace characters starting with "http"
         is removed.
      3. A leading "RT @handle:" retweet marker is removed.
      4. Newlines and runs of whitespace collapse to single spaces, and
         leading/trailing whitespace is trimmed.

    Case and punctuation are left untouched.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    # URLs go first so a tweet that starts with a link does not block step 3
    without_urls = re.sub(r'http\S+', '', text)

    # Only a retweet marker at the very start of the text is stripped
    without_marker = re.sub(r'^RT @\w+:\s*', '', without_urls)

    # Flatten line breaks, then squeeze whitespace runs and trim the ends
    single_line = without_marker.replace('\n', ' ')
    return re.sub(r'\s+', ' ', single_line).strip()

# Run the cleaner over every tweet and keep the result next to the original
df_ai['clean_text'] = df_ai['text'].map(clean_text_for_roberta)

# Character count of each cleaned tweet (used by the length filter later on)
df_ai['text_length'] = df_ai['clean_text'].str.len()

print("=" * 60, "TEXT CLEANING RESULTS", "=" * 60, sep="\n")

# Before/after for three fixed row positions, truncated to 100 characters
for i in (0, 50, 100):
    original_snippet = df_ai['text'].iloc[i][:100]
    cleaned_snippet = df_ai['clean_text'].iloc[i][:100]
    print(f"\n--- Example {i+1} ---")
    print(f"ORIGINAL: {original_snippet}...")
    print(f"CLEANED:  {cleaned_snippet}...")

# Summary statistics expose empty or very short texts left after cleaning
print("\n\nText length distribution after cleaning:")
print(df_ai['text_length'].describe())

TEXT CLEANING RESULTS

--- Example 1 ---
ORIGINAL: Instead of unfunded executive orders and gauzy principles, American leadership in AI demands a compr...
CLEANED:  Instead of unfunded executive orders and gauzy principles, American leadership in AI demands a compr...

--- Example 51 ---
ORIGINAL: .@RoyBlunt and I are facing off against the IRS’s recent requirement that forced people to use facia...
CLEANED:  .@RoyBlunt and I are facing off against the IRS’s recent requirement that forced people to use facia...

--- Example 101 ---
ORIGINAL: On the potential risks of AI, 
@RepTedLieu
 tells 
@lmatsakis
:

"We're not even sure what the harms...
CLEANED:  On the potential risks of AI, @RepTedLieu tells @lmatsakis : "We're not even sure what the harms cou...


Text length distribution after cleaning:
count    3202.000000
mean      223.907245
std        65.391235
min         0.000000
25%       188.000000
50%       244.000000
75%       269.000000
max      1125.000000
Name: text_length, dtype: float64

Notice that min = 0: at least one text is empty after cleaning. Let's filter those out.

# Cell 9: Remove Empty & Too-Short Texts

In [9]:
# Cleaned texts shorter than this many characters are treated as meaningless
MIN_LENGTH = 10

# Keep rows at or above the threshold, remembering the size before and after
n_before = len(df_ai)
df_ai = df_ai.loc[df_ai['text_length'].ge(MIN_LENGTH)].copy()
n_after = len(df_ai)

print("=" * 60, "REMOVING SHORT TEXTS", "=" * 60, sep="\n")
print(f"Minimum length threshold: {MIN_LENGTH} characters")
print(f"Before: {n_before:,}")
print(f"After:  {n_after:,}")
print(f"Dropped: {n_before - n_after}")

# The new minimum confirms that nothing below the threshold survived
print(f"\nNew minimum text length: {df_ai['text_length'].min()}")

# Eyeball the five shortest survivors to judge whether the threshold is sensible
print("\nShortest remaining texts:")
shortest = df_ai.nsmallest(5, 'text_length')[['clean_text', 'text_length']]
for cleaned, length in zip(shortest['clean_text'], shortest['text_length']):
    print(f"  [{length} chars]: {cleaned}")

REMOVING SHORT TEXTS
Minimum length threshold: 10 characters
Before: 3,202
After:  3,201
Dropped: 1

New minimum text length: 19

Shortest remaining texts:
  [19 chars]: AI has come so far.
  [27 chars]: Livestream is open now! #AI
  [28 chars]: LIVE NOW: Future of AI Forum
  [31 chars]: Learn more about my AI Ads Act:
  [32 chars]: Today’s AI hearing on Oversight.


# Cell 10: Select Final Columns & Summarize

In [10]:
# Columns carried forward into the analysis notebooks
final_columns = [
    'party',           # D or R
    'text',            # raw tweet, kept for reference
    'clean_text',      # cleaned tweet, used for modeling
    'posted_date',     # full timestamp
    'year',            # year taken from posted_date
    'name',            # Congress member name
    'chamber'          # House or Senate
]

# Independent copy restricted to those columns, renumbered from 0
df_final = df_ai.loc[:, final_columns].copy().reset_index(drop=True)

# Last look at the dataset before it is written out
print("=" * 60, "FINAL DATASET SUMMARY", "=" * 60, sep="\n")
print(f"Shape: {df_final.shape}")
print("\nParty distribution:", df_final['party'].value_counts(), sep="\n")
print("\nYear distribution:", df_final['year'].value_counts().sort_index(), sep="\n")
print("\nChamber distribution:", df_final['chamber'].value_counts(), sep="\n")
print("\nSample row:", df_final.iloc[0], sep="\n")

FINAL DATASET SUMMARY
Shape: (3201, 7)

Party distribution:
party
D    2022
R    1179
Name: count, dtype: int64

Year distribution:
year
2018      23
2019     497
2020     303
2021     224
2022     188
2023     714
2024    1252
Name: count, dtype: int64

Chamber distribution:
chamber
House     2074
Senate    1127
Name: count, dtype: int64

Sample row:
party                                                          D
text           Instead of unfunded executive orders and gauzy...
clean_text     Instead of unfunded executive orders and gauzy...
posted_date                           2020-01-31 16:16:18.923000
year                                                        2020
name                                         Michael Bennet D-CO
chamber                                                   Senate
Name: 0, dtype: object


# Cell 11: Save Cleaned Dataset

In [11]:
# Write the cleaned AI-tweet dataset into the processed folder.
# index=False keeps the (already reset) row index out of the CSV.
output_path = os.path.join(PATHS['processed'], '01_ai_tweets_clean.csv')
df_final.to_csv(output_path, index=False)

print("="*60)
print("DATA SAVED SUCCESSFULLY")
print("="*60)
print(f"Output file: {output_path}")
print(f"File size: {os.path.getsize(output_path) / 1024:.1f} KB")

# Final checklist.
# The counts and the path are derived from the live objects rather than typed
# in by hand, so the summary stays correct when the raw data, the keyword
# list, or the length threshold changes. `df` still holds every raw row
# (earlier cells only added columns to it) and `df_final` is exactly what was
# just written to disk.
output_rel_path = os.path.relpath(output_path, BASE_PATH)

print("\n" + "="*60)
print("NOTEBOOK 01 COMPLETE ✓")
print("="*60)
print(f"""
Summary:
  - Started with {len(df):,} raw tweets
  - Filtered to {len(df_final):,} AI-related tweets (D & R only)
  - Cleaned text for RoBERTa processing
  - Saved to: {output_rel_path}

Next steps:
  → Notebook 02: Exploratory Data Analysis
  → Input: {output_rel_path}
""")

DATA SAVED SUCCESSFULLY
Output file: /content/drive/MyDrive/same_words_different_worlds/data/processed/01_ai_tweets_clean.csv
File size: 1637.9 KB

NOTEBOOK 01 COMPLETE ✓

Summary:
  - Started with 1,893,564 raw tweets
  - Filtered to 3,201 AI-related tweets (D & R only)
  - Cleaned text for RoBERTa processing
  - Saved to: data/processed/01_ai_tweets_clean.csv

Next steps:
  → Notebook 02: Exploratory Data Analysis
  → Input: data/processed/01_ai_tweets_clean.csv



#