In [1]:
from google.colab import drive
import pandas as pd

# Mount Google Drive at /content/drive so that files stored there can be
# opened by path (the dataset read later lives under '/content/drive/My Drive/...').
drive.mount('/content/drive')

Mounted at /content/drive


H1: Most communities primarily evolve through rule additions.

H2: Communities are more likely to undergo incremental (single-rule) changes than bundled (multiple-rule) changes.

**H3_0 (Null Hypothesis): Rule additions and deletions occur with equal frequency across communities experiencing incremental and bundled changes.**

**H3.1: Communities undergoing incremental (single-rule) changes show higher proportions of rule additions than deletions, while communities undergoing bundled (multiple-rule) changes show higher proportions of rule deletions than additions.**

**H3.2: Communities undergoing bundled changes show higher proportions of rule additions than deletions, while communities undergoing incremental changes show higher proportions of rule deletions than additions.**


# General Descriptive Statistics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime

# Read the sub_level_data CSV from the mounted Drive folder into a DataFrame.
file_path = '/content/drive/My Drive/Projects/Reddit_rules/2024_ChenEtAl_rulechange/sub_level_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Plot defaults. Order matters: plt.style.use('default') resets matplotlib's
# rcParams (including the colour cycle), so the seaborn palette is applied
# after it.
plt.style.use(style='default')
sns.set_palette(palette="husl")

# Report banner
banner_rule = "=" * 80
print(banner_rule, "REDDIT COMMUNITY STATISTICAL ANALYSIS", banner_rule, sep="\n")

# 1. BASIC DATASET INFORMATION
print("\n1. DATASET OVERVIEW", "-" * 40, sep="\n")

# Row count (one row is reported as one community) and overall shape.
print(f"Total number of communities: {len(df)}")
print(f"Dataset shape: {df.shape}")

# Schema: dtype of every column.
dtype_by_column = df.dtypes
print("\nColumn names and types:", dtype_by_column, sep="\n")

# Data quality: number of null cells in every column.
missing_by_column = df.isna().sum()
print("\nMissing values per column:", missing_by_column, sep="\n")

# 2. DESCRIPTIVE STATISTICS
print("\n2. DESCRIPTIVE STATISTICS", "-" * 40, sep="\n")

# Columns summarised below: the four rule-change counts, the `_1`/`_2`
# subscriber and rule counts, and community age in months.
numerical_cols = [
    'added', 'changed', 'deleted', 'unchanged',
    'subscribers_1', 'subscribers_2',
    'rules_1', 'rules_2',
    'age_in_months',
]

# count / mean / std / min / quartiles / max, rounded to two decimals.
numeric_summary = df.loc[:, numerical_cols].describe().round(2)
print("\nBasic Statistics for Numerical Variables:", numeric_summary, sep="\n")

# 3. RULE CHANGE ANALYSIS
print("\n3. RULE CHANGE PATTERNS", "-" * 40, sep="\n")

added_col, changed_col, deleted_col, kept_col = (
    df['added'], df['changed'], df['deleted'], df['unchanged']
)

# Rule edits of any kind per community, and that count relative to rules_1.
df['total_changes'] = added_col.add(changed_col).add(deleted_col)
df['change_rate'] = df['total_changes'].div(df['rules_1'].add(1))  # +1 to avoid division by zero

# Boolean masks: .sum() gives the number of communities, .mean() the share.
changed_mask = df['total_changes'] > 0
unchanged_mask = df['total_changes'] == 0
print(f"\nCommunities with any rule changes: {changed_mask.sum()} ({changed_mask.mean()*100:.1f}%)")
print(f"Communities with no changes: {unchanged_mask.sum()} ({unchanged_mask.mean()*100:.1f}%)")

# Mean and standard deviation of each per-community count.
print("\nRule Change Statistics:")
print(f"Average rules added: {added_col.mean():.2f} (±{added_col.std():.2f})")
print(f"Average rules changed: {changed_col.mean():.2f} (±{changed_col.std():.2f})")
print(f"Average rules deleted: {deleted_col.mean():.2f} (±{deleted_col.std():.2f})")
print(f"Average rules unchanged: {kept_col.mean():.2f} (±{kept_col.std():.2f})")

# Distribution of change types: how many communities have at least one rule
# of each kind. The kinds are not mutually exclusive.
change_distribution = pd.DataFrame(
    [[
        (added_col > 0).sum(),
        (changed_col > 0).sum(),
        (deleted_col > 0).sum(),
        unchanged_mask.sum(),
    ]],
    columns=['Added', 'Changed', 'Deleted', 'No Changes'],
)
print("\nCommunities by Change Type:")
print(change_distribution.T)

# 4. SUBSCRIBER GROWTH ANALYSIS
print("\n4. SUBSCRIBER GROWTH ANALYSIS")
print("-" * 40)

# Absolute change between the two subscriber counts.
df['subscriber_growth'] = df['subscribers_2'] - df['subscribers_1']
# Growth as a percentage of the first count; the +1 in the denominator keeps
# the ratio finite when subscribers_1 is 0.
df['growth_rate'] = (df['subscriber_growth'] / (df['subscribers_1'] + 1)) * 100

print(f"\nAverage subscriber growth: {df['subscriber_growth'].mean():.0f} (±{df['subscriber_growth'].std():.0f})")
print(f"Median subscriber growth: {df['subscriber_growth'].median():.0f}")
print(f"Average growth rate: {df['growth_rate'].mean():.2f}%")
print(f"Median growth rate: {df['growth_rate'].median():.2f}%")

# Categorize communities by size.
# right=False makes every bin closed on the left and open on the right:
# [0, 1k), [1k, 10k), [10k, 100k), [100k, inf). With pandas' default
# right-closed bins the first interval is (0, 1000], so a community with
# 0 subscribers matched no bin (NaN category) and silently dropped out of
# the distribution below, while a community with exactly 1000 subscribers
# was labelled 'Small (<1k)'.
df['size_category'] = pd.cut(df['subscribers_2'],
                             bins=[0, 1000, 10000, 100000, float('inf')],
                             labels=['Small (<1k)', 'Medium (1k-10k)',
                                    'Large (10k-100k)', 'Very Large (>100k)'],
                             right=False)

print("\nCommunity Distribution by Size:")
print(df['size_category'].value_counts().sort_index())

# 5. CORRELATION ANALYSIS
print("\n5. CORRELATION ANALYSIS", "-" * 40, sep="\n")

# Select variables for correlation
corr_vars = [
    'added', 'changed', 'deleted', 'unchanged', 'total_changes',
    'subscribers_1', 'subscribers_2', 'subscriber_growth',
    'growth_rate', 'rules_1', 'rules_2', 'age_in_months',
]

# Pairwise correlation matrix (pandas' default method)
corr_matrix = df.loc[:, corr_vars].corr()

# Walk the upper triangle so each unordered pair is visited once and the
# diagonal is skipped; keep the pairs whose |r| exceeds 0.3.
print("\nStrongest Correlations (|r| > 0.3):")
var_names = list(corr_matrix.columns)
strong_corr = [
    (var_names[row], var_names[col_idx], corr_matrix.iloc[row, col_idx])
    for row in range(len(var_names))
    for col_idx in range(row + 1, len(var_names))
    if abs(corr_matrix.iloc[row, col_idx]) > 0.3
]

# Strongest first; only the ten strongest pairs are printed.
strong_corr = sorted(strong_corr, key=lambda pair: abs(pair[2]), reverse=True)
for left, right, r_value in strong_corr[:10]:
    print(f"  {left} ↔ {right}: {r_value:.3f}")

# 6. COMMUNITY AGE ANALYSIS
print("\n6. COMMUNITY AGE ANALYSIS")
print("-" * 40)

print(f"Average community age: {df['age_in_months'].mean():.1f} months")
print(f"Median community age: {df['age_in_months'].median():.1f} months")
print(f"Oldest community: {df['age_in_months'].max():.0f} months ({df['age_in_months'].max()/12:.1f} years)")
print(f"Youngest community: {df['age_in_months'].min():.0f} months")

# Age categories.
# right=False makes every bin closed on the left and open on the right:
# [0, 12), [12, 36), [36, 60), [60, inf) months. With pandas' default
# right-closed bins the first interval is (0, 12], so a community aged
# exactly 0 months matched no bin (NaN category) and dropped out of the
# distribution below, while a community aged exactly 12 months was
# labelled 'New (<1yr)'.
df['age_category'] = pd.cut(df['age_in_months'],
                            bins=[0, 12, 36, 60, float('inf')],
                            labels=['New (<1yr)', 'Young (1-3yrs)',
                                   'Established (3-5yrs)', 'Mature (>5yrs)'],
                            right=False)

print("\nCommunity Distribution by Age:")
print(df['age_category'].value_counts().sort_index())

# 7. RULE COMPLEXITY ANALYSIS
print("\n7. RULE COMPLEXITY ANALYSIS", "-" * 40, sep="\n")

# Net change in rule count: rules_2 minus rules_1 (positive = more rules).
rules_before, rules_after = df['rules_1'], df['rules_2']
df['rule_change'] = rules_after.sub(rules_before)
rule_delta = df['rule_change']

print(f"Average initial rules: {rules_before.mean():.2f} (±{rules_before.std():.2f})")
print(f"Average final rules: {rules_after.mean():.2f} (±{rules_after.std():.2f})")
print(f"Average rule count change: {rule_delta.mean():.2f}")

# Split communities by the sign of the net change; .sum() of a mask is the
# count and .mean() is the share.
grew, shrank, flat = rule_delta > 0, rule_delta < 0, rule_delta == 0
print("\nRule Count Changes:")
print(f"Communities that increased rules: {grew.sum()} ({grew.mean()*100:.1f}%)")
print(f"Communities that decreased rules: {shrank.sum()} ({shrank.mean()*100:.1f}%)")
print(f"Communities with same rule count: {flat.sum()} ({flat.mean()*100:.1f}%)")

# 8. STATISTICAL TESTS
print("\n8. STATISTICAL SIGNIFICANCE TESTS", "-" * 40, sep="\n")

# Test if rule changes are related to community size: subscribers_1 values,
# split by whether the community recorded any rule change.
communities_with_changes = df.loc[df['total_changes'] > 0, 'subscribers_1']
communities_without_changes = df.loc[df['total_changes'] == 0, 'subscribers_1']

# The comparison only runs when both groups are non-empty.
if not (communities_with_changes.empty or communities_without_changes.empty):
    # Mann-Whitney U test (non-parametric)
    statistic, pvalue = stats.mannwhitneyu(
        communities_with_changes,
        communities_without_changes,
        alternative='two-sided',
    )
    size_verdict = (
        "Significant difference in size (p < 0.05)"
        if pvalue < 0.05
        else "No significant difference in size (p >= 0.05)"
    )
    print("\nMann-Whitney U test (size difference between communities with/without changes):")
    print(f"  U-statistic: {statistic:.2f}")
    print(f"  p-value: {pvalue:.4f}")
    print(f"  Result: {size_verdict}")

# Test correlation between subscriber growth and rule changes. Skipped when
# either variable has no spread (standard deviation not above zero).
growth_values, change_values = df['subscriber_growth'], df['total_changes']
if growth_values.std() > 0 and change_values.std() > 0:
    corr_coef, p_value = stats.spearmanr(growth_values, change_values)
    corr_verdict = (
        "Significant correlation (p < 0.05)"
        if p_value < 0.05
        else "No significant correlation (p >= 0.05)"
    )
    print("\nSpearman correlation (subscriber growth vs. total rule changes):")
    print(f"  Correlation coefficient: {corr_coef:.4f}")
    print(f"  p-value: {p_value:.4f}")
    print(f"  Result: {corr_verdict}")

# 9. OUTLIER DETECTION
# Section heading followed by a 40-character rule.
print("\n9. OUTLIER ANALYSIS", "-" * 40, sep="\n")

def detect_outliers(series, threshold=3):
    """Count the values in ``series`` whose absolute z-score exceeds ``threshold``.

    Missing values are dropped before the z-scores are computed, so they are
    never counted as outliers.

    Args:
        series: pandas Series of numeric values.
        threshold: absolute z-score above which a value counts as an outlier.

    Returns:
        The number of outlying values, as a plain Python ``int``.
    """
    values = series.dropna()
    # A z-score is undefined without data or without spread (zero standard
    # deviation); report "no outliers" directly instead of handing such
    # input to stats.zscore.
    if values.empty or values.nunique() <= 1:
        return 0
    z_scores = np.abs(stats.zscore(values))
    # Vectorised count: the builtin sum() would iterate the boolean array one
    # element at a time in Python and hand back a numpy scalar.
    return int((z_scores > threshold).sum())

# Per column: number of values flagged by detect_outliers (default
# threshold), also expressed as a share of all rows in df.
print("Outliers detected (z-score > 3):")
outlier_cols = ['added', 'changed', 'deleted', 'subscribers_2', 'subscriber_growth']
row_count = len(df)
for col in outlier_cols:
    n_outliers = detect_outliers(df[col])
    pct_of_rows = n_outliers/row_count*100
    print(f"  {col}: {n_outliers} outliers ({pct_of_rows:.2f}%)")

# 10. SUMMARY INSIGHTS
print("\n10. KEY INSIGHTS SUMMARY")
print("-" * 40)

# Calculate some key metrics
# Share of communities with at least one rule change, in percent.
active_communities = (df['total_changes'] > 0).mean() * 100
# Note: this is the median growth rate, despite the variable name.
avg_growth = df['growth_rate'].median()
# Change type with the largest total across all communities. All three types
# that make up total_changes are compared; comparing only 'added' against
# 'deleted' would misreport the result whenever 'changed' is the largest.
# On a tie, idxmax returns the first of added / changed / deleted.
change_totals = df[['added', 'changed', 'deleted']].sum()
most_common_change = change_totals.idxmax()

print(f"• {active_communities:.1f}% of communities made at least one rule change")
print(f"• Median subscriber growth rate: {avg_growth:.1f}%")
print(f"• Most common rule change type: Rules {most_common_change}")
# Direction is taken from the sign of the subscribers_1 / rules_1 entry of
# corr_matrix only; its magnitude and significance are not checked here.
print(f"• Larger communities tend to have {'more' if corr_matrix.loc['subscribers_1', 'rules_1'] > 0 else 'fewer'} rules")
print(f"• Average community age: {df['age_in_months'].mean()/12:.1f} years")

# Identify most active communities: the score is the number of rule changes
# plus the growth rate expressed as a fraction (growth_rate is in percent).
df['activity_score'] = df['total_changes'].add(df['growth_rate'].div(100))

# Five highest-scoring communities, restricted to the columns shown in the report.
report_columns = ['communityID', 'total_changes', 'growth_rate', 'subscribers_2']
top_active = df.nlargest(n=5, columns='activity_score').loc[:, report_columns]
print("\nMost Active Communities (by rule changes and growth):")
print(top_active.to_string(index=False))

# Closing banner
closing_rule = "=" * 80
print("\n" + closing_rule, "ANALYSIS COMPLETE", closing_rule, sep="\n")

REDDIT COMMUNITY STATISTICAL ANALYSIS

1. DATASET OVERVIEW
----------------------------------------
Total number of communities: 130851
Dataset shape: (130851, 13)

Column names and types:
communityID       object
added            float64
changed          float64
deleted          float64
unchanged        float64
subscribers_1      int64
subscribers_2      int64
rules_1            int64
rules_2            int64
timestamp_1      float64
timestamp_2      float64
founding_date    float64
age_in_months    float64
dtype: object

Missing values per column:
communityID      0
added            0
changed          0
deleted          0
unchanged        0
subscribers_1    0
subscribers_2    0
rules_1          0
rules_2          0
timestamp_1      0
timestamp_2      0
founding_date    0
age_in_months    0
dtype: int64

2. DESCRIPTIVE STATISTICS
----------------------------------------

Basic Statistics for Numerical Variables:
           added    changed    deleted  unchanged  subscribers_1  \
count