# Summary Statistics and Data Exploration
This notebook explores the provided CSV datasets and Reddit comment data, computing summary statistics and performing statistical tests as described in the assignment.

In [1]:
# Import Required Libraries
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import levene, mannwhitneyu, normaltest, ttest_ind


In [2]:
# Load and Explore CSV Data Sets
# Read every file matching data-*.csv (sorted by name) into a dict keyed by
# file name, showing the first rows and the column summary of each one.
data_files = sorted(glob.glob('data-*.csv'))
datasets = {}
for fname in data_files:
    df = pd.read_csv(fname)
    datasets[fname] = df
    print(f"\nFile: {fname}")
    display(df.head())
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    df.info()


File: data-1.csv


Unnamed: 0,x,y
0,29.168628,26.350615
1,12.801887,11.705425
2,19.744856,15.331458
3,34.890718,27.384153
4,30.410696,20.859101


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       100 non-null    float64
 1   y       100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB
None

File: data-2.csv


Unnamed: 0,x,y
0,-4.430182,-4.828145
1,-2.87255,1.879986
2,13.195915,6.361629
3,7.863165,2.571226
4,7.681578,5.173555


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       100 non-null    float64
 1   y       100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB
None

File: data-3.csv


Unnamed: 0,x,y
0,-2.102059,7.014917
1,-1.882548,7.188184
2,3.828216,10.280269
3,3.955505,10.048889
4,1.66786,9.151079


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       300 non-null    float64
 1   y       300 non-null    float64
dtypes: float64(2)
memory usage: 4.8 KB
None

File: data-4.csv


Unnamed: 0,x,y
0,69.941065,10.85095
1,69.941065,20.833335
2,82.93697,16.50135
3,0.0,32.430215
4,66.00881,44.376695


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       118 non-null    float64
 1   y       118 non-null    float64
dtypes: float64(2)
memory usage: 2.0 KB
None

File: data-5.csv


Unnamed: 0,x,y
0,68.003737,6.715228
1,68.061273,18.087055
2,88.684044,17.876819
3,5.484007,32.665511
4,68.040978,41.768251


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       118 non-null    float64
 1   y       118 non-null    float64
dtypes: float64(2)
memory usage: 2.0 KB
None

File: data-6.csv


Unnamed: 0,x,y
0,26.72544,27.718622
1,7.15822,7.974897
2,10.556236,11.682195
3,1.586883,2.825055
4,13.306077,14.408962


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       100 non-null    float64
 1   y       100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB
None


In [3]:
# Compute Summary Statistics for Each Data Set
# For every loaded file, report the mean, standard deviation, minimum and
# maximum of each column.
for fname in datasets:
    df = datasets[fname]
    print(f"\nSummary statistics for {fname}:")
    for col in df.columns:
        print(f"  {col}: mean={df[col].mean():.4f}, std={df[col].std():.4f}, min={df[col].min()}, max={df[col].max()}")


Summary statistics for data-1.csv:
  x: mean=8.7177, std=12.1555, min=-22.635877790008944, max=35.23705548785129
  y: mean=12.5229, std=6.6400, min=-2.565087832693213, max=27.38415261152477

Summary statistics for data-2.csv:
  x: mean=-0.7108, std=11.4816, min=-33.2711130771198, max=27.6470003722073
  y: mean=-0.7490, std=6.2579, min=-18.621803760419063, max=14.14731499231748

Summary statistics for data-3.csv:
  x: mean=0.9689, std=2.2356, min=-2.4490643296742807, max=4.51888491755363
  y: mean=8.5085, std=1.1493, min=6.641734490391763, max=10.551871022804317

Summary statistics for data-4.csv:
  x: mean=49.9051, std=28.5086, min=0.0, max=87.3767
  y: mean=18.5538, std=16.5459, min=0.0, max=44.56504

Summary statistics for data-5.csv:
  x: mean=49.9100, std=28.5000, min=4.337892453674525, max=91.0528633396976
  y: mean=18.5555, std=16.5408, min=0.0554975075811517, max=49.93419703633311

Summary statistics for data-6.csv:
  x: mean=7.2162, std=11.9216, min=-24.33382075234097, max=35.

In [4]:
# Calculate Correlation Coefficient
# Correlate the first two columns of each data set; frames with fewer than
# two columns are skipped.
for fname, df in datasets.items():
    cols = df.columns
    if len(cols) < 2:
        continue
    corr = df[cols[0]].corr(df[cols[1]])
    print(f"Correlation between {cols[0]} and {cols[1]} in {fname}: {corr:.4f}")

Correlation between x and y in data-1.csv: 0.9503
Correlation between x and y in data-2.csv: 0.9552
Correlation between x and y in data-3.csv: 0.9819
Correlation between x and y in data-4.csv: -0.0812
Correlation between x and y in data-5.csv: -0.0882
Correlation between x and y in data-6.csv: 0.9351


## Data Set Descriptions
For each data set, replace the placeholder below with a one-sentence description based on the summary statistics and correlations computed above:

- **data-1.csv**: [Your description here]
- **data-2.csv**: [Your description here]
- **data-3.csv**: [Your description here]
- **data-4.csv**: [Your description here]
- **data-5.csv**: [Your description here]
- **data-6.csv**: [Your description here]

In [5]:
# Load Reddit Comment Counts Data
# The file is read as line-delimited JSON (one record per line).
reddit_counts = pd.read_json(
    'reddit-counts.json.gz',
    lines=True,
)

In [6]:
# Filter Data for 2012-2013 and /r/canada
# Parse the dates first so the .dt accessor is available, then keep only the
# /r/canada rows from 2012 and 2013, ordered chronologically.
reddit_counts['date'] = pd.to_datetime(reddit_counts['date'])
filtered = (
    reddit_counts
    .loc[lambda rows: rows['subreddit'].eq('canada')
         & rows['date'].dt.year.isin([2012, 2013])]
    .copy()
    .sort_values('date')
)
filtered.head()

Unnamed: 0,date,subreddit,comment_count
8406,2012-01-01,canada,995
15469,2012-01-02,canada,1127
2150,2012-01-03,canada,1755
8979,2012-01-04,canada,1501
11947,2012-01-05,canada,1693


In [7]:
# Separate Weekdays and Weekends
def is_weekend(dt):
    """Return True when ``dt`` falls on a Saturday or Sunday.

    ``dt`` must provide a ``weekday()`` method that numbers days from
    Monday == 0, as ``datetime.date`` and ``datetime.datetime`` do, so
    Saturday is 5 and Sunday is 6.
    """
    day_index = dt.weekday()
    return day_index >= 5
# Flag every day, then split the daily comment counts into the two groups.
filtered['is_weekend'] = filtered['date'].apply(is_weekend)
weekdays = filtered.loc[~filtered['is_weekend'], 'comment_count']
weekends = filtered.loc[filtered['is_weekend'], 'comment_count']
print(f"Weekday count: {len(weekdays)}, Weekend count: {len(weekends)}")

Weekday count: 522, Weekend count: 209


In [8]:
# Perform Welch's T-Test
# equal_var=False makes ttest_ind run Welch's unequal-variance t-test rather
# than the classic Student's t-test, so no equal-variance assumption is made.
# The null hypothesis is that weekday and weekend comment counts share a mean.
t_stat, t_pval = ttest_ind(weekdays, weekends, equal_var=False)
print(f"T-test p-value: {t_pval:.4g}")

T-test p-value: 6.139e-60



In [9]:
# Test for Normality and Equal Variance
# normaltest: null hypothesis is that the sample comes from a normal
# distribution. levene: null hypothesis is that both samples have equal
# variance. A small p-value rejects the corresponding assumption.
norm_weekdays = normaltest(weekdays)
norm_weekends = normaltest(weekends)
levene_test = levene(weekdays, weekends)
print(f"Normality p-value (weekdays): {norm_weekdays.pvalue:.4g}")
print(f"Normality p-value (weekends): {norm_weekends.pvalue:.4g}")
print(f"Levene's test p-value: {levene_test.pvalue:.4g}")

Normality p-value (weekdays): 1.009e-07
Normality p-value (weekends): 0.001521
Levene's test p-value: 0.04379


In [10]:
# Transform Data and Re-Test Normality
# Apply each candidate transform to both groups and check whether the result
# looks closer to normal. Note that 'log' uses np.log1p, i.e. log(1 + x).
transforms = dict(log=np.log1p, sqrt=np.sqrt, square=np.square)
for name in transforms:
    func = transforms[name]
    print(f"\nTransformation: {name}")
    wkd, wke = func(weekdays), func(weekends)
    norm_wkd, norm_wke = normaltest(wkd), normaltest(wke)
    print(f"  Normality p-value (weekdays): {norm_wkd.pvalue:.4g}")
    print(f"  Normality p-value (weekends): {norm_wke.pvalue:.4g}")


Transformation: log

  Normality p-value (weekdays): 0.00041
  Normality p-value (weekends): 0.3146

Transformation: sqrt
  Normality p-value (weekdays): 0.03687
  Normality p-value (weekends): 0.1076

Transformation: square
  Normality p-value (weekdays): 2.78e-29
  Normality p-value (weekends): 2.991e-11
  Normality p-value (weekdays): 0.00041
  Normality p-value (weekends): 0.3146

Transformation: sqrt
  Normality p-value (weekdays): 0.03687
  Normality p-value (weekends): 0.1076

Transformation: square
  Normality p-value (weekdays): 2.78e-29
  Normality p-value (weekends): 2.991e-11


In [11]:
# Aggregate by Year/Week and Analyze with Central Limit Theorem
def get_year_week(dt):
    """Return the ``(ISO year, ISO week number)`` pair for ``dt``.

    The ISO year can differ from the calendar year around New Year: for
    example 2012-01-01 belongs to ISO week 52 of 2011.
    """
    iso_year, iso_week, _iso_weekday = dt.isocalendar()
    return (iso_year, iso_week)
# Average the daily counts per (ISO year, ISO week), separately for weekdays
# and weekends. Weeks that lack either a weekday or a weekend value are
# dropped so that both columns cover the same weeks.
filtered['year_week'] = filtered['date'].apply(get_year_week)
agg = (
    filtered
    .groupby(['year_week', 'is_weekend'])['comment_count']
    .mean()
    .unstack()
    .dropna()
)
week_means, weekend_means = agg[False], agg[True]
print(f"Aggregated weeks: {len(agg)}")

# Re-check the normality and equal-variance assumptions on the weekly means
norm_week_means = normaltest(week_means)
norm_weekend_means = normaltest(weekend_means)
levene_agg = levene(week_means, weekend_means)
print(f"Normality p-value (week_means): {norm_week_means.pvalue:.4g}")
print(f"Normality p-value (weekend_means): {norm_weekend_means.pvalue:.4g}")
print(f"Levene's test p-value (agg): {levene_agg.pvalue:.4g}")

# Unequal-variance (Welch) t-test on the weekly means
agg_ttest = ttest_ind(week_means, weekend_means, equal_var=False)
print(f"Aggregated T-test p-value: {agg_ttest.pvalue:.4g}")

Aggregated weeks: 104
Normality p-value (week_means): 0.3267
Normality p-value (weekend_means): 0.1718
Levene's test p-value (agg): 0.1867
Aggregated T-test p-value: 2.298e-34


In [12]:
# Perform Mann–Whitney U-Test
# Two-sided non-parametric comparison of the original (un-aggregated) daily
# weekday and weekend counts; it does not assume normally distributed data.
u_stat, u_pval = mannwhitneyu(weekdays, weekends, alternative='two-sided')
print(f"Mann–Whitney U-test p-value: {u_pval:.4g}")

Mann–Whitney U-test p-value: 8.624e-53


In [13]:
# Output Relevant p-values
# Recap of every p-value computed above, in the order the tests were run on
# the daily data, then the weekly means, then the Mann–Whitney U-test.
# Only values produced by an actual test are listed here; the previous
# hard-coded "Irrelevant p-value: 0.1234" line was not a computed result and
# has been removed.
print("\nRelevant p-values:")
print(f"T-test p-value: {t_pval:.4g}")
print(f"Normality p-value (weekdays): {norm_weekdays.pvalue:.4g}")
print(f"Normality p-value (weekends): {norm_weekends.pvalue:.4g}")
print(f"Levene's test p-value: {levene_test.pvalue:.4g}")
print(f"Aggregated T-test p-value: {agg_ttest.pvalue:.4g}")
print(f"Normality p-value (week_means): {norm_week_means.pvalue:.4g}")
print(f"Normality p-value (weekend_means): {norm_weekend_means.pvalue:.4g}")
print(f"Levene's test p-value (agg): {levene_agg.pvalue:.4g}")
print(f"Mann–Whitney U-test p-value: {u_pval:.4g}")


Relevant p-values:
T-test p-value: 6.139e-60
Normality p-value (weekdays): 1.009e-07
Normality p-value (weekends): 0.001521
Levene's test p-value: 0.04379
Aggregated T-test p-value: 2.298e-34
Normality p-value (week_means): 0.3267
Normality p-value (weekend_means): 0.1718
Levene's test p-value (agg): 0.1867
Mann–Whitney U-test p-value: 8.624e-53
Irrelevant p-value: 0.1234


In [14]:
# Debug: Check columns and preview filtered DataFrame
# Sanity check that the derived is_weekend and year_week columns are present.
print(filtered.columns)
display(filtered.head(5))

Index(['date', 'subreddit', 'comment_count', 'is_weekend', 'year_week'], dtype='object')


Unnamed: 0,date,subreddit,comment_count,is_weekend,year_week
8406,2012-01-01,canada,995,True,"(2011, 52)"
15469,2012-01-02,canada,1127,False,"(2012, 1)"
2150,2012-01-03,canada,1755,False,"(2012, 1)"
8979,2012-01-04,canada,1501,False,"(2012, 1)"
11947,2012-01-05,canada,1693,False,"(2012, 1)"
