# Part 2: Hypothesis Testing & Analysis

## H1: Temporal Impact Analysis
### Does the time of day when a post is published significantly affect its reach?

In [None]:
# Per-hour summary of Views: mean, median and standard deviation,
# rounded to whole numbers for readability.
hourly_reach = (
    df.groupby('Upload_Hour')['Views']
    .agg(['mean', 'median', 'std'])
    .round(0)
)

# The bars and the overlaid line both show the hourly mean; the line only
# makes the hour-to-hour trend easier to follow.
hours = hourly_reach.index
mean_views = hourly_reach['mean']

fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(hours, mean_views, alpha=0.7, color='skyblue', label='Average Views')
ax.plot(hours, mean_views, color='red', marker='o', linewidth=2, markersize=6, label='Trend')

ax.set_title('Average Reach by Posting Hour', fontsize=14, fontweight='bold')
ax.set_xlabel('Hour of Day', fontsize=12, fontweight='bold')
ax.set_ylabel('Average Views', fontsize=12, fontweight='bold')
ax.set_xticks(range(0, 24))  # one tick per hour slot 0..23
ax.grid(True, alpha=0.3)
ax.legend()

fig.tight_layout()
fig.savefig('../visualizations/h1_hourly_reach.png', dpi=300)
plt.show()

# Five hours with the highest mean Views
print('Peak posting hours:')
print(hourly_reach.nlargest(5, 'mean'))

In [None]:
# Heatmap of mean Views for every (day of week, hour of day) combination.
# Rows are reindexed so the days appear Monday -> Sunday rather than in
# the alphabetical order pivot_table produces.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pivot_table = (
    df.pivot_table(values='Views', index='Upload_Day', columns='Upload_Hour', aggfunc='mean')
    .reindex(day_order)
)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 6))
sns.heatmap(
    pivot_table,
    annot=False,
    fmt='.0f',
    cmap='YlOrRd',
    linewidths=0.5,
    cbar_kws={'label': 'Average Views'},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Reach Heatmap: Day of Week vs Hour of Day', fontsize=14, fontweight='bold')
heatmap_ax.set_xlabel('Hour of Day', fontsize=12)
heatmap_ax.set_ylabel('Day of Week', fontsize=12)

heatmap_fig.tight_layout()
heatmap_fig.savefig('../visualizations/h1_heatmap.png', dpi=300)
plt.show()

In [None]:
# Statistical Test: one-way ANOVA of Views across the Time_Period groups.
#
# Rows with a missing Time_Period or a missing Views value are removed first.
# Comparing against a NaN period label yields an empty group, and a NaN view
# count propagates through its group; either makes f_oneway return NaN, and
# `NaN < 0.05` is False, which would silently print "FAIL TO REJECT".
anova_data = df[['Time_Period', 'Views']].dropna()
groups_by_period = [
    views for _, views in anova_data.groupby('Time_Period', observed=True)['Views']
]

# f_oneway needs at least two groups; with fewer the test is undefined.
if len(groups_by_period) >= 2:
    f_stat, p_value = stats.f_oneway(*groups_by_period)
else:
    f_stat, p_value = float('nan'), float('nan')

print('\n' + '='*60)
print('H1: Statistical Test (ANOVA)')
print('='*60)
print(f'F-statistic: {f_stat:.4f}')
print(f'P-value: {p_value:.6f}')

if np.isnan(p_value):
    # Report an undefined test explicitly instead of as a non-significant one.
    print('\nConclusion: INCONCLUSIVE')
    print('The ANOVA could not be computed (fewer than two groups or degenerate data).')
elif p_value < 0.05:
    print('\nConclusion: REJECT null hypothesis')
    print('There IS a statistically significant difference in reach based on posting time.')
else:
    print('\nConclusion: FAIL TO REJECT null hypothesis')
    print('There is NO statistically significant difference in reach based on posting time.')

# Box plot by time period (log-scaled y axis because Views is drawn on a
# multiplicative scale here).
# NOTE(review): the `order` list assumes Time_Period uses exactly these four
# labels - confirm against the cell that creates the column.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Time_Period', y='Views', order=['Late Night', 'Morning', 'Afternoon', 'Evening'])
plt.title('Reach Distribution by Time Period', fontsize=14, fontweight='bold')
plt.xlabel('Time Period', fontsize=12)
plt.ylabel('Views', fontsize=12)
plt.yscale('log')
plt.tight_layout()
plt.savefig('../visualizations/h1_boxplot.png', dpi=300)
plt.show()

## H2: Engagement Correlation Analysis
### What is the relationship between likes, comments, shares and total views?

In [None]:
# Pairwise correlations (DataFrame.corr default) between the engagement metrics
engagement_cols = ['Views', 'Likes', 'Comments', 'Shares', 'Engagement_Rate']
corr_matrix = df.loc[:, engagement_cols].corr()

# Annotated heatmap; the diverging colormap is centred on 0 so positive and
# negative correlations get opposite hues.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={'shrink': 0.8},
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix: Engagement Metrics', fontsize=14, fontweight='bold')

corr_fig.tight_layout()
corr_fig.savefig('../visualizations/h2_correlation.png', dpi=300)
plt.show()

# Rank every metric by its correlation with Views, strongest first
views_corr = corr_matrix['Views'].sort_values(ascending=False)
print('\nCorrelation with Views:')
print(views_corr)

In [None]:
# Scatter plots: Views against each engagement metric, one panel per metric,
# each with a least-squares regression line and the Pearson correlation.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

metrics = ['Likes', 'Comments', 'Shares']
colors = ['blue', 'green', 'red']

for ax, metric, color in zip(axes, metrics, colors):
    # Drop rows where either value is missing so the fit and the correlation
    # are computed on the same complete pairs (NaNs break polyfit/pearsonr).
    pair = df[[metric, 'Views']].dropna()
    x, y = pair[metric], pair['Views']

    ax.scatter(x, y, alpha=0.5, s=30, c=color)

    # Degree-1 least-squares fit. A straight line only needs its two
    # endpoints, so it is drawn from min(x) to max(x) instead of through
    # every data point. Black is used because a red line is hard to see
    # on the red 'Shares' scatter.
    z = np.polyfit(x, y, 1)
    p = np.poly1d(z)
    x_line = np.array([x.min(), x.max()])
    ax.plot(x_line, p(x_line), color='black', linestyle='--', linewidth=2, label='Regression Line')

    # Pearson correlation coefficient (r, not R-squared)
    r_value = stats.pearsonr(x, y)[0]
    ax.text(0.05, 0.95, f'R = {r_value:.3f}', transform=ax.transAxes,
            fontsize=12, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    ax.set_xlabel(metric, fontsize=12, fontweight='bold')
    ax.set_ylabel('Views', fontsize=12, fontweight='bold')
    ax.set_title(f'Views vs {metric}', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../visualizations/h2_scatter.png', dpi=300)
plt.show()

In [None]:
# Statistical tests: Pearson correlation of each engagement metric with Views.
print('\n' + '='*60)
print('H2: Correlation Analysis (Pearson Correlation)')
print('='*60)

for metric in ['Likes', 'Comments', 'Shares']:
    # Use only rows where both values are present so missing data does not
    # turn the coefficient into NaN.
    pair = df[[metric, 'Views']].dropna()
    corr, p_val = stats.pearsonr(pair[metric], pair['Views'])

    # Strength thresholds: |r| > 0.7 strong, |r| > 0.4 moderate, otherwise weak.
    # NaN (e.g. a constant column) and exactly 0 are reported explicitly
    # rather than falling through to "Weak negative".
    if np.isnan(corr):
        interpretation = 'Undefined (constant or missing values)'
    elif corr == 0:
        interpretation = 'No linear correlation'
    else:
        if abs(corr) > 0.7:
            strength = 'Strong'
        elif abs(corr) > 0.4:
            strength = 'Moderate'
        else:
            strength = 'Weak'
        direction = 'positive' if corr > 0 else 'negative'
        interpretation = f'{strength} {direction} correlation'

    print(f'\n{metric} vs Views:')
    print(f'  Correlation Coefficient: {corr:.4f}')
    print(f'  P-value: {p_val:.6f}')
    print(f'  Interpretation: {interpretation}')