In [None]:
# Exploratory Data Analysis (EDA)
'''
This notebook explores the dataset prior to modeling.  
All heavy cleaning and feature engineering are handled in the scripts (`src/data_preparation.py`).  
Here, we visualize, summarize, and find key patterns.
'''

In [None]:
# Imports and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# Import the loading functions from your script
from src.data_preparation import load_and_prepare_data, load_raw_dataframe

# Load the dataset
# `x` / `y` are produced by the project's preparation function; `df` is the
# frame returned by `load_raw_dataframe`, kept for more EDA flexibility.
file_path = "./mnt/data/mini_project_1_data.csv"
x, y = load_and_prepare_data(file_path)
df = load_raw_dataframe(file_path)

print(f"Features shape: {x.shape}")
print(f"Target shape: {y.shape}")

# Check for Missing Values and Duplicates
missing_per_column = df.isna().sum()
print("Missing values per column:", missing_per_column, sep="\n")

# duplicated() counts repeats after the first occurrence; keep=False selects
# every copy so that identical rows end up next to each other once sorted.
duplicate_count = df.duplicated().sum()
print(f"\nDuplicated rows: {duplicate_count}")
all_duplicate_copies = df.loc[df.duplicated(keep=False)]
print(all_duplicate_copies.sort_values(by=df.columns.tolist()))

# Target Variable Distribution
# Count plot of the target labels, configured through the returned Axes object.
target_ax = sns.countplot(x=y)
target_ax.set_title("Distribution of High Share (Target Variable)")
target_ax.set_xlabel("High Share (1=Above Median)")
target_ax.set_ylabel("Count")
target_ax.grid(True)
plt.show()

# Feature Distributions
# One count plot per categorical feature, described as (column, title, x-axis label).
categorical_plots = [
    ('weekday', "Articles by Day", 'Day'),
    ('data_channel', "Articles by Data Channel", 'Data channel'),
]

for column, plot_title, x_label in categorical_plots:
    count_ax = sns.countplot(x=column, data=df)
    count_ax.set_title(plot_title)
    count_ax.set_xlabel(x_label)
    count_ax.set_ylabel('Count')
    count_ax.grid(True)
    plt.xticks(rotation=45)  # rotate the category labels on the x axis
    plt.show()

# Numeric Features & Boxplots
# Histogram (30 bins) with a KDE overlay for every numeric column of `df`.
numeric_cols = list(df.select_dtypes(include='number').columns)

for numeric_col in numeric_cols:
    hist_fig, hist_ax = plt.subplots(figsize=(6, 3))
    sns.histplot(df[numeric_col], kde=True, bins=30, ax=hist_ax)
    hist_ax.set_title(f"Distribution of {numeric_col}")
    plt.show()

# ---------- First Boxplot: Shares by Weekday ----------
# Box plot of `shares` for each `weekday` value.
box_ax = sns.boxplot(data=df, x='weekday', y='shares')
box_ax.set_title("Shares by Day")
box_ax.set_xlabel('Day')
box_ax.set_ylabel('Share')
box_ax.grid(True)
plt.xticks(rotation=45)  # rotate the weekday labels on the x axis
plt.show()

# Relationships & Correlations
# Scatterplot (drawn only when the content-length column is present)
has_token_count = 'n_tokens_content' in df.columns
if has_token_count:
    scatter_ax = sns.scatterplot(data=df, x='n_tokens_content', y='shares')
    scatter_ax.set_title("Shares vs Number of Tokens in Content")
    plt.show()

# Correlation Heatmap
# Pairwise correlations between the numeric columns, annotated to two decimals.
numeric_frame = df.select_dtypes(include='number')
corr = numeric_frame.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

# Scenario A: Viral despite odds
# Goal: Find viral articles with unexpectedly high popularity despite odds
# (lacking common "success factors").
# "Lacking common success factors" is defined here as: short article, no images,
# no hyperlinks, no self-hyperlinks, and few comments.
viral_mask = df['shares'] > df['shares'].quantile(0.95)  # very high shares (top 5%)

# Each optional filter is applied only when its column exists in `df`.
# Looking a column up with `df[...]` inside the condition would raise KeyError
# for a missing column, so presence is checked once, up front, and every
# skipped filter is reported instead of being silently treated as satisfied.
skipped_filters = []

# Below-median filters: short article, few comments
for below_median_col in ('n_tokens_content', 'num_comments'):
    if below_median_col in df.columns:
        viral_mask = viral_mask & (df[below_median_col] < df[below_median_col].median())
    else:
        skipped_filters.append(below_median_col)

# Zero-count filters: no images, no hyperlinks, no self-hyperlinks
for zero_count_col in ('num_imgs', 'num_hrefs', 'num_self_hrefs'):
    if zero_count_col in df.columns:
        viral_mask = viral_mask & (df[zero_count_col] == 0)
    else:
        skipped_filters.append(zero_count_col)

if skipped_filters:
    print(f"Scenario A: column(s) not in dataset, filter skipped: {skipped_filters}")

outliers = df[viral_mask]
# print() is used because a bare expression in the middle of a cell is not rendered.
print(outliers.describe().T)
# Insight (hypothesis): High shares despite short length, no images, and few comments
# suggest the impact of strong headlines or timely relevance.

# Scenario B: Viral business articles on weekends
# Goal: Find which categories or weekdays produce "viral despite odds" articles.
# NOTE(review): the labels 'Saturday', 'Sunday' and 'Business' must match the
# values stored in df['weekday'] / df['data_channel'] exactly; a mismatch
# silently yields an empty frame -- confirm against the data.
share_cutoff = df['shares'].quantile(0.95)  # top 5% of shares
is_viral = df['shares'] > share_cutoff
is_weekend = df['weekday'].isin(['Saturday', 'Sunday'])
is_business = df['data_channel'] == 'Business'

unusual_outliers = df[is_viral & is_weekend & is_business]

# Both summaries are printed explicitly: bare expressions in the middle of a
# cell are evaluated but never rendered, so their results would be lost.
print(unusual_outliers.describe().T)
# You can also group and count to get more insight:
print(unusual_outliers.groupby(['weekday', 'data_channel']).size())
# Insight (hypothesis): Unusually high shares for weekend business articles hint at
# special events or changing reader habits influencing engagement.

# Scenario C: No-sibling outliers
# NOTE(review): this scenario filters on `siblings` and `attendance` and its
# goal/insight talk about students, while the rest of this notebook analyses
# article shares -- it looks carried over from another analysis; confirm it
# belongs here.
# Goal: Find top students who succeed without family/school support—indicating
# strong self-motivation, alternative learning strategies/special circumstances.
scenario_c_columns = ['siblings', 'attendance']
missing_scenario_c_columns = [
    name for name in scenario_c_columns if name not in df.columns
]

if missing_scenario_c_columns:
    # Without these columns the filter would degenerate to "all top-5% rows"
    # under a misleading name, so report it and return an empty selection.
    print(f"Scenario C skipped: column(s) not in dataset: {missing_scenario_c_columns}")
    no_sibling_outliers = df.iloc[0:0]
else:
    no_sibling_outliers = df[
        (df['shares'] > df['shares'].quantile(0.95)) &  # top 5% performers
        (df['siblings'] == 0) &                         # no siblings
        (df['attendance'] == 'Never')                   # never attends
    ]
no_sibling_outliers.describe().T
# Insight (hypothesis, only meaningful when the columns above exist): Some students
# excel without family or school support, hinting at personal discipline or
# alternative learning strategies.

In [None]:
# Insights & Next Steps
# (Markdown)
"""
# Insights

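- Target variable (`high_share`): TODO - state whether the classes are approximately balanced or imbalanced (see the target distribution plot).
- Weekday and channel distributions: TODO - summarize what the count plots show.
- Key features correlated with 'shares': TODO - list them from the correlation heatmap.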
- Outliers may be present in 'shares' or other numeric columns.

**Next:** Move to modeling and feature engineering in scripts, not in this notebook!
"""