# Data Acquisition Notebook

This notebook handles the data acquisition process for the CheckDi project. It scrapes news data from the Anti-Fake News Center (AFNC) website and creates a dataset for training the fake news detection model.

In [None]:
# Import required modules
import sys
import os

# Make the parent of the current working directory importable so that the
# `src` package can be imported below. The path is normalized and only added
# when absent, so re-running this cell does not pile up duplicate entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.core.scraper import main as scrape_data
import pandas as pd
import logging

In [None]:
# Logging setup: create this notebook's named logger, then ask the root logger
# to emit INFO and above (basicConfig has no effect if the root logger already
# has handlers attached).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [None]:
# Run the data scraping process.
# `scrape_data` is `main` from `src.core.scraper` (imported above); it is called
# with no arguments and its return value is ignored.
# NOTE(review): presumably this writes ../data/raw/news.csv, which the next cell
# reads -- confirm against the scraper module.
print("Starting data acquisition process...")
scrape_data()
# Reached only if scrape_data() returned without raising.
print("Data acquisition completed!")

In [None]:
# Load and examine the scraped data.
# Only the file read is guarded: a problem in the inspection code below must
# not be misreported as a loading failure.
try:
    df = pd.read_csv('../data/raw/news.csv')
except FileNotFoundError:
    print("Data file not found. Please run the scraping process first.")
except Exception as e:
    # Notebook-level boundary: report any other read/parse failure as a
    # short message rather than a full traceback.
    print(f"Error loading data: {e}")
else:
    # Success path: summarize the dataset that was just loaded.
    print(f"Dataset shape: {df.shape}")
    print("\nFirst few rows:")
    display(df.head())

    print("\nLabel distribution:")
    if 'label' in df.columns:
        print(df['label'].value_counts())
    else:
        # Keep going so the missing-value summary is still shown.
        print("Column 'label' not found in the dataset.")

    print("\nMissing values:")
    print(df.isnull().sum())

## Next Steps

After running this notebook:
1. Check the `data/raw/news.csv` file (relative to the project root) for the scraped data
2. Proceed to the data exploration notebook for further analysis
3. Move to the data preparation notebook to clean and prepare the data for modeling