# Time-Based Analysis of Fake News Patterns

This notebook analyzes how fake news characteristics evolve over time.

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Notebook-wide plotting defaults: seaborn's white-grid style and 12x8 inch figures
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Read the four source CSVs. Each frame is tagged on load with its class
# label (1 = fake news, 0 = real news) and the dataset it belongs to.
bf_fake_df = pd.read_csv("../data/BuzzFeed_fake_news_content.csv").assign(label=1, dataset='BuzzFeed')
bf_real_df = pd.read_csv("../data/BuzzFeed_real_news_content.csv").assign(label=0, dataset='BuzzFeed')
pf_fake_df = pd.read_csv("../data/PolitiFact_fake_news_content.csv").assign(label=1, dataset='PolitiFact')
pf_real_df = pd.read_csv("../data/PolitiFact_real_news_content.csv").assign(label=0, dataset='PolitiFact')

# Stack everything into one frame (fresh 0..n-1 index) for the temporal analysis
all_data = pd.concat(
    [bf_fake_df, bf_real_df, pf_fake_df, pf_real_df],
    ignore_index=True,
)

In [None]:
# Function to extract date from various formats
def extract_date(date_str):
    """Parse a raw ``publish_date`` value into a naive ``datetime``.

    Two input shapes are recognised:

    * a dict-like string with a ``$date`` key holding a Unix timestamp in
      milliseconds, e.g. ``"{'$date': 1474243200000}"``;
    * a plain date string in one of the formats listed in ``date_formats``.

    Parameters
    ----------
    date_str : Any
        Raw cell value. Strings are stripped of surrounding whitespace
        before parsing.

    Returns
    -------
    datetime or None
        A timezone-naive ``datetime``. Timestamps are interpreted in UTC so
        the result does not depend on the machine's local timezone.
        ``None`` is returned for missing values, non-string values and
        strings that match none of the supported shapes.
    """
    # Local import: the notebook's import cell only brings in `datetime` itself.
    from datetime import timezone

    if pd.isna(date_str):
        return None

    if isinstance(date_str, str):
        date_str = date_str.strip()

    # Dict-like payload with a $date key (Unix timestamp in milliseconds)
    if isinstance(date_str, str) and date_str.startswith('{'):
        try:
            date_dict = json.loads(date_str.replace("'", '"'))
            if '$date' in date_dict:
                timestamp = int(date_dict['$date']) / 1000  # milliseconds -> seconds
                # Convert in UTC, then drop tzinfo so the value is comparable
                # with the naive datetimes produced by strptime below.
                return datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)
        except (json.JSONDecodeError, ValueError, TypeError, OverflowError, OSError):
            # Malformed JSON, a non-numeric $date payload, or a timestamp the
            # platform cannot represent: fall through to the plain formats.
            pass

    # Try common date formats, first match wins
    date_formats = [
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d-%m-%Y',
        '%d/%m/%Y',
        '%B %d, %Y',
        '%b %d, %Y'
    ]

    for fmt in date_formats:
        try:
            return datetime.strptime(date_str, fmt)
        except (ValueError, TypeError):
            # ValueError: format mismatch; TypeError: value is not a string
            continue

    return None

# Standardize dates. The result is coerced to datetime64 explicitly so that
# missing/unparseable values are always NaT, independent of how pandas
# inferred the dtype of the apply() result.
all_data['parsed_date'] = pd.to_datetime(
    all_data['publish_date'].apply(extract_date), errors='coerce'
)

# Calendar components for easier analysis (NaN where the date is missing)
all_data['year'] = all_data['parsed_date'].dt.year
all_data['month'] = all_data['parsed_date'].dt.month
all_data['day'] = all_data['parsed_date'].dt.day

# Keep only rows with a usable date. Every remaining row has year/month/day,
# so they can be stored as real integers (2016 rather than 2016.0), which
# keeps printed years and categorical plot axes clean.
dated_data = (
    all_data
    .dropna(subset=['parsed_date'])
    .astype({'year': int, 'month': int, 'day': int})
)

# Day of week, Monday = 0 ... Sunday = 6
dated_data['day_of_week'] = dated_data['parsed_date'].dt.weekday

# Show how many articles have valid dates (guarding against an empty frame)
dated_share = len(dated_data) / len(all_data) * 100 if len(all_data) else 0.0
print(f"Total articles: {len(all_data)}")
print(f"Articles with valid dates: {len(dated_data)} ({dated_share:.2f}%)")

In [None]:
# Distribution of articles over time by real/fake label.
# The hue column carries the display names itself. Relabelling afterwards with
# plt.legend([...]) assigns names to artists by position, which is not tied to
# the hue level each artist represents and can silently swap Real and Fake.
label_names = {0: 'Real News', 1: 'Fake News'}
hist_data = dated_data.assign(article_type=dated_data['label'].map(label_names))

fig, ax = plt.subplots(figsize=(14, 6))
sns.histplot(
    data=hist_data,
    x='parsed_date',
    hue='article_type',
    hue_order=list(label_names.values()),
    bins=20,
    element='step',
    ax=ax,
)
ax.set_title('Distribution of Articles Over Time by Label')
ax.set_xlabel('Publication Date')
ax.set_ylabel('Number of Articles')
ax.tick_params(axis='x', labelrotation=45)

# seaborn titles the legend with the column name; give it a readable title.
legend = ax.get_legend()
if legend is not None:  # no legend is drawn when there is nothing to plot
    legend.set_title('Article type')

fig.tight_layout()
plt.show()

In [None]:
# Articles by month of year.
# Months are mapped to their names and the full Jan..Dec order is passed to
# countplot, so every month gets its own slot even when it has no articles.
# Relabelling ticks by position instead would attach the wrong month names
# whenever a month is absent from the data.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
label_names = {0: 'Real News', 1: 'Fake News'}
month_data = dated_data.assign(
    # astype(int): the month column may be stored as float (1.0, 2.0, ...)
    month_name=dated_data['month'].astype(int).map(dict(enumerate(month_names, start=1))),
    article_type=dated_data['label'].map(label_names),
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=month_data,
    x='month_name',
    order=month_names,
    hue='article_type',
    hue_order=list(label_names.values()),
    ax=ax,
)
ax.set_title('Distribution of Articles by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Articles')

# seaborn titles the legend with the column name; give it a readable title.
legend = ax.get_legend()
if legend is not None:  # no legend is drawn when there is nothing to plot
    legend.set_title('Article type')

fig.tight_layout()
plt.show()

In [None]:
# Articles by day of week.
# Weekday numbers (Monday = 0) are mapped to names and the full Monday..Sunday
# order is passed to countplot, so each weekday keeps its slot even when it has
# no articles. Relabelling ticks by position would mislabel bars whenever a
# weekday is absent from the data.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
label_names = {0: 'Real News', 1: 'Fake News'}
weekday_data = dated_data.assign(
    weekday_name=dated_data['day_of_week'].astype(int).map(dict(enumerate(weekday_names))),
    article_type=dated_data['label'].map(label_names),
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=weekday_data,
    x='weekday_name',
    order=weekday_names,
    hue='article_type',
    hue_order=list(label_names.values()),
    ax=ax,
)
ax.set_title('Distribution of Articles by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Number of Articles')

# seaborn titles the legend with the column name; give it a readable title.
legend = ax.get_legend()
if legend is not None:  # no legend is drawn when there is nothing to plot
    legend.set_title('Article type')

fig.tight_layout()
plt.show()

In [None]:
# Create a combined text field (title + body); missing parts count as empty strings
dated_data['combined_text'] = dated_data['title'].fillna('') + ' ' + dated_data['text'].fillna('')

# Distinct publication years, ascending. Converted to plain ints because the
# year column may be stored as float, and float years (2016.0) would leak into
# the printed output and into the period definitions built from this list.
years = sorted(int(y) for y in dated_data['year'].dropna().unique())
print(f"Years in dataset: {years}")

In [None]:
# Function to analyze language patterns for a given year
def analyze_year_language(year):
    """Find the highest-scoring TF-IDF terms of fake and real articles for one year.

    Reads the notebook-level ``dated_data`` frame and uses its ``year``,
    ``label`` (1 = fake, 0 = real) and ``combined_text`` columns.

    Parameters
    ----------
    year
        Year value to match against ``dated_data['year']``.

    Returns
    -------
    tuple
        ``(fake_top_words, real_top_words)``. Each element is a list of up to
        five ``(term, mean_tfidf)`` pairs, highest mean first, drawn from a
        vocabulary capped at 20 terms with English stop words removed. Both
        lists are empty when the year has no articles for one of the classes;
        a single list is empty when vectorizing that class raises ``ValueError``.
    """
    in_year = dated_data[dated_data['year'] == year]
    texts_by_label = {
        flag: in_year.loc[in_year['label'] == flag, 'combined_text'].tolist()
        for flag in (1, 0)
    }

    # Both classes must be present for the year to be reported
    if not (texts_by_label[1] and texts_by_label[0]):
        return ([], [])

    # One vectorizer, re-fitted from scratch for each class
    vectorizer = TfidfVectorizer(max_features=20, stop_words='english')

    def top_five(documents):
        """Fit on ``documents`` and return the five terms with the largest mean score."""
        try:
            matrix = vectorizer.fit_transform(documents)
            vocab = vectorizer.get_feature_names_out()
            mean_scores = np.mean(matrix.toarray(), axis=0)
            ranked = mean_scores.argsort()[::-1][:5]
            return [(vocab[idx], mean_scores[idx]) for idx in ranked]
        except ValueError:
            return []

    fake_top_words = top_five(texts_by_label[1])
    real_top_words = top_five(texts_by_label[0])
    return (fake_top_words, real_top_words)

# Run the language analysis once for every year found in the data
year_language = {yr: analyze_year_language(yr) for yr in years}

# Print the top words per year; years lacking results for either class are skipped
for year in years:
    fake_top, real_top = year_language[year]
    if not (fake_top and real_top):
        continue

    print(f"\nYear: {year}")
    for heading, ranked_words in (
        ("Top words in FAKE news:", fake_top),
        ("\nTop words in REAL news:", real_top),
    ):
        print(heading)
        for word, score in ranked_words:
            print(f"  {word}: {score:.4f}")

In [None]:
# Prepare data for time-based model evaluation
def train_test_by_time(train_years, test_years):
    """Build a year-filtered train/test split of article text and labels.

    The training side always comes from the BuzzFeed frames
    (``bf_fake_df``, ``bf_real_df``) and the test side always from the
    PolitiFact frames (``pf_fake_df``, ``pf_real_df``), so every split is
    cross-dataset as well as (possibly) cross-period.

    Parameters
    ----------
    train_years : list-like
        Publication years to keep on the BuzzFeed (training) side.
    test_years : list-like
        Publication years to keep on the PolitiFact (test) side.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``, where the ``X`` values are
        Series of "title + text" strings and the ``y`` values are the
        matching ``label`` Series. ``(None, None, None, None)`` is returned
        when either side has no articles in the requested years.
    """

    def _articles_in_years(frames, wanted_years):
        """Concatenate frames, derive the publish year and keep rows in wanted_years."""
        articles = pd.concat(frames, ignore_index=True)
        # Coerce to datetime64 so missing dates are NaT and the year is NaN
        parsed = pd.to_datetime(articles['publish_date'].apply(extract_date), errors='coerce')
        articles = articles.assign(parsed_date=parsed, year=parsed.dt.year)
        articles = articles.dropna(subset=['year'])
        # .copy() so the column added below is written to an independent frame
        # rather than to a filtered view (avoids SettingWithCopyWarning).
        articles = articles[articles['year'].isin(wanted_years)].copy()
        articles['combined_text'] = articles['title'].fillna('') + ' ' + articles['text'].fillna('')
        return articles

    bf_train = _articles_in_years([bf_fake_df, bf_real_df], train_years)
    pf_test = _articles_in_years([pf_fake_df, pf_real_df], test_years)

    if len(bf_train) == 0 or len(pf_test) == 0:
        return None, None, None, None

    X_train = bf_train['combined_text']
    y_train = bf_train['label']
    X_test = pf_test['combined_text']
    y_test = pf_test['label']

    return X_train, y_train, X_test, y_test

In [None]:
# Define time periods for analysis based on actual years in your dataset
# You may need to adjust these based on the years present in your data
early_years = [min(years), min(years) + 1]
later_years = [max(years) - 1, max(years)]

time_periods = [
    {'name': 'Early', 'train_years': early_years, 'test_years': early_years},
    {'name': 'Later', 'train_years': later_years, 'test_years': later_years},
    {'name': 'Train-Early-Test-Later', 'train_years': early_years, 'test_years': later_years},
    {'name': 'Train-Later-Test-Early', 'train_years': later_years, 'test_years': early_years}
]


def _fit_and_score(model, X_train_vec, y_train, X_test_vec, y_test):
    """Fit ``model`` on the training matrix and return (accuracy, F1) on the test matrix."""
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    # zero_division=0 keeps the score at 0 without a warning when no positive
    # class is predicted or present.
    return accuracy_score(y_test, preds), f1_score(y_test, preds, zero_division=0)


# Evaluate models for each time period
results = []

for period in time_periods:
    X_train, y_train, X_test, y_test = train_test_by_time(period['train_years'], period['test_years'])

    if X_train is None:
        print(f"Skipping period {period['name']} - insufficient data")
        continue

    # A classifier cannot be fitted on a single class; skip instead of crashing
    # when the training window happens to contain only fake or only real articles.
    if y_train.nunique() < 2:
        print(f"Skipping period {period['name']} - training data contains a single class")
        continue

    print(f"\nEvaluating period: {period['name']}")
    print(f"Training on years {period['train_years']} ({len(X_train)} articles)")
    print(f"Testing on years {period['test_years']} ({len(X_test)} articles)")

    # TF-IDF Vectorization: vocabulary is learned from the training texts only
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Logistic Regression
    lr_model = LogisticRegression(max_iter=1000, C=1.0)
    lr_accuracy, lr_f1 = _fit_and_score(lr_model, X_train_tfidf, y_train, X_test_tfidf, y_test)

    # Random Forest (fixed seed for reproducible results)
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_accuracy, rf_f1 = _fit_and_score(rf_model, X_train_tfidf, y_train, X_test_tfidf, y_test)

    print(f"\nLogistic Regression - Accuracy: {lr_accuracy:.4f}, F1 Score: {lr_f1:.4f}")
    print(f"Random Forest - Accuracy: {rf_accuracy:.4f}, F1 Score: {rf_f1:.4f}")

    results.append({
        'period': period['name'],
        'lr_accuracy': lr_accuracy,
        'lr_f1': lr_f1,
        'rf_accuracy': rf_accuracy,
        'rf_f1': rf_f1
    })

In [None]:
# Convert results to a dataframe and show it.
# The frame is built unconditionally, with a fixed column set, so it exists
# (possibly empty) even when every period was skipped. The bare expression
# sits at the top level of the cell because Jupyter only displays a trailing
# expression there, not one nested inside an `if` block.
results_df = pd.DataFrame(
    results,
    columns=['period', 'lr_accuracy', 'lr_f1', 'rf_accuracy', 'rf_f1'],
)
results_df

In [None]:
# Visualize accuracy across time periods (grouped bars, one pair per period)
if 'results_df' in locals() and len(results_df) > 0:
    fig, ax = plt.subplots(figsize=(14, 6))
    bar_width = 0.35
    index = np.arange(len(results_df))

    ax.bar(index, results_df['lr_accuracy'], bar_width, label='Logistic Regression')
    ax.bar(index + bar_width, results_df['rf_accuracy'], bar_width, label='Random Forest')

    ax.set_xlabel('Time Period')
    ax.set_ylabel('Accuracy')
    ax.set_title('Model Accuracy Across Different Time Periods')
    # Centre each tick between the two bars of its group
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels(results_df['period'])
    ax.legend()

    # Start the y-axis at 0.5 for better visualization, but lower the floor
    # when any model scores below 0.5; a fixed floor would hide those bars.
    lowest_accuracy = results_df[['lr_accuracy', 'rf_accuracy']].min().min()
    y_floor = 0.5 if lowest_accuracy >= 0.5 else max(0.0, lowest_accuracy - 0.05)
    ax.set_ylim(y_floor, 1.0)

    fig.tight_layout()
    plt.show()

## Conclusions

The time-based analysis reveals how fake news patterns evolve and affect model performance over different time periods. Key insights include:

1. Publication patterns show when fake news is most likely to appear
2. The language and topics of fake news change over time
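3. Models perform differently when trained and tested on different time periods. Note that every model here is trained on BuzzFeed articles and tested on PolitiFact articles, so these differences reflect the change of source as well as the change of period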
4. This suggests fake news detection systems need regular updates to remain effective