In [15]:
import os
import pandas as pd
import spacy
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# List of CSV files to process
csv_files = [
    'dataset/gossipcop_fake.csv',
    'dataset/gossipcop_real.csv',
    'dataset/politifact_fake.csv',
    'dataset/politifact_real.csv'
]

In [17]:
# Save updated CSV
def save_updated_csv(data, original_filename, step, step_name):
    base_name = os.path.basename(original_filename)
    name, ext = os.path.splitext(base_name)
    new_filename = f"{name}_{step}_{step_name}_updated{ext}"
    output_path = os.path.join('output', new_filename)
    os.makedirs('output', exist_ok=True)
    data.to_csv(output_path, index=False)
    print(f"Updated CSV after step {step} ({step_name}) saved as {output_path}")

In [18]:
# Preprocessing text
def preprocess_text(data, filename):
    print("Step 1: Preprocessing text...")
    import re
    data['cleaned_text'] = data['title'].apply(lambda text: re.sub(r'[^a-zA-Z0-9 ]', '', re.sub(r'<.*?>', '', text.lower())))
    save_updated_csv(data, filename, 'step1', 'preprocessing')
    return data

In [20]:
# Extract Named Entities
def extract_entities(text, nlp):
    doc = nlp(text)
    entity_counts = {'ORG': 0, 'GPE': 0, 'PERSON': 0}
    for ent in doc.ents:
        if ent.label_ in entity_counts:
            entity_counts[ent.label_] += 1
    return entity_counts

In [21]:
def extract_features(data, filename):
    print("Step 2: Extracting Named Entities...")
    nlp = spacy.load('en_core_web_sm')
    entity_features = data['cleaned_text'].apply(lambda x: extract_entities(x, nlp))
    data['org_count'] = entity_features.apply(lambda x: x['ORG'])
    data['gpe_count'] = entity_features.apply(lambda x: x['GPE'])
    data['person_count'] = entity_features.apply(lambda x: x['PERSON'])
    save_updated_csv(data, filename, 'step2', 'entity_extraction')
    return data

In [22]:
# Calculate Sentiment
def calculate_features(data, filename):
    print("Step 3: Calculating sentiment and article length...")
    data['sentiment'] = data['cleaned_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    data['article_length'] = data['cleaned_text'].apply(lambda x: len(x.split()))
    save_updated_csv(data, filename, 'step3', 'feature_calculation')
    return data

In [23]:
# Predictive Modeling
def predictive_modeling(data, filename):
    print("Step 4: Predictive Modeling...")
    features = ['org_count', 'gpe_count', 'person_count', 'sentiment', 'article_length']
    X = data[features]
    y = data['engagement']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Random Forest Regressor
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    
    # Predictions and Evaluation
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    print(f"Mean Absolute Error: {mae}")

    # For classification-like metrics, binarize predictions and actuals for evaluation
    y_test_binarized = np.where(y_test > np.median(y_test), 1, 0)
    predictions_binarized = np.where(predictions > np.median(predictions), 1, 0)

    accuracy = accuracy_score(y_test_binarized, predictions_binarized)
    f1 = f1_score(y_test_binarized, predictions_binarized)
    print(f"Accuracy: {accuracy}")
    print(f"F1-Score: {f1}")

    # Save model results
    data['predicted_engagement'] = model.predict(X)
    save_updated_csv(data, filename, 'step4', 'modeling')

    return model, mae, accuracy, f1

In [24]:
# Visualization
def create_visualizations(data, filename):
    print("Step 5: Creating visualizations...")
    os.makedirs('visualizations', exist_ok=True)

    # Entity Frequency Bar Chart
    entity_counts = data[['org_count', 'gpe_count', 'person_count']].sum()
    entity_counts.plot(kind='bar', color=['blue', 'green', 'red'])
    plt.title('Entity Frequency')
    plt.xlabel('Entity Type')
    plt.ylabel('Frequency')
    plt.savefig(f'visualizations/{os.path.basename(filename)}_entity_frequency.png')
    plt.close()

    # Scatter Plot: Sentiment vs Engagement
    sns.scatterplot(x='sentiment', y='engagement', data=data)
    plt.title('Sentiment vs Engagement')
    plt.xlabel('Sentiment Polarity')
    plt.ylabel('Engagement')
    plt.savefig(f'visualizations/{os.path.basename(filename)}_sentiment_engagement.png')
    plt.close()

    # Heatmap: Feature Correlation
    numeric_data = data.select_dtypes(include=['number'])  # Select only numeric columns
    correlation_matrix = numeric_data.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    plt.title('Feature Correlation')
    plt.savefig(f'visualizations/{os.path.basename(filename)}_feature_correlation.png')
    plt.close()

In [25]:
# Main pipeline
def main():
    for file in csv_files:
        print(f"Processing file: {file}")
        data = pd.read_csv(file)

        # Step 1: Preprocessing
        data = preprocess_text(data, file)

        # Step 2: Named Entity Recognition and Feature Extraction
        data = extract_features(data, file)

        # Step 3: Calculate Sentiment and Additional Features
        data = calculate_features(data, file)

        # Step 4: Placeholder Engagement Metric
        print("Step 4: Creating placeholder engagement metric...")
        data['engagement'] = data['org_count'] + data['gpe_count'] + data['person_count'] + data['sentiment'] * 10

        # Step 5: Predictive Modeling
        model, mae, accuracy, f1 = predictive_modeling(data, file)
        print(f"Results for {file}: MAE={mae}, Accuracy={accuracy}, F1-Score={f1}")

        # Step 6: Visualization
        create_visualizations(data, file)

    print("Processing complete.")

In [26]:
if __name__ == "__main__":
    main()

Processing file: dataset/gossipcop_fake.csv
Step 1: Preprocessing text...
Updated CSV after step step1 (preprocessing) saved as output\gossipcop_fake_step1_preprocessing_updated.csv
Step 2: Extracting Named Entities...
Updated CSV after step step2 (entity_extraction) saved as output\gossipcop_fake_step2_entity_extraction_updated.csv
Step 3: Calculating sentiment and article length...
Updated CSV after step step3 (feature_calculation) saved as output\gossipcop_fake_step3_feature_calculation_updated.csv
Step 4: Creating placeholder engagement metric...
Step 4: Predictive Modeling...
Mean Absolute Error: 0.035913856559455226
Accuracy: 0.9990610328638497
F1-Score: 0.9989417989417989
Updated CSV after step step4 (modeling) saved as output\gossipcop_fake_step4_modeling_updated.csv
Results for dataset/gossipcop_fake.csv: MAE=0.035913856559455226, Accuracy=0.9990610328638497, F1-Score=0.9989417989417989
Step 5: Creating visualizations...
Processing file: dataset/gossipcop_real.csv
Step 1: Prep