In [None]:
import pandas as pd
import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.utils import compute_class_weight
import csv
import re
import nltk
from tensorflow.python.keras.layers import GlobalMaxPool1D
from tensorflow.python.keras.saving.save import load_model
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import pickle
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from transformers import AutoTokenizer
from transformers import DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [None]:
def shuffle_data(data_frame, random_state=None):
    """Return a copy of ``data_frame`` with its rows in random order.

    Args:
        data_frame: The pandas DataFrame to shuffle. It is not modified.
        random_state: Optional seed forwarded to ``DataFrame.sample``. Leave
            as ``None`` for a different order on every call; pass an int to
            make the shuffle reproducible.

    Returns:
        A new DataFrame holding every row of ``data_frame`` exactly once,
        re-indexed from 0 so the old index does not reveal the original order.
    """
    # frac=1 samples 100% of the rows without replacement, i.e. a permutation.
    shuffled = data_frame.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


In [None]:
def clear_dataset(input_file='news_cleaned_2018_02_13.csv',
                  output_file='news_cleaned_valid_records.csv',
                  chunk_size=1_000_000):
    """Copy the usable rows of the raw news CSV into a new CSV.

    The input is read chunk by chunk. Only the 'type', 'content' and 'title'
    columns are loaded, and lines pandas cannot parse are skipped. Rows with a
    missing 'content' or 'type' are dropped; a missing 'title' is tolerated.
    Rows are shuffled within each chunk only, not across the whole file.

    A chunk that fails while being cleaned or written is reported on stdout
    and skipped, so one bad chunk does not abort the run.

    Args:
        input_file: Path of the raw CSV to read.
        output_file: Path of the CSV to create. An existing file is overwritten.
        chunk_size: Number of rows read per chunk.
    """
    # Raise the csv module's per-field size limit; engine='python' below is
    # expected to parse through the csv module, and article bodies are long.
    csv.field_size_limit(131072 * 10)

    # The header row is emitted with the first successfully written chunk only.
    write_header = True

    # 'w' truncates any previous output, so re-running never duplicates rows.
    with open(output_file, 'w', newline='', encoding='utf-8') as f_out:
        for chunk in pd.read_csv(
            input_file,
            usecols=['type', 'content', 'title'],
            on_bad_lines='skip',
            quoting=csv.QUOTE_MINIMAL,
            engine='python',
            chunksize=chunk_size
        ):
            try:
                print(f"Processing chunk with {len(chunk)} rows")
                # Keep only rows that have both an article body and a label.
                chunk_cleaned = chunk.dropna(subset=["content", "type"])
                chunk_cleaned = shuffle_data(chunk_cleaned)
                chunk_cleaned.to_csv(f_out, index=False, header=write_header, quoting=csv.QUOTE_MINIMAL)

                write_header = False

                print(f"Processed {len(chunk_cleaned)} valid rows")
            except Exception as e:
                # Deliberate best-effort: report the chunk and carry on.
                print(f"Error processing chunk: {e}")

    print(f"Cleaned dataset saved to '{output_file}'")

In [None]:
def sanitize_filename(filename):
    """Return ``filename`` with every unsafe character swapped for an underscore.

    The characters replaced are angle brackets, colon, double quote, forward
    slash, backslash, pipe, question mark, asterisk and any whitespace.
    """
    unsafe_chars = re.compile(r'[<>:"/\\|?*\s]')
    return unsafe_chars.sub('_', filename)

def split_type(input_file, output_dir="./split"):
    """Split a news CSV into one CSV file per article type.

    The 'type' column is normalised (cast to str, stripped, lower-cased) and
    any value outside the accepted set is relabelled 'unknown'. Each type's
    rows are written to ``<output_dir>/<type>.csv``. The file is created fresh
    the first time a type is seen in this call and appended to afterwards.

    After the split, the number of rows written per accepted type is printed.
    Types that never occurred in the input report 0 instead of trying to read
    a file that was never created.

    Args:
        input_file: Path of the CSV to split. It must contain a 'type' column.
        output_dir: Directory for the per-type files. Created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    accepted_types = {'fake', 'reliable', 'political', 'bias',
                     'conspiracy', 'rumor', 'unreliable', 'clickbait',
                     'junksci', 'satire', 'hate', 'unknown'}
    # Rows written so far per sanitized type. Key membership doubles as the
    # "file already created during this call" marker.
    rows_written = {}

    chunk_size = 100000
    for chunk in pd.read_csv(input_file, chunksize=chunk_size):
        # Normalise the labels, then fold everything unexpected into 'unknown'.
        normalized = chunk['type'].astype(str).str.strip().str.lower()
        chunk['type'] = normalized.where(normalized.isin(accepted_types), 'unknown')

        for type_val, group in chunk.groupby('type'):
            sanitized_type = sanitize_filename(type_val)
            file_path = os.path.join(output_dir, f"{sanitized_type}.csv")

            # First write truncates leftovers from earlier runs and emits the
            # header; later writes append data rows only.
            first_write = sanitized_type not in rows_written
            group.to_csv(
                file_path,
                mode='w' if first_write else 'a',
                index=False,
                header=first_write,
            )
            rows_written[sanitized_type] = rows_written.get(sanitized_type, 0) + len(group)

            if first_write:
                print(f"Created: {file_path} (rows: {len(group)})")
            else:
                print(f"Appended: {file_path} (rows: {len(group)})")

    # Report from the running counts rather than re-reading every output file.
    for type_val in sorted(accepted_types):
        total_rows = rows_written.get(sanitize_filename(type_val), 0)
        print(f"Total rows in {type_val}: {total_rows}")

In [None]:
def split_combined():
    """Merge the per-type CSVs under ./split into coarser label files.

    Each listed source file is read in chunks. Its 'type' column is
    overwritten with the merged category from ``label_mapping``, the chunk is
    shuffled, and the rows are written to ``./combined_split/<category>.csv``.
    Several source files can feed the same category file.

    The output directory is created if missing. Each category file is started
    fresh on its first write in this call, so re-running does not append to
    the previous run's output or add a second header row.

    A source file that cannot be processed is reported on stdout and skipped.
    """
    file_paths = [
        './split/fake.csv',
        './split/reliable.csv',
        './split/political.csv',
        './split/bias.csv',
        './split/conspiracy.csv',
        './split/rumor.csv',
        './split/unreliable.csv',
        './split/junksci.csv',
        './split/clickbait.csv'
    ]

    # Mapping from original labels to merged categories
    label_mapping = {
        'fake': 'misinformation',
        'reliable': 'credible',
        'political': 'political_bias',
        'bias': 'political_bias',
        'conspiracy': 'misinformation',
        'rumor': 'misinformation',
        'unreliable': 'unreliable',
        'junksci': 'misinformation',
        'clickbait': 'unreliable'
    }

    output_dir = './combined_split'
    # to_csv does not create parent directories; without this every write fails.
    os.makedirs(output_dir, exist_ok=True)

    csv.field_size_limit(131072 * 50)
    # Categories whose output file has already been started during this call.
    started_labels = set()

    chunk_size = 100_000
    for file_path in file_paths:
        try:
            # The original label is the file name without its extension.
            original_label = os.path.splitext(os.path.basename(file_path))[0]
            new_label = label_mapping.get(original_label)
            if not new_label:
                continue

            output_path = f"{output_dir}/{new_label}.csv"

            for chunk in pd.read_csv(
                file_path,
                usecols=['type', 'content', 'title'],
                quoting=csv.QUOTE_MINIMAL,
                engine='python',
                chunksize=chunk_size
            ):
                # Update the type column with the merged category
                chunk['type'] = new_label
                chunk = shuffle_data(chunk)

                first_write = new_label not in started_labels
                chunk.to_csv(
                    output_path,
                    index=False,
                    header=first_write,
                    mode='w' if first_write else 'a'
                )
                # Mark as started only after a successful write, so a failed
                # first write cannot leave the file without a header.
                started_labels.add(new_label)

                print(f"Processed '{file_path}' ({len(chunk)} rows) → '{new_label}'")

        except Exception as e:
            # Deliberate best-effort: report the file and move on to the next.
            print(f"Error processing '{file_path}': {e}")
            continue

In [None]:
def save_dataset(data_frame, name):
    """Write ``data_frame`` to ``name`` as CSV, leaving out the index column."""
    data_frame.to_csv(path_or_buf=name, index=False)

In [None]:
def create_merged_dataset():
    """Concatenate the combined label files into ``news_merged.csv``.

    At most the first 300,000 rows of each file under ./combined_split are
    read, restricted to the 'type', 'content' and 'title' columns. Each file's
    rows are shuffled and then appended to the output, with the header written
    once. Shuffling is per source file, so the output still holds one
    contiguous block of rows per label.

    A source file that cannot be processed is reported on stdout and skipped.
    """
    output_file = 'news_merged.csv'
    file_paths = [
        './combined_split/credible.csv',
        './combined_split/misinformation.csv',
        './combined_split/political_bias.csv',
        './combined_split/unreliable.csv'
    ]

    csv.field_size_limit(131072 * 50)

    # The header row is emitted with the first successfully written file only.
    write_header = True

    # 'w' truncates any previous output, so re-running never duplicates rows.
    with open(output_file, 'w', newline='', encoding='utf-8') as f_out:
        for file_name in file_paths:
            try:
                chunk = pd.read_csv(
                    file_name,
                    quoting=csv.QUOTE_MINIMAL,
                    engine='python',
                    nrows=300_000,
                    usecols=['type', 'content', 'title']
                )

                # shuffle_data returns a new frame; the result must be kept or
                # the rows are written in their original order.
                chunk = shuffle_data(chunk)
                chunk.to_csv(f_out, index=False, header=write_header)

                write_header = False

                print(f"Processed '{file_name}' with {len(chunk)} rows")
            except Exception as e:
                # Deliberate best-effort: report the file and carry on.
                print(f"Error processing '{file_name}': {e}")

In [None]:
def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a DataFrame using pandas' default settings."""
    loaded_frame = pd.read_csv(file_path)
    return loaded_frame

In [None]:
# Stage 1: drop rows lacking 'content' or 'type' and write news_cleaned_valid_records.csv.
clear_dataset()

In [None]:
# Stage 2: split the cleaned records into one CSV per article type under ./split.
split_type('news_cleaned_valid_records.csv')

In [None]:
# Stage 3: relabel the per-type files into merged categories under ./combined_split.
split_combined()

In [None]:
# Stage 4: concatenate up to 300,000 rows from each merged category file into news_merged.csv.
create_merged_dataset()