In [5]:
import pandas as pd
import numpy as np
import ast
import json
import swifter
import nltk
import matplotlib.pyplot as plt
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
from scipy.sparse import lil_matrix

# Required resources; download once
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

tqdm.pandas()

In [2]:
df = pd.read_csv('data/995,000_row_cleaned.csv')

  df = pd.read_csv('data/995,000_row_cleaned.csv')


In [3]:
# Process in chunks to avoid memory issues
def create_vocab_optimized(df, chunk_size=10000):
    vocab = Counter()
    
    # Process DataFrame in chunks
    for i in tqdm(range(0, len(df), chunk_size)):
        chunk = df.iloc[i:i+chunk_size]
        
        # Update vocabulary with words from this chunk
        for content in chunk['content']:
            try:
                word_list = ast.literal_eval(content)
                vocab.update(word_list)
            except (ValueError, SyntaxError):
                # Skip malformed entries
                continue
    
    return vocab

# Replace the original vocabulary creation
vocab = create_vocab_optimized(df)
print(f"Vocabulary size: {len(vocab)} unique words")

# For saving top words
shortend_vocab = dict(vocab.most_common(10000))
with open('10000vocab.json', 'w') as f:
    json.dump(shortend_vocab, f)

100%|██████████| 100/100 [04:00<00:00,  2.40s/it]

Vocabulary size: 1639414 unique words





In [6]:
def create_features_optimized(df, vocab_words, chunk_size=1000):
    # Get the list of words we care about (fixed size vocabulary)
    word_to_index = {word: idx for idx, word in enumerate(vocab_words)}
    
    # Create a sparse matrix for features
    features = lil_matrix((len(df), len(vocab_words)), dtype=np.int32)
    labels = np.zeros(len(df), dtype=np.int8)
    
    # Label mapping
    label_map = {
        "fake": 1, "satire": 1, "conspiracy": 1, "unreliable": 1, "bias": 1, 
        "rumor": 1, "junksci": 1, "hate": 1, "reliable": 0, "clickbait": 0, "political": 0
    }
    
    # Process in chunks
    for i in tqdm(range(0, len(df), chunk_size)):
        chunk = df.iloc[i:i+chunk_size]
        
        for j, (_, row) in enumerate(chunk.iterrows()):
            absolute_idx = i + j
            
            # Set label
            labels[absolute_idx] = label_map.get(row['type'], 0)
            
            # Process content
            try:
                word_list = ast.literal_eval(row['content'])
                # Count words that are in our vocabulary
                for word in word_list:
                    if word in word_to_index:
                        features[absolute_idx, word_to_index[word]] += 1
            except (ValueError, SyntaxError):
                continue
    
    return features, labels

# Get top N words from vocabulary
top_words = list(shortend_vocab.keys())
features, labels = create_features_optimized(df, top_words)

print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")

100%|██████████| 995/995 [09:12<00:00,  1.80it/s]

Features shape: (995000, 10000)
Labels shape: (995000,)





### Taking the processed fake dataset and look into the parts of speech within this subset, creating a new dataset with the analyzed result

In [7]:
# Filter the DataFrame
print("Filtering the DataFrame to include only rows where the 'type' column is 'fake' or 'reliable'...")
filtered_df_fake = df[df['type'].isin(['fake'])]
filtered_df_reliable = df[df['type'].isin(['reliable'])]

Filtering the DataFrame to include only rows where the 'type' column is 'fake' or 'reliable'...
Filtering complete! The filtered DataFrame contains 104883 rows.


In [11]:
def analyze_pos_distribution(df, column_name='content', chunk_size=1000):
    # Initialize counters
    pos_counts = Counter()
    processed_rows = 0
    
    # POS Mapping Dictionary
    pos_mapping = {
        "CC": "Conjunction (coordinating)", "CD": "Numeral (cardinal)", "DT": "Determiner",
        "EX": "Existential 'there'", "IN": "Preposition or subordinating conjunction",
        "JJ": "Adjective", "JJR": "Adjective (comparative)", "JJS": "Adjective (superlative)",
        "LS": "List item marker", "MD": "Modal auxiliary", "NN": "Noun (singular/mass)",
        "NNS": "Noun (plural)", "NNP": "Proper Noun (singular)", "NNPS": "Proper Noun (plural)",
        "PDT": "Pre-determiner", "POS": "Genitive marker ('s)", "PRP": "Pronoun (personal)",
        "PRP$": "Pronoun (possessive)", "RB": "Adverb", "RBR": "Adverb (comparative)",
        "RBS": "Adverb (superlative)", "RP": "Particle", "TO": "To (preposition/infinitive marker)",
        "UH": "Interjection", "VB": "Verb (base form)", "VBD": "Verb (past tense)",
        "VBG": "Verb (present participle/gerund)", "VBN": "Verb (past participle)",
        "VBP": "Verb (present, non-3rd person singular)", "VBZ": "Verb (present, 3rd person singular)",
        "WDT": "WH-determiner", "WP": "WH-pronoun", "WRB": "WH-adverb"
    }
    
    # Process in chunks
    for i in tqdm(range(0, len(df), chunk_size)):
        chunk = df.iloc[i:i+chunk_size]
        
        for text in chunk[column_name].fillna('').astype(str):
            try:
                # Skip empty text
                if not text.strip():
                    continue
                
                # Tokenize and get POS tags
                tokens = word_tokenize(text)
                pos_tags = pos_tag(tokens)
                
                # Update POS counts
                for _, tag in pos_tags:
                    readable_tag = pos_mapping.get(tag, "Other")
                    pos_counts[readable_tag] += 1
                
                processed_rows += 1
            except Exception as e:
                print(f"Error processing text: {e}")
                continue
    
    print(f"Processed {processed_rows} rows")
    
    # Create DataFrame for visualization
    df_counts = pd.DataFrame(pos_counts.items(), columns=['POS', 'Count'])
    df_counts = df_counts.sort_values('Count', ascending=False)
    
    # Plot results
    plt.figure(figsize=(12, 6))
    plt.bar(df_counts['POS'], df_counts['Count'])
    plt.xlabel("Part of Speech")
    plt.ylabel("Frequency")
    plt.title("POS Distribution")
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()
    
    return df_counts

# Example usage:
# pos_counts_fake = analyze_pos_distribution(filtered_df_fake, column_name='content')
# pos_counts_reliable = analyze_pos_distribution(filtered_df_reliable, column_name='content')

### Analyzing the relationship between the total amount of words and the total amount of unique words.
#### Finding the ratio of unique words per word.

In [12]:
# Total amount of word in content
total_words = df["content"].swifter.progress_bar(True).apply(lambda x: len(str(x).split())).sum()
print("Total words in dataset:", total_words)

# Function to extract unique words from a text
def extract_unique_words(text):
    return set(str(text).split())

# Apply function efficiently using swifter
unique_word_sets = df["content"].swifter.progress_bar(True).apply(extract_unique_words)

# Combine all sets and count unique words
total_unique_words = len(set().union(*tqdm(unique_word_sets)))

print("Total unique words in dataset:", total_unique_words)

# Calculate the percentage of unique words per word
print("Unique words per word:", (total_unique_words / total_words) * 100)

Pandas Apply: 100%|██████████| 995000/995000 [00:06<00:00, 159909.41it/s]


Total words in dataset: 263245932


Pandas Apply: 100%|██████████| 995000/995000 [00:46<00:00, 21196.93it/s]
100%|██████████| 995000/995000 [00:00<00:00, 7307943.15it/s]


Total unique words in dataset: 1726502
Unique words per word: 0.6558513504398616
