In [None]:
import pandas as pd
import numpy as np
import ast
import json
import swifter
import nltk
import matplotlib.pyplot as plt
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

# Required resources; download once
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

tqdm.pandas()

In [None]:
# Load the cleaned dataset. Later cells read its 'content' column (parsed with
# ast.literal_eval) and its 'type' column.
df = pd.read_csv('data/995,000_row_cleaned.csv')

In [None]:
# Global word -> occurrence-count table, filled in by create_vocab.
# Counter is a dict subclass, so code that treats `vocab` as a plain dict keeps working.
vocab = Counter()


def create_vocab(lst):
    """Add the words of one row to the global ``vocab`` counts.

    Args:
        lst: String containing a Python list literal of words
            (for example "['a', 'b', 'a']"); parsed with ``ast.literal_eval``.

    Returns:
        None. The global ``vocab`` is updated in place.

    Raises:
        ValueError or SyntaxError: propagated from ``ast.literal_eval`` when
            ``lst`` is not a valid Python literal.
    """
    # Convert the string representation of a list into an actual list
    words = ast.literal_eval(lst)
    # Counter.update adds one per occurrence and creates missing keys on the fly
    vocab.update(words)

# Build the vocabulary from the 'content' column.
# create_vocab works purely by side effect on the global `vocab`, so every row must be
# visited exactly once. swifter's apply may evaluate the function on a sample of rows
# before the real pass (to choose an execution strategy), which would add those rows to
# the counts more than once; a plain tqdm loop visits each row a single time.
for content_row in tqdm(df['content'], desc="Building vocabulary"):
    create_vocab(content_row)

# Show a summary instead of dumping the entire dictionary into the cell output.
print(f"Vocabulary size: {len(vocab)}")
print("Most frequent words:", sorted(vocab.items(), key=lambda item: item[1], reverse=True)[:10])

In [None]:
# Rank every (word, count) pair by count, highest first, keep the first 10000 pairs
# and write them to disk as JSON.
ranked_pairs = sorted(vocab.items(), key=lambda pair: pair[1], reverse=True)
shortend_vocab = dict(ranked_pairs[:10000])
with open('10000vocab.json', 'w') as f:
    json.dump(shortend_vocab, f)

In [None]:
def count_freq(lst, vocabulary=None):
    """Return per-word occurrence counts for one row, ordered like the vocabulary.

    Args:
        lst: String containing a Python list literal of words; parsed with
            ``ast.literal_eval``.
        vocabulary: Optional iterable of words that fixes which words are counted
            and the order of the returned counts. When omitted, the keys of the
            global ``vocab`` are used, in their iteration order.

    Returns:
        list[int]: one count per vocabulary word. Words of the row that are not
        in the vocabulary are ignored.

    Raises:
        ValueError or SyntaxError: propagated from ``ast.literal_eval`` when
            ``lst`` is not a valid Python literal.
    """
    if vocabulary is None:
        # NOTE(review): the default is the full `vocab`, not the 10,000-word
        # `shortend_vocab` built earlier in the notebook — confirm which was intended.
        vocabulary = vocab
    # Count the row once, then look each vocabulary word up; a Counter returns 0 for
    # words the row does not contain, so no zero-filled dict over the whole vocabulary
    # has to be allocated per row.
    row_counts = Counter(ast.literal_eval(lst))
    return [row_counts[word] for word in vocabulary]

# Apply the function to the 'content' column.
# NOTE: this overwrites df['content'] in place. Afterwards every value in the column is
# the list of counts returned by count_freq, no longer the original list-literal string,
# so the column must not be passed through count_freq / ast.literal_eval a second time.
df['content'] = df['content'].swifter.progress_bar(True).apply(count_freq)

# Save the frame with the counted 'content' column
output_path = 'data/995,000_row_counted.csv'
df.to_csv(output_path, index=False)
print(f"Fully cleaned; data saved to {output_path}")

In [None]:

# Maps each article 'type' to a binary label.
label_map = {"fake": 1, "satire":1, "conspiracy": 1, "unreliable": 1, "bias": 1, "rumor": 1, "junksci": 1, "hate": 1,
             "reliable": 0,  "clickbait": 0,   "political": 0}

# Iterate over the column directly instead of df.iterrows(), which builds a Series per row.
# .get() leaves None for any 'type' value that is not a key of label_map.
label_values = [label_map.get(article_type) for article_type in df['type']]
unmapped_count = sum(label is None for label in label_values)
if unmapped_count:
    print(f"Warning: {unmapped_count} rows have a 'type' that is not in label_map (label is None)")
labels = np.array(label_values)

# df['content'] already holds the count vectors produced by count_freq in the previous
# cell. Calling count_freq on them again would pass a list to ast.literal_eval, which
# raises ValueError, so the stored vectors are used as-is.
features = np.array(df['content'].tolist())
dataset = np.column_stack((labels, features))

# Print shapes
print("Features shape:", features.shape)  # (num_samples, num_features)
print("Labels shape:", labels.shape)  # (num_samples,)

# Example output: index -1 is the last row, so label it accordingly
print("Last sample:", dataset[-1])

In [None]:
# Step 1: Read the CSV file
print("Starting to read the CSV file...")
df = pd.read_csv('data/995,000_rows.csv')
print("Successfully read the CSV file!")

# Step 2: Filter the DataFrame (only 'fake' rows are kept in this cell)
print("Filtering the DataFrame to include only rows where the 'type' column is 'fake'...")
filtered_df = df[df['type'].isin(['fake'])]
print("Filtering complete! The filtered DataFrame contains", len(filtered_df), "rows.")

# Step 3: Save the filtered DataFrame to a new CSV file.
# The path is kept in one variable so the message always names the file actually written.
fake_output_path = 'fake_data.csv'
filtered_df.to_csv(fake_output_path, index=False)
print(f"Filtered data saved to '{fake_output_path}'!")

# Final Step: Notify progress completion
print("All steps completed successfully.")

### Taking the processed fake dataset and looking into the parts of speech within this subset, creating a new dataset with the analysis results

In [None]:
df = pd.read_csv('fake_data.csv')

column_index = 5
# Guard first: the frame needs a column at position `column_index`.
if df.shape[1] <= column_index:
    raise IndexError("something's wrong!")

# Select that column by position, drop missing values and make every entry a str.
text_column = df.iloc[:, column_index].dropna().astype(str)
print(f"Targeted column name: {df.columns[column_index]}")


# Lookup table: POS tag (as produced by nltk.pos_tag) -> human-readable word class.
# Tags missing from this table are reported as "Other" by process_text.
pos_mapping = {
    "CC": "Conjunction (coordinating)",
    "CD": "Numeral (cardinal)",
    "DT": "Determiner",
    "EX": "Existential 'there'",
    "IN": "Preposition or subordinating conjunction",
    "JJ": "Adjective",
    "JJR": "Adjective (comparative)",
    "JJS": "Adjective (superlative)",
    "LS": "List item marker",
    "MD": "Modal auxiliary",
    "NN": "Noun (singular/mass)",
    "NNS": "Noun (plural)",
    "NNP": "Proper Noun (singular)",
    "NNPS": "Proper Noun (plural)",
    "PDT": "Pre-determiner",
    "POS": "Genitive marker ('s)",
    "PRP": "Pronoun (personal)",
    "PRP$": "Pronoun (possessive)",
    "RB": "Adverb",
    "RBR": "Adverb (comparative)",
    "RBS": "Adverb (superlative)",
    "RP": "Particle",
    "TO": "To (preposition/infinitive marker)",
    "UH": "Interjection",
    "VB": "Verb (base form)",
    "VBD": "Verb (past tense)",
    "VBG": "Verb (present participle/gerund)",
    "VBN": "Verb (past participle)",
    "VBP": "Verb (present, non-3rd person singular)",
    "VBZ": "Verb (present, 3rd person singular)",
    "WDT": "WH-determiner",
    "WP": "WH-pronoun",
    "WRB": "WH-adverb",
}


def process_text(text):
    """Count how often each word class occurs in one text.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    each tag is translated through ``pos_mapping``, and tags missing from that
    table are grouped under "Other".

    Args:
        text: The text to analyse (a str).

    Returns:
        dict: readable word-class name -> number of tokens with that class.
        Keys appear in the order in which the classes are first seen in the text.
    """
    tagged_tokens = pos_tag(word_tokenize(text))

    # Counter tallies every class in a single pass, instead of one full list.count()
    # scan per distinct class, and keeps a deterministic first-seen key order
    # (iterating a set of strings can vary from run to run).
    class_counts = Counter(pos_mapping.get(tag, "Other") for _, tag in tagged_tokens)

    return dict(class_counts)

# Tag every text, then build the count table from the per-row dicts in one step.
# DataFrame(list_of_dicts) produces the same rows/columns as `.apply(pd.Series)` without
# constructing a separate Series object for every row; classes absent from a row become
# NaN and are filled with 0.
pos_records = text_column.apply(process_text)
df_pos = (
    pd.DataFrame(pos_records.tolist(), index=pos_records.index)
    .fillna(0)
    .astype(int)
)

# Add original text for reference (aligned on the shared index)
df_pos.insert(0, "Original_Text", text_column)

# Save to CSV
df_pos.to_csv("pos_analysis_fake.csv", index=False)

# Optional: Aggregate and plot total POS counts
pos_totals = df_pos.drop(columns=["Original_Text"]).sum().sort_values(ascending=False)

plt.figure(figsize=(10, 5))
plt.bar(pos_totals.index, pos_totals.values)
plt.xlabel("Part of Speech")
plt.ylabel("Total Frequency")
# Name the subset so this figure can be told apart from the other POS plot in the notebook
plt.title("Total POS Frequency Distribution (fake)")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Step 1: Read the CSV file
print("Starting to read the CSV file...")
df = pd.read_csv('data/995,000_rows.csv')
print("Successfully read the CSV file!")

# Step 2: Filter the DataFrame (only 'reliable' rows are kept in this cell)
print("Filtering the DataFrame to include only rows where the 'type' column is 'reliable'...")
filtered_df = df[df['type'].isin(['reliable'])]
print("Filtering complete! The filtered DataFrame contains", len(filtered_df), "rows.")

# Step 3: Save the filtered DataFrame to a new CSV file.
# The path is kept in one variable so the message always names the file actually written.
reliable_output_path = 'reliable_data.csv'
filtered_df.to_csv(reliable_output_path, index=False)
print(f"Filtered data saved to '{reliable_output_path}'!")

# Final Step: Notify progress completion
print("All steps completed successfully.")

### Taking the processed reliable dataset and looking into the parts of speech within this subset, creating a new dataset with the analysis results

In [None]:
df = pd.read_csv('reliable_data.csv')

column_index = 5
if df.shape[1] > column_index:
    text_column = df.iloc[:, column_index].dropna().astype(str)  # Convert to string
    print(f"Targeted column name: {df.columns[column_index]}")  # Print column name
else:
    raise IndexError("something's wrong!")

# `pos_mapping` and `process_text` are defined once, in the fake-dataset POS cell above,
# and are reused here instead of being re-declared with identical bodies. Run that cell
# (at least its definitions) before this one.

# Tag every text, then build the count table from the per-row dicts in one step.
# DataFrame(list_of_dicts) produces the same rows/columns as `.apply(pd.Series)` without
# constructing a separate Series object for every row; classes absent from a row become
# NaN and are filled with 0.
pos_records = text_column.apply(process_text)
df_pos = (
    pd.DataFrame(pos_records.tolist(), index=pos_records.index)
    .fillna(0)
    .astype(int)
)

# Add original text for reference (aligned on the shared index)
df_pos.insert(0, "Original_Text", text_column)

# Save to CSV
df_pos.to_csv("pos_analysis_reliable.csv", index=False)

# Optional: Aggregate and plot total POS counts
pos_totals = df_pos.drop(columns=["Original_Text"]).sum().sort_values(ascending=False)

plt.figure(figsize=(10, 5))
plt.bar(pos_totals.index, pos_totals.values)
plt.xlabel("Part of Speech")
plt.ylabel("Total Frequency")
# Name the subset so this figure can be told apart from the other POS plot in the notebook
plt.title("Total POS Frequency Distribution (reliable)")
plt.xticks(rotation=90)
plt.show()

### Analyzing the relationship between the total number of words and the total number of unique words.
#### Finding the ratio of unique words to total words.

In [None]:
# Total number of whitespace-separated words in the 'content' column.
# NOTE(review): `df` is whatever frame the preceding cell left behind (in notebook order,
# the one read from 'reliable_data.csv') — confirm that is the dataset meant here.
words_per_row = df["content"].swifter.progress_bar(True).apply(lambda entry: len(str(entry).split()))
total_words = words_per_row.sum()
print("Total words in dataset:", total_words)

def extract_unique_words(text):
    """Return the distinct whitespace-separated tokens of ``text`` as a set.

    ``text`` is passed through ``str()`` first, so a non-string value is
    tokenized by its string form.
    """
    tokens = str(text).split()
    return {token for token in tokens}

# Apply function efficiently using swifter
unique_word_sets = df["content"].swifter.progress_bar(True).apply(extract_unique_words)

# Merge the per-row sets one at a time. This avoids unpacking every set as a positional
# argument of a single union() call, and the progress bar now tracks the merge itself.
all_unique_words = set()
for row_words in tqdm(unique_word_sets):
    all_unique_words.update(row_words)
total_unique_words = len(all_unique_words)

print("Total unique words in dataset:", total_unique_words)

# Unique words as a percentage of all words (the value is multiplied by 100, so the
# label says so). Skip the division when the dataset contains no words at all.
if total_words:
    print("Unique words per word (%):", (total_unique_words / total_words) * 100)
else:
    print("Unique words per word (%): undefined (the dataset contains no words)")