Import all the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS

import re
import string
import contractions
import textstat
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [2]:
df = pd.read_csv("../data/preprocessed/master_fakenews.csv")

Let's now fix any contractions in the text and remove extra whitespace

In [3]:
def normalize_text(text):
    text = contractions.fix(str(text))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [4]:
df["prep_text"] = df["clean_text"].apply(normalize_text)

In [5]:
wnl = WordNetLemmatizer()
stop = set(stopwords.words("english"))

To ensure that non-letter characters are still in the text, as well convert any leftover uppercase to lowercase, remove stopwords, and lemmatize

In [6]:
def clean_text(text):
    text = re.sub("[^a-zA-Z]", " ", str(text))
    text = text.lower().split()
    text = [wnl.lemmatize(word) for word in text if word not in stop]
    return " ".join(text)

In [7]:
df["prep_text"] = df["clean_text"].apply(clean_text)

Let's now count unique words and compute the ratio of unique words to total text length This way we can analyze the lexical diversity of the text, it might be a clue on identifying fake news from real news.

In [8]:
df["lexical_richness"] = df.apply(
    lambda row: row["unique_words"] / row["word_count"] if row["word_count"] > 0 else 0,
    axis=1
)

Let's now calculate the Flesch Reading Ease and estimated grade level of each text
- The Flesch Reading Ease is a readability test that scores text on a scale from 0 to 100, which shows how easy it's to read. Higher scores mean the text is easier to understand, with scores around 60-70 being suitable for most readers.

In [9]:
df["flesch_reading_ease"] = df["prep_text"].apply(textstat.flesch_reading_ease)
df["grade_level"] = df["prep_text"].apply(textstat.text_standard)

Another part of the corpus that might give us some clue is the number of punctuaton and what's the ratio with words.

In [10]:
def count_punctuations(text):
    """Count punctuation characters in a text string."""
    if not isinstance(text, str):
        return 0
    return sum(1 for ch in text if ch in string.punctuation)

In [11]:
def count_words(text):
    """Count words in a text string (avoid division by zero)."""
    if not isinstance(text, str) or not text.strip():
        return 1  # avoid division by zero
    return len(text.split())

In [12]:
df["punct_count"] = df["text"].apply(count_punctuations)
df["word_count"]  = df["text"].apply(count_words)
df["punct_ratio"] = df["punct_count"] / df["word_count"]

In [13]:
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])  # keep NER

Let's use SpaCy to count the number of named entities in each text. NER give us the important entities in the text such as dates, monetary values, products, etc.

In [14]:
texts = df["prep_text"].fillna("").tolist()
entity_counts = []

for doc in nlp.pipe(texts, batch_size=100):
    entity_counts.append(len(doc.ents))




In [15]:
df["entity_count"] = entity_counts

Convert cleaned text into numerical TF-IDF features (1- and 2-grams)

In [16]:
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df["prep_text"])

Merge TF-IDF matrix with numeric features like polarity, subjectivity, etc.

In [17]:
numeric_feats = df[["polarity", "subjectivity", "text_len", "avg_word_len"]].fillna(0).values
X_full = hstack([X_tfidf, numeric_feats])

Identify which numeric features are most related to the label `real`

In [18]:
corrs = df.corr(numeric_only=True)["real"].sort_values(ascending=False)
print(corrs)

real                   1.000000
lexical_richness       0.165905
entity_count           0.163239
avg_word_len           0.162847
flesch_reading_ease   -0.008832
polarity              -0.025019
unique_words          -0.035003
punct_ratio           -0.047456
text_len              -0.054482
word_count            -0.056424
punct_count           -0.085365
subjectivity          -0.317270
num_sents                   NaN
Name: real, dtype: float64


Looking at the extreme of the our list, lexical_richness has a very small correlation with our label, whereas subjectivity has a somewhat strong negative correlation with our label

In [19]:
df.to_csv("../data/preprocessed/fakenews_preprocessed.csv", index=False)