In [45]:
import spacy
import re
import json
import textstat
import pandas as pd
from textblob import TextBlob

In [46]:
# Load the spaCy "en_core_web_sm" pipeline once at module level; the spaCy-based
# extract_article_metadata below calls `nlp(text)` for every article.
nlp = spacy.load("en_core_web_sm")


In [47]:
# Lower-case words/phrases treated as clickbait markers. They are matched as
# plain substrings of the lower-cased article text.
# Note that the entries mix a typographic apostrophe ("won’t") with an ASCII
# one ("can't").
clickbait_keywords = [
    "shocking",
    "you won’t believe",
    "unbelievable",
    "exclusive",
    "what happened next",
    "will blow your mind",
    "exposed",
    "top secret",
    "can't miss",
    "epic",
    "amazing",
    "guaranteed",
    "crazy",
]

In [48]:
def extract_article_metadata(text):
    """Compute length, style, readability, sentiment and POS features for one article.

    NOTE(review): a later cell defines another function with this same name;
    when the notebook is run top to bottom that later definition replaces
    this one.

    Parameters
    ----------
    text : str
        Raw article text. ``None`` and NaN (how pandas represents an empty
        cell) are treated as an empty article instead of raising.

    Returns
    -------
    dict
        ``num_characters``, ``num_words`` (alphabetic tokens only),
        ``num_sentences``, ``avg_sentence_length`` (words per sentence,
        rounded to 2 places, 0 when there are no sentences),
        ``num_capitalized_words`` (all-caps tokens longer than one character),
        ``num_exclamations``, ``num_questions``, ``has_clickbait_words``,
        ``readability_score`` (``textstat.flesch_reading_ease``),
        ``sentiment_polarity``, ``sentiment_subjectivity`` (TextBlob) and
        ``pos_ratios`` (share of alphabetic tokens per spaCy ``pos_`` tag,
        rounded to 3 places).
    """
    # A missing CSV cell arrives as float NaN (NaN != NaN); without this guard
    # a single empty row would abort a whole batch run.
    if text is None or (isinstance(text, float) and text != text):
        text = ""

    doc = nlp(text)

    # Text length metrics. The alphabetic tokens are collected once and reused
    # for both the word count and the POS ratios.
    num_chars = len(text)
    alpha_tokens = [token for token in doc if token.is_alpha]
    num_words = len(alpha_tokens)
    num_sentences = len(list(doc.sents))
    avg_sentence_length = round(num_words / num_sentences, 2) if num_sentences else 0

    # All-caps tokens; single characters such as "I" or "A" are excluded.
    num_caps = sum(
        1 for token in doc if token.text.isupper() and len(token.text) > 1
    )

    # Special punctuation
    num_exclamations = text.count('!')
    num_questions = text.count('?')

    # Clickbait detection. Apostrophes are folded to a space on both sides so
    # that "won’t", "won't" and "won t" all compare equal; the dataset preview
    # in this notebook shows apostrophes already replaced by spaces
    # ("couldn t"), which the raw keyword list could never match.
    apostrophes_to_space = str.maketrans({"\u2019": " ", "\u2018": " ", "'": " "})
    folded_text = text.lower().translate(apostrophes_to_space)
    has_clickbait = any(
        phrase.lower().translate(apostrophes_to_space) in folded_text
        for phrase in clickbait_keywords
    )

    # Readability score
    readability_score = textstat.flesch_reading_ease(text)

    # Sentiment
    sentiment = TextBlob(text).sentiment

    # POS ratios over alphabetic tokens. With no such tokens the dict stays
    # empty, so the division below is never reached with a zero denominator.
    pos_counts = {}
    for token in alpha_tokens:
        pos_counts[token.pos_] = pos_counts.get(token.pos_, 0) + 1
    pos_ratios = {tag: round(count / num_words, 3) for tag, count in pos_counts.items()}

    return {
        "num_characters": num_chars,
        "num_words": num_words,
        "num_sentences": num_sentences,
        "avg_sentence_length": avg_sentence_length,
        "num_capitalized_words": num_caps,
        "num_exclamations": num_exclamations,
        "num_questions": num_questions,
        "has_clickbait_words": has_clickbait,
        "readability_score": readability_score,
        "sentiment_polarity": sentiment.polarity,
        "sentiment_subjectivity": sentiment.subjectivity,
        "pos_ratios": pos_ratios
    }

In [49]:
# Load the article dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Dataset.csv")
# Preview the first rows. The output below shows three "Unnamed: 0*" columns,
# presumably index columns left over from earlier CSV saves — TODO confirm.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ind,text,subject,date,class
0,0,0,0,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",Fake
1,1,1,1,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",Fake
2,2,2,2,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",Fake
3,3,3,3,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",Fake
4,4,4,4,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",Fake


In [50]:
def length_of_article(text):
    """Return ``(word_count, char_count)`` for ``text``.

    ``word_count`` is the number of tokens returned by ``word_tokenize``
    (not defined in the visible cells; presumably NLTK's — TODO confirm);
    ``char_count`` is ``len(text)``.
    """
    token_count = len(word_tokenize(text))
    char_count = len(text)
    return token_count, char_count


def number_of_sentences(text):
    """Return how many sentences ``sent_tokenize`` finds in ``text``."""
    return len(sent_tokenize(text))


def average_sentence_length(text):
    """Return the mean number of word tokens per sentence, or 0 with no sentences.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``.
    """
    sentence_list = sent_tokenize(text)
    token_list = word_tokenize(text)
    if not sentence_list:
        return 0
    return len(token_list) / len(sentence_list)


def number_of_capitalized_words(text):
    """Count the all-caps tokens in ``text``.

    Tokens come from ``word_tokenize``. A token counts when ``str.isupper()``
    is true and it is longer than one character, so ordinary one-letter words
    such as "I" or "A" are not counted. This matches the ``len(...) > 1`` rule
    used by the spaCy-based extractor earlier in this notebook.
    """
    return sum(
        1
        for token in word_tokenize(text)
        if len(token) > 1 and token.isupper()
    )


def punctuation_marks_count(text):
    """Return ``(exclamation_count, question_count)`` for ``text``."""
    exclamations = text.count('!')
    questions = text.count('?')
    return exclamations, questions


def contains_clickbait_phrases(text):
    """Return True when ``text`` contains any known clickbait phrase.

    Matching is case-insensitive substring search. Apostrophes are folded to
    a space on both the text and the phrases, so "won’t", "won't" and "won t"
    are treated as the same spelling. The dataset preview in this notebook
    shows apostrophes already replaced by spaces ("couldn t"), which the raw
    phrases could never match.
    """
    clickbait_phrases = [
        "shocking", "you won’t believe", "unbelievable",
        "exclusive", "what happened next", "can't miss",
        "this will blow your mind", "never seen before"
    ]
    apostrophes_to_space = str.maketrans({"\u2019": " ", "\u2018": " ", "'": " "})
    folded_text = text.lower().translate(apostrophes_to_space)
    return any(
        phrase.translate(apostrophes_to_space) in folded_text
        for phrase in clickbait_phrases
    )


def flesch_kincaid_score(text):
    """Return ``textstat.flesch_kincaid_grade(text)``, or 0 if scoring fails.

    The fallback is best-effort: any ordinary exception raised while scoring
    yields 0. Only ``Exception`` is caught, so KeyboardInterrupt and
    SystemExit still propagate and a long batch run can be interrupted.
    """
    try:
        return textstat.flesch_kincaid_grade(text)
    except Exception:
        # NOTE: 0 is indistinguishable from a genuine grade of 0; callers
        # cannot tell a failed score from a real one.
        return 0


def sentiment_score_textblob(text):
    """Return the TextBlob sentiment polarity of ``text`` (range -1 to 1)."""
    return TextBlob(text).sentiment.polarity


def sentiment_score_vader(text):
    """Return the VADER ``compound`` sentiment score of ``text`` (range -1 to 1).

    The ``SentimentIntensityAnalyzer`` is created on the first call and kept
    as an attribute of this function, so scoring many articles does not
    rebuild it each time. The class is not imported in the visible cells;
    presumably it is NLTK's or vaderSentiment's — TODO confirm.
    """
    analyzer = getattr(sentiment_score_vader, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_score_vader._analyzer = analyzer
    return analyzer.polarity_scores(text)['compound']  # ranges from -1 to 1


def pos_ratios(text):
    """Return the share of adjective, noun and proper-noun tokens in ``text``.

    Tokens come from ``word_tokenize`` and tags from ``pos_tag``; the tag
    prefixes below assume Penn Treebank tags (what NLTK's default tagger
    emits — TODO confirm, neither name is imported in the visible cells).

    Returns
    -------
    dict
        ``adj_ratio``   – tags starting with ``JJ``.
        ``noun_ratio``  – tags starting with ``NN`` (includes proper nouns).
        ``propn_ratio`` – tags starting with ``NNP``, i.e. both singular
        ``NNP`` and plural ``NNPS`` proper nouns.
        All three are 0 when the text has no tokens.
    """
    tagged = pos_tag(word_tokenize(text))
    total = len(tagged)
    if total == 0:
        return {'adj_ratio': 0, 'noun_ratio': 0, 'propn_ratio': 0}

    adj_count = noun_count = propn_count = 0
    for _, tag in tagged:
        if tag.startswith('JJ'):
            adj_count += 1
        elif tag.startswith('NN'):
            noun_count += 1
            # Proper nouns are a subset of nouns; the prefix also covers the
            # plural tag NNPS, which an exact comparison with 'NNP' misses.
            if tag.startswith('NNP'):
                propn_count += 1

    return {
        'adj_ratio': adj_count / total,
        'noun_ratio': noun_count / total,
        'propn_ratio': propn_count / total
    }


In [40]:
import json

def extract_article_metadata(text):
    """Assemble the per-article feature dict from the helper functions.

    NOTE(review): this reuses the name of the spaCy-based function defined in
    an earlier cell and therefore replaces it once this cell runs.

    Returns a dict with the keys ``word_count``, ``char_count``,
    ``sentence_count``, ``avg_sentence_length``, ``capitalized_word_count``,
    ``exclamation_count``, ``question_count``, ``has_clickbait_phrase``,
    ``flesch_kincaid_score``, ``sentiment_textblob``, ``sentiment_vader`` and
    ``pos_ratios`` (itself a dict), in that order.
    """
    features = {}
    features["word_count"], features["char_count"] = length_of_article(text)
    features["sentence_count"] = number_of_sentences(text)
    features["avg_sentence_length"] = average_sentence_length(text)
    features["capitalized_word_count"] = number_of_capitalized_words(text)
    (
        features["exclamation_count"],
        features["question_count"],
    ) = punctuation_marks_count(text)
    features["has_clickbait_phrase"] = contains_clickbait_phrases(text)
    features["flesch_kincaid_score"] = flesch_kincaid_score(text)
    features["sentiment_textblob"] = sentiment_score_textblob(text)
    features["sentiment_vader"] = sentiment_score_vader(text)
    features["pos_ratios"] = pos_ratios(text)
    return features


In [55]:
import json

# Extract features for every article and append them to article_metadata.json.
# The file is opened once for the whole run instead of once per row.
# NOTE(review): the on-disk format is kept as it was: pretty-printed objects,
# each followed by ",\n\n". That is not a valid JSON document, and because the
# file is opened in append mode, re-running this cell adds a second copy of
# every entry. Delete the file before a re-run.
with open("article_metadata.json", "a") as f:
    for idx, row in df.iterrows():
        article_text = row["text"]
        metadata = extract_article_metadata(article_text)
        metadata["id"] = idx
        metadata["subject"] = row.get("subject", None)
        metadata["class"] = row.get("class", None)
        metadata["date"] = row.get("date", None)

        # Append each entry in a human-readable format
        json.dump(metadata, f, indent=4)
        f.write(",\n\n")  # separate with commas and newlines for readability
