<a href="https://colab.research.google.com/github/ASantra-star/Scrapping_Sentiment_Analysis/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install beautifulsoup4 requests nltk pandas openpyxl

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Export link of the spreadsheet that lists the URLs to analyse.
input_url = "https://docs.google.com/spreadsheets/d/1D7QkDHxUSKnQhR--q0BAwKMxQlUyoJTQ/export?format=xlsx"
input_path = "/content/Input.xlsx"

# Bound the wait and fail loudly on an HTTP error: without the status check an
# error page would be written to disk and only fail later inside read_excel.
r = requests.get(input_url, timeout=30)
r.raise_for_status()
with open(input_path, "wb") as f:
    f.write(r.content)

df_input = pd.read_excel(input_path)
df_input

In [None]:
def load_words_from_files(file_list):
    """Build one lower-cased word set from several word-list files.

    Args:
        file_list: Iterable of paths to text files with one entry per line.

    Returns:
        set[str]: Every non-blank line, stripped of surrounding whitespace
        and lower-cased. Empty when ``file_list`` is empty.
    """
    words = set()
    for file in file_list:
        # latin-1 maps every byte to a character, so decoding cannot fail.
        with open(file, "r", encoding="latin-1") as f:
            for line in f:
                word = line.strip().lower()
                if word:  # blank lines would otherwise add "" to the set
                    words.add(word)
    return words

In [None]:
!pip install gdown

In [None]:
import gdown

# Folder 1
# Downloaded into ./StopWords. A later cell reads these files through the
# absolute path /content/StopWords, so this presumably runs with /content as
# the working directory (Colab) -- TODO confirm when running elsewhere.
folder1_id = "1rd7YdoX8tED9mujc0c-6evJU4y7LFc_R"
gdown.download_folder(id=folder1_id, output="StopWords", quiet=False)

# Folder 2
# Downloaded into ./MasterDictionary; read later via /content/MasterDictionary.
folder2_id = "1YRcVlJO3ZaC78iTC6JcunfZl7Fz4AL8v"
gdown.download_folder(id=folder2_id, output="MasterDictionary", quiet=False)


In [None]:
# Stop-word lists that are merged into a single lookup set.
stopword_files = [
    '/content/StopWords/StopWords_Auditor.txt',
    '/content/StopWords/StopWords_Currencies.txt',
    '/content/StopWords/StopWords_DatesandNumbers.txt',
    '/content/StopWords/StopWords_Generic.txt',
    '/content/StopWords/StopWords_GenericLong.txt',
    '/content/StopWords/StopWords_Geographic.txt',
    '/content/StopWords/StopWords_Names.txt'
]

# Entries are stripped and lower-cased by load_words_from_files, matching the
# lower-cased tokens they are compared against in clean_text.
stop_words = load_words_from_files(stopword_files)

In [None]:
# Sentiment dictionaries: a token counts towards the positive/negative score
# in analyze_text when it is a member of the corresponding set.
positive_words = load_words_from_files(['/content/MasterDictionary/positive-words.txt'])
negative_words = load_words_from_files(['/content/MasterDictionary/negative-words.txt'])

In [None]:
def extract_article_text(url):
    """Download a page and return its text content.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: Text of the first ``<article>`` element (fragments joined with
        a space) when the page has one, otherwise the text of the whole
        document. An empty string when the request fails, the server answers
        with an HTTP error status, or the page cannot be parsed.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of analysing error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        article = soup.find('article')
        if article:
            return article.get_text(separator=' ')
        return soup.get_text()
    except Exception:
        # Best effort: one bad link must not abort the batch. Catching
        # Exception (not a bare except) lets Ctrl-C interrupt a long run.
        return ""

In [None]:
def clean_text(text):
    """Tokenise ``text`` and keep only the meaningful words.

    The text is lower-cased and split with ``word_tokenize``; tokens that are
    not purely alphabetic or that appear in ``stop_words`` are dropped.

    Returns:
        list[str]: Remaining tokens in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [None]:
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Every maximal run of the vowels ``aeiou`` counts as one syllable ("y" is
    not treated as a vowel). One is subtracted for words ending in "es" or
    "ed", and a non-empty word always scores at least 1.

    Args:
        word: The word to inspect; case is ignored.

    Returns:
        int: Estimated syllable count, or 0 for an empty string.
    """
    vowels = "aeiou"
    word = word.lower()

    # An empty string has no first character to inspect and no syllables.
    if not word:
        return 0

    # A vowel starts a new group when it is the first letter or follows a
    # non-vowel.
    count = sum(
        1
        for i, letter in enumerate(word)
        if letter in vowels and (i == 0 or word[i - 1] not in vowels)
    )

    if word.endswith(("es", "ed")):
        count -= 1

    return max(count, 1)

In [None]:
def analyze_text(text):
    """Compute dictionary-based sentiment and readability metrics for ``text``.

    Sentiment scores count the cleaned tokens found in ``positive_words`` and
    ``negative_words``. Word-level statistics use the tokens returned by
    ``clean_text``; sentence statistics use ``sent_tokenize`` on the raw text.

    Args:
        text: Raw article text; may be empty.

    Returns:
        list: 13 values in this order -- positive score, negative score,
        polarity score, subjectivity score, average sentence length,
        complex-word ratio, fog index, average sentence length (repeated for
        the "words per sentence" column), complex word count, word count,
        syllables per word, personal pronoun count, average word length.
    """
    sentences = sent_tokenize(text)
    tokens = clean_text(text)

    word_count = len(tokens)
    sentence_count = len(sentences)

    positive_score = sum(1 for w in tokens if w in positive_words)
    negative_score = sum(1 for w in tokens if w in negative_words)

    # The small constant keeps both ratios defined when a denominator is zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # A word is "complex" when it has more than two estimated syllables.
    syllable_counts = [count_syllables(w) for w in tokens]
    complex_word_count = sum(1 for n in syllable_counts if n > 2)

    avg_sentence_length = word_count / sentence_count if sentence_count else 0
    percentage_complex_words = complex_word_count / word_count if word_count else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    syllables_per_word = sum(syllable_counts) / word_count if word_count else 0
    avg_word_length = sum(len(w) for w in tokens) / word_count if word_count else 0

    # Whole-word, case-insensitive match; the all-caps token "US" is skipped
    # because it is treated as the country abbreviation, not the pronoun.
    pronouns = sum(
        1
        for hit in re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
        if hit != "US"
    )

    return [
        positive_score, negative_score, polarity_score, subjectivity_score,
        avg_sentence_length, percentage_complex_words, fog_index,
        avg_sentence_length, complex_word_count, word_count,
        syllables_per_word, pronouns, avg_word_length
    ]

In [None]:
import nltk
nltk.download('punkt_tab')

In [None]:
# One output row per input URL: [URL_ID, URL, *metrics from analyze_text].
output_data = []

for _, record in df_input.iterrows():
    url_id, url = record['URL_ID'], record['URL']
    output_data.append([url_id, url] + analyze_text(extract_article_text(url)))

In [None]:
# Column order must match the row layout built in the scraping loop:
# the two identifier fields followed by the 13 values from analyze_text.
columns = [
    "URL_ID","URL","POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE",
    "SUBJECTIVITY SCORE","AVG SENTENCE LENGTH","PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT",
    "WORD COUNT","SYLLABLE PER WORD","PERSONAL PRONOUNS","AVG WORD LENGTH"
]

df_output = pd.DataFrame(output_data, columns=columns)

# Persist the metrics table as a workbook, without the DataFrame index.
output_path = "/content/Output.xlsx"
df_output.to_excel(output_path, index=False)

df_output


In [None]:
!pip install pandas feedparser openpyxl

In [None]:
import feedparser
import pandas as pd

# ---------------------------
# Configuration
# ---------------------------
from urllib.parse import quote_plus

QUERY = "financial markets"
# Upper bound only: the slice below cannot yield more entries than the feed
# actually returns.
MAX_URLS = 400

# Google News RSS feed
# quote_plus escapes every reserved character (&, #, +, ...), not only
# spaces, so any query text produces a valid URL.
rss_url = f"https://news.google.com/rss/search?q={quote_plus(QUERY)}"

# ---------------------------
# Fetch URLs from web
# ---------------------------
feed = feedparser.parse(rss_url)

urls = [entry.link for entry in feed.entries[:MAX_URLS]]

# ---------------------------
# Auto-generate URL_IDs
# ---------------------------
# IDs are 1-based and zero-padded to at least three digits (AUTO_001, ...).
data = {
    "URL_ID": [f"AUTO_{str(i+1).zfill(3)}" for i in range(len(urls))],
    "URL": urls
}

df_input = pd.DataFrame(data)

# ---------------------------
# Save Input.xlsx
# ---------------------------
input_path = "Input.xlsx"
df_input.to_excel(input_path, index=False)

print("Input.xlsx created automatically from web URLs")
df_input


In [None]:
!pip install pandas feedparser requests beautifulsoup4 nltk openpyxl

In [None]:
import pandas as pd
import feedparser
import requests
import nltk
import re
import string

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('stopwords')

from nltk.corpus import stopwords

In [None]:
from urllib.parse import quote_plus

QUERY = "artificial intelligence"
MAX_URLS = 200  # upper bound; the feed may return fewer entries

# quote_plus escapes every reserved character, not only spaces, so any query
# text produces a valid URL.
rss_url = f"https://news.google.com/rss/search?q={quote_plus(QUERY)}"
feed = feedparser.parse(rss_url)

urls = [entry.link for entry in feed.entries[:MAX_URLS]]

# IDs are 1-based and zero-padded to at least three digits (AUTO_001, ...).
df_input = pd.DataFrame({
    "URL_ID": [f"AUTO_{str(i+1).zfill(3)}" for i in range(len(urls))],
    "URL": urls
})

df_input.to_excel("Input.xlsx", index=False)
print("Input.xlsx created")


In [None]:
# Writes df_input to Input.xlsx and displays it. The cell above already
# saved the same frame to the same file, so this write repeats it.
input_path = "Input.xlsx"
df_input.to_excel(input_path, index=False)

print("Input.xlsx created automatically from web URLs")
df_input

In [None]:
def extract_article_text(url):
    """Download a page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: Paragraph texts joined with a single space. An empty string
        when the request fails, the server answers with an HTTP error
        status, or the page cannot be parsed.
    """
    try:
        r = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of analysing error pages.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        paragraphs = soup.find_all("p")
        return " ".join(p.get_text() for p in paragraphs)
    except Exception:
        # Best effort: one bad link must not abort the batch. Catching
        # Exception (not a bare except) lets Ctrl-C interrupt a long run.
        return ""


In [None]:
stop_words = set(stopwords.words("english"))

def clean_tokens(text):
    """Lower-case and tokenise ``text``, dropping noise tokens.

    A token is kept only when it is purely alphabetic and is not listed in
    ``stop_words``.

    Returns:
        list[str]: Kept tokens in their original order.
    """
    kept = []
    for candidate in word_tokenize(text.lower()):
        if candidate.isalpha() and candidate not in stop_words:
            kept.append(candidate)
    return kept


In [None]:
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Every maximal run of the vowels ``aeiou`` counts as one syllable ("y" is
    not treated as a vowel). One is subtracted for words ending in "es" or
    "ed", and a non-empty word always scores at least 1.

    Args:
        word: The word to inspect; case is ignored.

    Returns:
        int: Estimated syllable count, or 0 for an empty string.
    """
    vowels = "aeiou"
    # Lower-case first so capitalised input ("Apple") is scored like "apple".
    word = word.lower()

    # An empty string has no first character to inspect and no syllables.
    if not word:
        return 0

    # A vowel starts a new group when it is the first letter or follows a
    # non-vowel.
    count = sum(
        1
        for i, letter in enumerate(word)
        if letter in vowels and (i == 0 or word[i - 1] not in vowels)
    )

    if word.endswith(("es", "ed")):
        count -= 1

    return max(count, 1)


def count_personal_pronouns(text):
    """Count the personal pronouns "I", "we", "my", "ours" and "us" in ``text``.

    Matching is case-insensitive on whole words, except that the all-caps
    token "US" is skipped: it is treated as the country abbreviation rather
    than the pronoun.

    Returns:
        int: Number of pronoun occurrences.
    """
    hits = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    return sum(1 for hit in hits if hit != "US")


In [None]:
sia = SentimentIntensityAnalyzer()

def analyze_text(text):
    """Compute VADER sentiment and readability metrics for ``text``.

    Sentiment values come from ``sia.polarity_scores`` on the raw text;
    word-level statistics use the tokens returned by ``clean_tokens``.

    Returns:
        list: 13 values in this order -- positive score, negative score,
        polarity (compound) score, subjectivity score (positive + negative),
        average sentence length, complex-word ratio, fog index, average
        sentence length (repeated), complex word count, word count,
        syllables per word, personal pronoun count, average word length.
    """
    n_sentences = len(sent_tokenize(text))
    tokens = clean_tokens(text)
    n_words = len(tokens)

    vader = sia.polarity_scores(text)
    pos_share, neg_share = vader['pos'], vader['neg']

    # Words with more than two estimated syllables count as complex.
    syllable_counts = [count_syllables(w) for w in tokens]
    n_complex = sum(1 for n in syllable_counts if n > 2)

    words_per_sentence = n_words / n_sentences if n_sentences else 0

    if n_words:
        complex_ratio = n_complex / n_words
        mean_syllables = sum(syllable_counts) / n_words
        mean_word_len = sum(len(w) for w in tokens) / n_words
    else:
        complex_ratio = mean_syllables = mean_word_len = 0

    return [
        pos_share,
        neg_share,
        vader['compound'],
        pos_share + neg_share,
        words_per_sentence,
        complex_ratio,
        0.4 * (words_per_sentence + complex_ratio),
        words_per_sentence,
        n_complex,
        n_words,
        mean_syllables,
        count_personal_pronouns(text),
        mean_word_len,
    ]


In [None]:
import nltk
nltk.download('punkt_tab')

In [None]:
# Column order mirrors each row: two identifier fields followed by the 13
# values returned by analyze_text.
columns = [
    "URL_ID", "URL",
    "POSITIVE SCORE", "NEGATIVE SCORE",
    "POLARITY SCORE", "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX", "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT", "WORD COUNT",
    "SYLLABLE PER WORD", "PERSONAL PRONOUNS",
    "AVG WORD LENGTH"
]

# Scrape and score every URL listed in df_input.
output_rows = []
for _, record in df_input.iterrows():
    url_id, url = record["URL_ID"], record["URL"]
    output_rows.append([url_id, url] + analyze_text(extract_article_text(url)))

df_output = pd.DataFrame(output_rows, columns=columns)
df_output.to_excel("Output.xlsx", index=False)

print("Output.xlsx generated successfully")


In [None]:
from urllib.parse import urlparse
from nltk import pos_tag

In [None]:
def analyze_text_extended(text, url):
    """Compute the extended sentiment, readability and style metrics.

    Args:
        text: Raw article text; may be empty.
        url: Source address; only its network location is reported.

    Returns:
        list: 24 values in this order -- domain, positive, negative,
        neutral, polarity, intensity (absolute polarity), emotionality
        (positive + negative), subjectivity, word count, unique word count,
        type-token ratio, sentence count, average sentence length, paragraph
        count, complex word count, complex-word ratio, fog index, Flesch
        reading ease, reading time, pronoun count, question-mark count,
        exclamation-mark count, average word length, length category.
    """
    sentence_count = len(sent_tokenize(text))
    # Paragraphs are the non-blank pieces obtained by splitting on newlines.
    paragraph_count = sum(1 for piece in text.split("\n") if len(piece.strip()) > 0)
    tokens = clean_tokens(text)
    word_count = len(tokens)

    # ---- Sentiment ----
    scores = sia.polarity_scores(text)
    pos, neg, neu = scores['pos'], scores['neg'], scores['neu']
    polarity = scores['compound']
    emotionality = pos + neg

    # ---- Complexity ----
    per_word_syllables = [count_syllables(w) for w in tokens]
    syllables = sum(per_word_syllables)
    complex_count = sum(1 for n in per_word_syllables if n > 2)
    unique_words = len(set(tokens))

    avg_sentence_len = word_count / sentence_count if sentence_count else 0

    # Every per-word ratio falls back to 0 when no tokens survive cleaning.
    if word_count:
        pct_complex = complex_count / word_count
        flesch = 206.835 - (1.015 * avg_sentence_len) - (84.6 * (syllables / word_count))
        ttr = unique_words / word_count
        avg_word_len = sum(len(w) for w in tokens) / word_count
    else:
        pct_complex = flesch = ttr = avg_word_len = 0

    # ---- Metadata ----
    if word_count < 500:
        length_category = "Short"
    elif word_count <= 1200:
        length_category = "Medium"
    else:
        length_category = "Long"

    return [
        urlparse(url).netloc, pos, neg, neu, polarity, abs(polarity), emotionality,
        emotionality / (word_count + 1e-6), word_count, unique_words, ttr,
        sentence_count, avg_sentence_len, paragraph_count,
        complex_count, pct_complex, 0.4 * (avg_sentence_len + pct_complex),
        flesch, word_count / 225, count_personal_pronouns(text),
        text.count("?"), text.count("!"), avg_word_len, length_category,
    ]


In [None]:
# Header for the extended output: the two identifier fields followed by the
# 24 values returned by analyze_text_extended, in the same order.
extended_columns = [
    "URL_ID", "URL", "DOMAIN",
    "POSITIVE SCORE", "NEGATIVE SCORE", "NEUTRAL SCORE",
    "POLARITY SCORE", "SENTIMENT INTENSITY", "EMOTIONALITY SCORE",
    "SUBJECTIVITY SCORE",
    "WORD COUNT", "UNIQUE WORD COUNT", "TYPE TOKEN RATIO",
    "SENTENCE COUNT", "AVG SENTENCE LENGTH",
    "PARAGRAPH COUNT",
    "COMPLEX WORD COUNT", "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "FLESCH READING EASE", "READING TIME (MIN)",
    "PERSONAL PRONOUNS",
    "QUESTION COUNT", "EXCLAMATION COUNT",
    "AVG WORD LENGTH",
    "ARTICLE LENGTH CATEGORY"
]

In [None]:
# Scrape every URL and collect [URL_ID, URL, *extended metrics] rows.
rows = []

for _, record in df_input.iterrows():
    url_id, url = record["URL_ID"], record["URL"]
    rows.append([url_id, url] + analyze_text_extended(extract_article_text(url), url))

df_extended = pd.DataFrame(rows, columns=extended_columns)
df_extended.to_excel("Output_Extended.xlsx", index=False)


In [None]:
# Save the URL list under its own name. Writing df_input to
# "Output_Extended.xlsx" here would replace the extended metrics that the
# previous cell stored in that file with the bare URL table.
input_path = "Input.xlsx"
df_input.to_excel(input_path, index=False)

print("Input.xlsx created automatically from web URLs")
df_input

In [None]:
# ============================================================
# 1. INSTALL DEPENDENCIES (COLAB ONLY)
# ============================================================
# !pip install pandas feedparser requests beautifulsoup4 nltk openpyxl

# ============================================================
# 2. IMPORTS & NLTK SETUP
# ============================================================
import pandas as pd
import feedparser
import requests
import nltk
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

stop_words = set(stopwords.words("english"))
sia = SentimentIntensityAnalyzer()

# ============================================================
# 3. AUTO-GENERATE INPUT.XLSX FROM WEB
# ============================================================
from urllib.parse import quote_plus

QUERY = "artificial intelligence"
MAX_URLS = 200  # upper bound; the feed may return fewer entries

# quote_plus escapes every reserved character, not only spaces, so any query
# text produces a valid URL.
rss_url = f"https://news.google.com/rss/search?q={quote_plus(QUERY)}"
feed = feedparser.parse(rss_url)

urls = [entry.link for entry in feed.entries[:MAX_URLS]]

# IDs are 1-based and zero-padded to at least three digits (AUTO_001, ...).
df_input = pd.DataFrame({
    "URL_ID": [f"AUTO_{str(i+1).zfill(3)}" for i in range(len(urls))],
    "URL": urls
})

df_input.to_excel("Input.xlsx", index=False)

# ============================================================
# 4. SCRAPE ARTICLE TEXT
# ============================================================
def extract_article_text(url):
    """Download a page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: Paragraph texts joined with a single space. An empty string
        when the request fails, the server answers with an HTTP error
        status, or the page cannot be parsed.
    """
    try:
        r = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of analysing error pages.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        paragraphs = soup.find_all("p")
        return " ".join(p.get_text() for p in paragraphs)
    except Exception:
        # Best effort: one bad link must not abort the batch. Catching
        # Exception (not a bare except) lets Ctrl-C interrupt a long run.
        return ""

# ============================================================
# 5. TEXT CLEANING
# ============================================================
def clean_tokens(text):
    """Return the alphabetic, non-stop-word tokens of the lower-cased text."""
    def is_content_word(token):
        # Alphabetic only, and not listed in the stop-word set.
        return token.isalpha() and token not in stop_words

    return list(filter(is_content_word, word_tokenize(text.lower())))

# ============================================================
# 6. HELPER FUNCTIONS
# ============================================================
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Every maximal run of the vowels ``aeiou`` counts as one syllable ("y" is
    not treated as a vowel). One is subtracted for words ending in "es" or
    "ed", and a non-empty word always scores at least 1.

    Args:
        word: The word to inspect; case is ignored.

    Returns:
        int: Estimated syllable count, or 0 for an empty string.
    """
    vowels = "aeiou"
    # Lower-case first so capitalised input ("Apple") is scored like "apple".
    word = word.lower()
    # An empty string has no first character to inspect and no syllables.
    if not word:
        return 0
    # A vowel starts a new group when it is the first letter or follows a
    # non-vowel.
    count = sum(
        1
        for i, letter in enumerate(word)
        if letter in vowels and (i == 0 or word[i - 1] not in vowels)
    )
    if word.endswith(("es", "ed")):
        count -= 1
    return max(count, 1)

def count_pronouns(text):
    """Count the personal pronouns "I", "we", "my", "ours" and "us" in ``text``.

    Matching is case-insensitive on whole words, except that the all-caps
    token "US" is skipped: it is treated as the country abbreviation rather
    than the pronoun.

    Returns:
        int: Number of pronoun occurrences.
    """
    hits = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    return sum(1 for hit in hits if hit != "US")

# ============================================================
# 7. ORIGINAL METRICS
# ============================================================
def analyze_original(text):
    """Compute the original 13-metric set for ``text``.

    Sentiment values come from ``sia.polarity_scores`` on the raw text;
    word-level statistics use the tokens returned by ``clean_tokens``.

    Returns:
        list: positive score, negative score, polarity (compound) score,
        subjectivity (positive + negative), average sentence length,
        complex-word ratio, fog index, average sentence length (repeated),
        complex word count, word count, syllables per word, pronoun count,
        average word length.
    """
    sc = len(sent_tokenize(text))
    tokens = clean_tokens(text)
    wc = len(tokens)

    scores = sia.polarity_scores(text)
    pos, neg = scores['pos'], scores['neg']

    # Words with more than two estimated syllables count as complex.
    syllable_counts = [count_syllables(w) for w in tokens]
    complex_count = sum(1 for n in syllable_counts if n > 2)

    avg_sentence_len = wc / sc if sc else 0

    if wc:
        pct_complex = complex_count / wc
        syll_per_word = sum(syllable_counts) / wc
        avg_word_len = sum(len(w) for w in tokens) / wc
    else:
        pct_complex = syll_per_word = avg_word_len = 0

    return [
        pos, neg, scores['compound'], pos + neg,
        avg_sentence_len, pct_complex, 0.4 * (avg_sentence_len + pct_complex),
        avg_sentence_len, complex_count, wc,
        syll_per_word, count_pronouns(text), avg_word_len,
    ]

# ============================================================
# 8. EXTENDED METRICS
# ============================================================
def analyze_extended(text, url):
    """Compute the extended 24-metric set for ``text``.

    Args:
        text: Raw article text; may be empty.
        url: Source address; only its network location is reported.

    Returns:
        list: domain, positive, negative, neutral, polarity, intensity
        (absolute polarity), emotionality (positive + negative),
        subjectivity, word count, unique word count, type-token ratio,
        sentence count, average sentence length, paragraph count, complex
        word count, complex-word ratio, fog index, Flesch reading ease,
        reading time, pronoun count, question-mark count, exclamation-mark
        count, average word length, length category.
    """
    sc = len(sent_tokenize(text))
    # Paragraphs are the non-blank pieces obtained by splitting on newlines.
    pc = sum(1 for piece in text.split("\n") if piece.strip())
    tokens = clean_tokens(text)
    wc = len(tokens)

    scores = sia.polarity_scores(text)
    pos, neg, neu = scores['pos'], scores['neg'], scores['neu']
    polarity = scores['compound']
    emotionality = pos + neg

    unique_words = len(set(tokens))
    per_word_syllables = [count_syllables(w) for w in tokens]
    complex_count = sum(1 for n in per_word_syllables if n > 2)
    syllables = sum(per_word_syllables)

    avg_sentence_len = wc / sc if sc else 0

    # Every per-word ratio falls back to 0 when no tokens survive cleaning.
    if wc:
        ttr = unique_words / wc
        pct_complex = complex_count / wc
        flesch = 206.835 - (1.015 * avg_sentence_len) - (84.6 * (syllables / wc))
        avg_word_len = sum(len(w) for w in tokens) / wc
    else:
        ttr = pct_complex = flesch = avg_word_len = 0

    if wc < 500:
        length_cat = "Short"
    elif wc <= 1200:
        length_cat = "Medium"
    else:
        length_cat = "Long"

    return [
        urlparse(url).netloc, pos, neg, neu, polarity, abs(polarity), emotionality,
        emotionality / (wc + 1e-6), wc, unique_words, ttr,
        sc, avg_sentence_len, pc,
        complex_count, pct_complex, 0.4 * (avg_sentence_len + pct_complex),
        flesch, wc / 225, count_pronouns(text),
        text.count("?"), text.count("!"), avg_word_len, length_cat,
    ]

# ============================================================
# 9. GENERATE OUTPUT FILES
# ============================================================
# Header for Output.xlsx: two identifier fields followed by the 13 values
# returned by analyze_original, in the same order.
original_cols = [
    "URL_ID","URL","POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE",
    "SUBJECTIVITY SCORE","AVG SENTENCE LENGTH","PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT",
    "WORD COUNT","SYLLABLE PER WORD","PERSONAL PRONOUNS","AVG WORD LENGTH"
]

# Header for Output_Extended.xlsx: two identifier fields followed by the 24
# values returned by analyze_extended, in the same order.
extended_cols = [
    "URL_ID","URL","DOMAIN","POSITIVE SCORE","NEGATIVE SCORE","NEUTRAL SCORE",
    "POLARITY SCORE","SENTIMENT INTENSITY","EMOTIONALITY SCORE",
    "SUBJECTIVITY SCORE","WORD COUNT","UNIQUE WORD COUNT","TYPE TOKEN RATIO",
    "SENTENCE COUNT","AVG SENTENCE LENGTH","PARAGRAPH COUNT",
    "COMPLEX WORD COUNT","PERCENTAGE OF COMPLEX WORDS","FOG INDEX",
    "FLESCH READING EASE","READING TIME (MIN)","PERSONAL PRONOUNS",
    "QUESTION COUNT","EXCLAMATION COUNT","AVG WORD LENGTH",
    "ARTICLE LENGTH CATEGORY"
]

# Each page is scraped once and its text is fed to both analysers.
orig_rows = []
ext_rows = []

for _, record in df_input.iterrows():
    url_id, url = record["URL_ID"], record["URL"]
    text = extract_article_text(url)
    orig_rows.append([url_id, url] + analyze_original(text))
    ext_rows.append([url_id, url] + analyze_extended(text, url))

df_output = pd.DataFrame(orig_rows, columns=original_cols)
df_output.to_excel("Output.xlsx", index=False)

df_extended = pd.DataFrame(ext_rows, columns=extended_cols)
df_extended.to_excel("Output_Extended.xlsx", index=False)

# ============================================================
# 10. COMPARISON ANALYSIS (PROOF)
# ============================================================
original_metrics = ["POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE","SUBJECTIVITY SCORE","FOG INDEX"]
extended_metrics = ["NEUTRAL SCORE","SENTIMENT INTENSITY","EMOTIONALITY SCORE","TYPE TOKEN RATIO","FLESCH READING EASE"]

original_view = df_output[original_metrics]
extended_view = df_extended[extended_metrics]

# One summary row: mean per-column variance and mean absolute pairwise
# correlation for each metric family. Variances are taken on the raw values,
# so columns with a large numeric range weigh more in the mean.
comparison = pd.DataFrame(
    [[
        original_view.var().mean(),
        extended_view.var().mean(),
        original_view.corr().abs().mean().mean(),
        extended_view.corr().abs().mean().mean(),
    ]],
    columns=[
        "Original Avg Variance",
        "Extended Avg Variance",
        "Original Avg Correlation",
        "Extended Avg Correlation",
    ],
)

comparison


In [None]:
# ============================================================
# 0. INSTALL DEPENDENCIES (Colab only)
# ============================================================
# !pip install pandas feedparser requests beautifulsoup4 nltk openpyxl

# ============================================================
# 1. IMPORTS
# ============================================================
import pandas as pd
import feedparser
import requests
import nltk
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import time

# ============================================================
# 2. NLTK SETUP
# ============================================================
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

stop_words = set(stopwords.words("english"))
sia = SentimentIntensityAnalyzer()

# ============================================================
# 3. FETCH URLs (at least 100)
# ============================================================
from urllib.parse import quote_plus

QUERY = "artificial intelligence"
MAX_URLS = 200  # Fetch more to ensure at least 100 valid articles

# quote_plus escapes every reserved character, not only spaces, so any query
# text produces a valid URL.
rss = f"https://news.google.com/rss/search?q={quote_plus(QUERY)}"
feed = feedparser.parse(rss)

# Remove duplicates first (dict keys keep first-seen order), then apply the
# cap, so repeated links do not use up part of the MAX_URLS budget.
urls = list(dict.fromkeys(entry.link for entry in feed.entries))[:MAX_URLS]

df_input = pd.DataFrame({
    "URL_ID": [f"AUTO_{i+1:03d}" for i in range(len(urls))],
    "URL": urls
})

df_input.to_excel("Input.xlsx", index=False)
print(f"Generated Input.xlsx with {len(urls)} URLs")

# ============================================================
# 4. HELPER FUNCTIONS
# ============================================================

def resolve_url(url):
    """Follow redirects and get final URL.

    Args:
        url: Address to resolve.

    Returns:
        str: The address reached after following redirects, or ``url``
        unchanged when the request fails.
    """
    try:
        # stream=True defers the body download: only the final address is
        # needed. The context manager releases the connection afterwards.
        with requests.get(url, timeout=10, allow_redirects=True, stream=True) as response:
            return response.url
    except Exception:
        # Best effort; Exception (not a bare except) keeps Ctrl-C working.
        return url

def extract_article_text(url):
    """Extract text from <p> tags.

    Args:
        url: Address of the page to fetch; redirects are followed.

    Returns:
        str: Paragraph texts joined with a single space and stripped of
        surrounding whitespace. An empty string when the request fails, the
        server answers with an HTTP error status, or parsing fails.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        # requests follows redirects on GET by default, so a single request
        # both reaches the final address and downloads the page; resolving
        # the URL first would fetch every article twice.
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures instead of analysing error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all("p"))
        return text.strip()
    except Exception:
        # Best effort: one bad link must not abort the batch. Catching
        # Exception (not a bare except) lets Ctrl-C interrupt a long run.
        return ""

def clean_tokens(text):
    """Return the alphabetic, non-stop-word tokens of the lower-cased text."""
    lowered = text.lower()
    kept = []
    for token in word_tokenize(lowered):
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(token)
    return kept

def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Every maximal run of the vowels ``aeiou`` counts as one syllable ("y" is
    not treated as a vowel). One is subtracted for words ending in "es" or
    "ed", and a non-empty word always scores at least 1.

    Args:
        word: The word to inspect; case is ignored.

    Returns:
        int: Estimated syllable count, or 0 for an empty string.
    """
    vowels = "aeiou"
    # Lower-case first so capitalised input ("Apple") is scored like "apple".
    word = word.lower()
    # An empty string has no first character to inspect and no syllables.
    if not word:
        return 0
    # A vowel starts a new group when it is the first letter or follows a
    # non-vowel.
    count = sum(
        1
        for i, letter in enumerate(word)
        if letter in vowels and (i == 0 or word[i - 1] not in vowels)
    )
    if word.endswith(("es", "ed")):
        count -= 1
    return max(count, 1)

def count_pronouns(text):
    """Count the personal pronouns "I", "we", "my", "ours" and "us" in ``text``.

    Matching is case-insensitive on whole words, except that the all-caps
    token "US" is skipped: it is treated as the country abbreviation rather
    than the pronoun.

    Returns:
        int: Number of pronoun occurrences.
    """
    hits = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    return sum(1 for hit in hits if hit != "US")

def safe_analyze(func, *args):
    """Return zeros if analysis fails.

    Args:
        func: Analysis callable to invoke.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        The value returned by ``func``. When ``func`` raises or returns
        ``None``, a zero-filled list instead: 13 entries when the callable is
        named "analyze_original", 24 entries otherwise.
    """
    try:
        result = func(*args)
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit still stop
        # a long batch instead of being turned into a row of zeros.
        result = None

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if result is not None:
        return result

    # getattr keeps the fallback working for callables without a __name__.
    width = 13 if getattr(func, "__name__", "") == "analyze_original" else 24
    return [0] * width

# ============================================================
# 5. ORIGINAL METRICS
# ============================================================
def analyze_original(text):
    """Compute the original 13-metric set for ``text``.

    Sentiment values come from ``sia.polarity_scores`` on the raw text;
    word-level statistics use the tokens returned by ``clean_tokens``.

    Returns:
        list: Values in the order of the metric columns of ``original_cols``
        (everything after URL_ID and URL).
    """
    sc = len(sent_tokenize(text))
    tokens = clean_tokens(text)
    wc = len(tokens)

    s = sia.polarity_scores(text)
    positive, negative = s['pos'], s['neg']

    # Words with more than two estimated syllables count as complex.
    syllable_counts = [count_syllables(w) for w in tokens]
    complex_count = sum(1 for n in syllable_counts if n > 2)

    avg_sentence_len = wc / sc if sc else 0

    if wc:
        pct_complex = complex_count / wc
        syllables_per_word = sum(syllable_counts) / wc
        avg_word_len = sum(len(w) for w in tokens) / wc
    else:
        pct_complex = syllables_per_word = avg_word_len = 0

    fog_index = 0.4 * (avg_sentence_len + pct_complex)

    metrics = []
    metrics.append(positive)              # POSITIVE SCORE
    metrics.append(negative)              # NEGATIVE SCORE
    metrics.append(s['compound'])         # POLARITY SCORE
    metrics.append(positive + negative)   # SUBJECTIVITY SCORE
    metrics.append(avg_sentence_len)      # AVG SENTENCE LENGTH
    metrics.append(pct_complex)           # PERCENTAGE OF COMPLEX WORDS
    metrics.append(fog_index)             # FOG INDEX
    metrics.append(avg_sentence_len)      # AVG NUMBER OF WORDS PER SENTENCE
    metrics.append(complex_count)         # COMPLEX WORD COUNT
    metrics.append(wc)                    # WORD COUNT
    metrics.append(syllables_per_word)    # SYLLABLE PER WORD
    metrics.append(count_pronouns(text))  # PERSONAL PRONOUNS
    metrics.append(avg_word_len)          # AVG WORD LENGTH
    return metrics

# ============================================================
# 6. EXTENDED METRICS
# ============================================================
def analyze_extended(text, url):
    """Compute the extended 24-metric set for ``text``.

    Args:
        text: Raw article text; may be empty.
        url: Source address; only its network location is reported.

    Returns:
        list: Values in the order of the metric columns of ``extended_cols``
        (everything after URL_ID and URL).
    """
    sc = len(sent_tokenize(text))
    # Paragraphs are the non-blank pieces obtained by splitting on newlines.
    pc = sum(1 for piece in text.split("\n") if piece.strip())
    tokens = clean_tokens(text)
    wc = len(tokens)

    s = sia.polarity_scores(text)
    positive, negative, compound = s['pos'], s['neg'], s['compound']
    emotionality = positive + negative

    unique_words = len(set(tokens))
    per_word_syllables = [count_syllables(w) for w in tokens]
    complex_count = sum(1 for n in per_word_syllables if n > 2)
    syllables = sum(per_word_syllables)

    avg_sentence_len = wc / sc if sc else 0

    # Every per-word ratio falls back to 0 when no tokens survive cleaning.
    if wc:
        pct_complex = complex_count / wc
        flesch = 206.835 - 1.015*avg_sentence_len - 84.6*(syllables/wc)
        type_token_ratio = unique_words / wc
        avg_word_len = sum(len(w) for w in tokens) / wc
    else:
        pct_complex = flesch = type_token_ratio = avg_word_len = 0

    if wc < 500:
        length_category = "Short"
    elif wc <= 1200:
        length_category = "Medium"
    else:
        length_category = "Long"

    return [
        urlparse(url).netloc,                  # DOMAIN
        positive,                              # POSITIVE SCORE
        negative,                              # NEGATIVE SCORE
        s['neu'],                              # NEUTRAL SCORE
        compound,                              # POLARITY SCORE
        abs(compound),                         # SENTIMENT INTENSITY
        emotionality,                          # EMOTIONALITY SCORE
        emotionality / (wc + 1e-6),            # SUBJECTIVITY SCORE
        wc,                                    # WORD COUNT
        unique_words,                          # UNIQUE WORD COUNT
        type_token_ratio,                      # TYPE TOKEN RATIO
        sc,                                    # SENTENCE COUNT
        avg_sentence_len,                      # AVG SENTENCE LENGTH
        pc,                                    # PARAGRAPH COUNT
        complex_count,                         # COMPLEX WORD COUNT
        pct_complex,                           # PERCENTAGE OF COMPLEX WORDS
        0.4 * (avg_sentence_len + pct_complex),  # FOG INDEX
        flesch,                                # FLESCH READING EASE
        wc / 225,                              # READING TIME (MIN)
        count_pronouns(text),                  # PERSONAL PRONOUNS
        text.count("?"),                       # QUESTION COUNT
        text.count("!"),                       # EXCLAMATION COUNT
        avg_word_len,                          # AVG WORD LENGTH
        length_category,                       # ARTICLE LENGTH CATEGORY
    ]

# ============================================================
# 7. COLUMNS
# ============================================================
# Header for the original output: two identifier fields followed by the 13
# values returned by analyze_original, in the same order.
original_cols = [
    "URL_ID","URL","POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE",
    "SUBJECTIVITY SCORE","AVG SENTENCE LENGTH","PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT",
    "WORD COUNT","SYLLABLE PER WORD","PERSONAL PRONOUNS","AVG WORD LENGTH"
]

# Header for the extended output: two identifier fields followed by the 24
# values returned by analyze_extended, in the same order.
extended_cols = [
    "URL_ID","URL","DOMAIN","POSITIVE SCORE","NEGATIVE SCORE","NEUTRAL SCORE",
    "POLARITY SCORE","SENTIMENT INTENSITY","EMOTIONALITY SCORE",
    "SUBJECTIVITY SCORE","WORD COUNT","UNIQUE WORD COUNT","TYPE TOKEN RATIO",
    "SENTENCE COUNT","AVG SENTENCE LENGTH","PARAGRAPH COUNT",
    "COMPLEX WORD COUNT","PERCENTAGE OF COMPLEX WORDS","FOG INDEX",
    "FLESCH READING EASE","READING TIME (MIN)","PERSONAL PRONOUNS",
    "QUESTION COUNT","EXCLAMATION COUNT","AVG WORD LENGTH",
    "ARTICLE LENGTH CATEGORY"
]

# ============================================================
# 8. GENERATE OUTPUTS
# ============================================================
orig_rows, ext_rows = [], []

# Each page is scraped once and its text is fed to both analysers.
# safe_analyze substitutes a zero-filled row when an analyser fails, so every
# input URL yields exactly one row in each table.
for _, r in df_input.iterrows():
    text = extract_article_text(r["URL"])
    orig_metrics = safe_analyze(analyze_original, text)
    ext_metrics = safe_analyze(analyze_extended, text, r["URL"])
    orig_rows.append([r["URL_ID"], r["URL"]] + orig_metrics)
    ext_rows.append([r["URL_ID"], r["URL"]] + ext_metrics)
    time.sleep(0.2)  # polite scraping

df_output = pd.DataFrame(orig_rows, columns=original_cols)
df_extended = pd.DataFrame(ext_rows, columns=extended_cols)

# Save XLSX
df_output.to_excel("Output.xlsx", index=False)
df_extended.to_excel("Output_Extended.xlsx", index=False)

# Save CSV
df_output.to_csv("Output.csv", index=False)
df_extended.to_csv("Output_Extended.csv", index=False)

# ============================================================
# 9. VERIFY OUTPUT
# ============================================================
print("Original columns:", list(df_output.columns))
print("Extended columns:", list(df_extended.columns))
# Report the measured row count: the feed may return fewer than 100 entries,
# so a fixed "at least 100 rows" message could be untrue.
print(f"Extended CSV/XLSX generated successfully with {len(df_extended)} rows")
if len(df_extended) < 100:
    print("WARNING: fewer than 100 rows were generated")
print("Sample data from extended metrics:")
print(df_extended.head(3))

In [None]:
# ============================================================
# INSTALL (COLAB ONLY)
# ============================================================
# !pip install pandas feedparser requests beautifulsoup4 nltk openpyxl

# ============================================================
# IMPORTS
# ============================================================
import pandas as pd
import feedparser
import requests
import nltk
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import time

# ============================================================
# NLTK SETUP
# ============================================================
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

stop_words = set(stopwords.words("english"))
sia = SentimentIntensityAnalyzer()

# ============================================================
# HELPER FUNCTIONS
# ============================================================
def resolve_url(url):
    """Follow redirects.

    Args:
        url: Address to resolve.

    Returns:
        str: The address reached after following redirects, or ``url``
        unchanged when the request fails.
    """
    try:
        # stream=True defers the body download: only the final address is
        # needed. The context manager releases the connection afterwards.
        with requests.get(url, timeout=10, allow_redirects=True, stream=True) as response:
            return response.url
    except Exception:
        # Best effort; Exception (not a bare except) keeps Ctrl-C working.
        return url

def extract_article_text(url):
    """Extract text from <p> tags.

    Args:
        url: Address of the page to fetch; redirects are followed.

    Returns:
        str: Paragraph texts joined with a single space and stripped of
        surrounding whitespace. An empty string when the request fails, the
        server answers with an HTTP error status, or parsing fails.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        # requests follows redirects on GET by default, so a single request
        # both reaches the final address and downloads the page; resolving
        # the URL first would fetch every article twice.
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures instead of analysing error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all("p"))
        return text.strip()
    except Exception:
        # Best effort: one bad link must not abort the batch. Catching
        # Exception (not a bare except) lets Ctrl-C interrupt a long run.
        return ""

def clean_tokens(text):
    """Return the alphabetic, non-stop-word tokens of the lower-cased text."""
    candidates = word_tokenize(text.lower())
    alphabetic = (token for token in candidates if token.isalpha())
    return [token for token in alphabetic if token not in stop_words]

def count_syllables(word):
    """Estimate the number of syllables in ``word`` by counting vowel groups.

    Each run of consecutive vowels (a, e, i, o, u) counts as one syllable; one
    is subtracted for words ending in "es" or "ed". The comparison is
    case-insensitive.

    Args:
        word: A single word.

    Returns:
        0 for an empty string, otherwise an estimate of at least 1.
    """
    word = word.lower()
    if not word:
        # Previously raised IndexError on word[0].
        return 0

    vowels = "aeiou"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    if word.endswith(("es", "ed")):
        count -= 1
    return max(count, 1)

def count_pronouns(text):
    """Count the personal pronouns I, we, my, ours and us in ``text``.

    Matching is case-insensitive and restricted to whole words, except that
    the all-capitals form "US" is skipped so the country abbreviation is not
    counted as the pronoun "us".

    Returns:
        The number of pronoun occurrences.
    """
    matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    return sum(1 for match in matches if match != "US")

def safe_analyze(func, *args):
    """Run a metric function, substituting a row of zeros if it fails.

    Args:
        func: Callable to run; expected to return a list of metrics.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        ``func(*args)`` when it succeeds with a non-None result. Otherwise a
        list of zeros: 13 entries when ``func`` is named "analyze_original",
        24 entries for anything else.
    """
    name = getattr(func, "__name__", "")
    try:
        result = func(*args)
        # Explicit check instead of `assert`, which is removed under -O.
        if result is None:
            raise ValueError(f"{name} returned None")
        return result
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        # Report the failure so zero rows can be told apart from real results.
        print(f"safe_analyze: {name} failed ({exc!r}); using zeros")
        return [0] * (13 if name == "analyze_original" else 24)

# ============================================================
# ORIGINAL METRICS
# ============================================================
def analyze_original(text):
    """Compute the 13 baseline metrics for one article.

    Values are returned in the order of ``original_cols`` (after ``URL_ID``
    and ``URL``): positive, negative and compound VADER scores, pos + neg,
    words per sentence, fraction of complex words, fog index, words per
    sentence again, complex word count, word count, syllables per word,
    personal pronoun count and average word length.

    Word-based figures use the tokens kept by ``clean_tokens``; a word is
    complex when ``count_syllables`` reports more than two syllables. Ratios
    are 0 when their denominator is 0.
    """
    sentence_total = len(sent_tokenize(text))
    words = clean_tokens(text)
    word_total = len(words)
    scores = sia.polarity_scores(text)

    syllables_per_token = [count_syllables(w) for w in words]
    complex_total = sum(1 for n in syllables_per_token if n > 2)

    if sentence_total:
        words_per_sentence = word_total / sentence_total
    else:
        words_per_sentence = 0

    if word_total:
        complex_share = complex_total / word_total
        syllables_per_word = sum(syllables_per_token) / word_total
        chars_per_word = sum(map(len, words)) / word_total
    else:
        complex_share = syllables_per_word = chars_per_word = 0

    fog_index = 0.4 * (words_per_sentence + complex_share)

    return [
        scores['pos'],
        scores['neg'],
        scores['compound'],
        scores['pos'] + scores['neg'],
        words_per_sentence,
        complex_share,
        fog_index,
        words_per_sentence,
        complex_total,
        word_total,
        syllables_per_word,
        count_pronouns(text),
        chars_per_word,
    ]

# ============================================================
# EXTENDED METRICS
# ============================================================
def analyze_extended(text, url):
    """Compute the 24 extended metrics for one article.

    Values are returned in the order of ``extended_cols`` (after ``URL_ID``
    and ``URL``), starting with the network location of ``url`` and ending
    with the length category ("Short" below 500 kept words, "Medium" up to
    1200, otherwise "Long").

    Word-based figures use the tokens kept by ``clean_tokens``; paragraphs are
    the non-blank pieces of ``text`` split on newlines. Ratios over the word
    count are 0 when no words were kept.
    """
    sentence_total = len(sent_tokenize(text))
    paragraph_total = sum(1 for chunk in text.split("\n") if chunk.strip())
    words = clean_tokens(text)
    word_total = len(words)

    scores = sia.polarity_scores(text)
    emotionality = scores['pos'] + scores['neg']
    distinct_total = len(set(words))
    syllables_per_token = [count_syllables(w) for w in words]
    complex_total = sum(1 for n in syllables_per_token if n > 2)
    syllable_total = sum(syllables_per_token)

    if sentence_total:
        words_per_sentence = word_total / sentence_total
    else:
        words_per_sentence = 0

    if word_total:
        complex_share = complex_total / word_total
        reading_ease = 206.835 - 1.015*words_per_sentence - 84.6*(syllable_total/word_total)
        type_token_ratio = distinct_total / word_total
        chars_per_word = sum(map(len, words)) / word_total
    else:
        complex_share = reading_ease = type_token_ratio = chars_per_word = 0

    if word_total < 500:
        length_label = "Short"
    elif word_total <= 1200:
        length_label = "Medium"
    else:
        length_label = "Long"

    return [
        urlparse(url).netloc,
        scores['pos'],
        scores['neg'],
        scores['neu'],
        scores['compound'],
        abs(scores['compound']),
        emotionality,
        emotionality/(word_total+1e-6),
        word_total,
        distinct_total,
        type_token_ratio,
        sentence_total,
        words_per_sentence,
        paragraph_total,
        complex_total,
        complex_share,
        0.4*(words_per_sentence + complex_share),
        reading_ease,
        word_total/225,
        count_pronouns(text),
        text.count("?"),
        text.count("!"),
        chars_per_word,
        length_label,
    ]

# ============================================================
# COLUMNS
# ============================================================
# Header for the baseline output: URL_ID and URL followed by the 13 values
# returned by analyze_original, in the same order.
original_cols = [
    "URL_ID","URL","POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE",
    "SUBJECTIVITY SCORE","AVG SENTENCE LENGTH","PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT",
    "WORD COUNT","SYLLABLE PER WORD","PERSONAL PRONOUNS","AVG WORD LENGTH"
]

# Header for the extended output: URL_ID and URL followed by the 24 values
# returned by analyze_extended, in the same order.
extended_cols = [
    "URL_ID","URL","DOMAIN","POSITIVE SCORE","NEGATIVE SCORE","NEUTRAL SCORE",
    "POLARITY SCORE","SENTIMENT INTENSITY","EMOTIONALITY SCORE",
    "SUBJECTIVITY SCORE","WORD COUNT","UNIQUE WORD COUNT","TYPE TOKEN RATIO",
    "SENTENCE COUNT","AVG SENTENCE LENGTH","PARAGRAPH COUNT",
    "COMPLEX WORD COUNT","PERCENTAGE OF COMPLEX WORDS","FOG INDEX",
    "FLESCH READING EASE","READING TIME (MIN)","PERSONAL PRONOUNS",
    "QUESTION COUNT","EXCLAMATION COUNT","AVG WORD LENGTH",
    "ARTICLE LENGTH CATEGORY"
]

# ============================================================
# 1. FUNCTION TO GET 200 VALID ARTICLES
# ============================================================
def fetch_valid_articles(topic="artificial intelligence", target_count=200,
                         fallback_topics=("technology", "science"), min_words=50):
    """Collect article URLs and texts from Google News RSS searches.

    The feed for ``topic`` is tried first, then each of ``fallback_topics`` in
    order, until ``target_count`` articles have been gathered or the topics
    are exhausted. Each topic is queried once and each URL is downloaded at
    most once, even when it was rejected earlier.

    Args:
        topic: Initial search phrase.
        target_count: Maximum number of articles to return.
        fallback_topics: Search phrases tried after ``topic`` when more
            articles are needed.
        min_words: An article is kept only when its extracted text has more
            than this many whitespace-separated words.

    Returns:
        A ``(urls, texts)`` pair of equal-length lists holding at most
        ``target_count`` items each.
    """
    # Local import keeps this function self-contained; the notebook's import
    # cell only brings in urlparse.
    from urllib.parse import quote_plus

    urls, texts = [], []
    seen = set()  # every URL already attempted, accepted or not
    # De-duplicate topics while preserving order so no feed is fetched twice.
    topics = list(dict.fromkeys([topic, *fallback_topics]))

    for position, current_topic in enumerate(topics):
        # quote_plus encodes spaces as "+" and escapes other reserved characters.
        rss = f"https://news.google.com/rss/search?q={quote_plus(current_topic)}"
        feed = feedparser.parse(rss)
        for entry in feed.entries:
            url = entry.get("link")  # tolerate entries without a link
            if not url or url in seen:
                continue
            seen.add(url)
            text = extract_article_text(url)
            if len(text.split()) > min_words:
                urls.append(url)
                texts.append(text)
            if len(texts) >= target_count:
                return urls, texts
        if position + 1 < len(topics):
            print(f"Not enough valid articles, changing topic to: {topics[position + 1]}")
    return urls, texts

# ============================================================
# 2. FETCH ARTICLES
# ============================================================
urls, texts = fetch_valid_articles(topic="artificial intelligence", target_count=200)
print(f"Fetched {len(urls)} valid articles with text >50 words")

# Build the identifiers once so Input.xlsx and both outputs share them.
url_ids = [f"AUTO_{i+1:03d}" for i in range(len(urls))]
df_input = pd.DataFrame({"URL_ID": url_ids, "URL": urls})
df_input.to_excel("Input.xlsx", index=False)

# ============================================================
# 3. GENERATE METRICS
# ============================================================
# The article texts are already in memory, so this loop makes no network
# requests and needs no delay between iterations.
orig_rows, ext_rows = [], []

for url_id, url, text in zip(url_ids, urls, texts):
    orig_rows.append([url_id, url] + safe_analyze(analyze_original, text))
    ext_rows.append([url_id, url] + safe_analyze(analyze_extended, text, url))

df_output = pd.DataFrame(orig_rows, columns=original_cols)
df_extended = pd.DataFrame(ext_rows, columns=extended_cols)

# ============================================================
# 4. SAVE XLSX + CSV
# ============================================================
df_output.to_excel("Output.xlsx", index=False)
df_extended.to_excel("Output_Extended.xlsx", index=False)
df_output.to_csv("Output.csv", index=False)
df_extended.to_csv("Output_Extended.csv", index=False)

# ============================================================
# 5. VERIFY
# ============================================================
print("Original columns:", list(df_output.columns))
print("Extended columns:", list(df_extended.columns))
print("Sample of extended metrics:")
print(df_extended.head(3))

In [None]:
# ============================================================
# INSTALL DEPENDENCIES (Colab only)
# ============================================================
# !pip install pandas feedparser requests beautifulsoup4 nltk openpyxl

# ============================================================
# IMPORTS
# ============================================================
import pandas as pd
import feedparser
import requests
import nltk
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import time

# ============================================================
# NLTK SETUP
# ============================================================
# Download every NLTK resource the helpers below rely on.
# 'punkt_tab' is the resource newer NLTK releases load for word_tokenize /
# sent_tokenize, while older releases load 'punkt'; fetching both keeps the
# tokenizers working on either. Without it the tokenizers raise LookupError,
# which safe_analyze would turn into all-zero rows.
for resource in ("punkt", "punkt_tab", "stopwords", "vader_lexicon"):
    nltk.download(resource)

# English stop-word set used by clean_tokens, and the VADER analyzer used by
# the analyze_* functions.
stop_words = set(stopwords.words("english"))
sia = SentimentIntensityAnalyzer()

# ============================================================
# HELPER FUNCTIONS
# ============================================================
def extract_article_text(url):
    """Download a page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the article page.

    Returns:
        The non-empty paragraphs, one per line (``"\\n"``-separated), or ``""``
        when the download fails, the server answers with an HTTP error status,
        or parsing fails.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx bodies as failures instead of analysing error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Collapse whitespace inside each paragraph so that every paragraph
        # occupies exactly one line of the result.
        paragraphs = (" ".join(p.get_text().split()) for p in soup.find_all("p"))
        # Newline-separated so analyze_extended's split on "\n" can count
        # paragraphs; word- and whitespace-based metrics are unaffected.
        return "\n".join(p for p in paragraphs if p)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return ""

def clean_tokens(text):
    """Tokenize lower-cased ``text`` and keep alphabetic, non-stop-word tokens.

    Returns:
        A list of tokens in document order; tokens that are not purely
        alphabetic or that appear in ``stop_words`` are dropped.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def count_syllables(word):
    """Estimate the number of syllables in ``word`` by counting vowel groups.

    Each run of consecutive vowels (a, e, i, o, u) counts as one syllable; one
    is subtracted for words ending in "es" or "ed". The comparison is
    case-insensitive.

    Args:
        word: A single word.

    Returns:
        0 for an empty string, otherwise an estimate of at least 1.
    """
    word = word.lower()
    if not word:
        # Previously raised IndexError on word[0].
        return 0

    vowels = "aeiou"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    if word.endswith(("es", "ed")):
        count -= 1
    return max(count, 1)

def count_pronouns(text):
    """Count the personal pronouns I, we, my, ours and us in ``text``.

    Matching is case-insensitive and restricted to whole words, except that
    the all-capitals form "US" is skipped so the country abbreviation is not
    counted as the pronoun "us".

    Returns:
        The number of pronoun occurrences.
    """
    matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    return sum(1 for match in matches if match != "US")

def safe_analyze(func, *args):
    """Run a metric function, substituting a row of zeros if it fails.

    Args:
        func: Callable to run; expected to return a list of metrics.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        ``func(*args)`` when it succeeds with a non-None result. Otherwise a
        list of zeros: 13 entries when ``func`` is named "analyze_original",
        24 entries for anything else.
    """
    name = getattr(func, "__name__", "")
    try:
        result = func(*args)
        # Explicit check instead of `assert`, which is removed under -O.
        if result is None:
            raise ValueError(f"{name} returned None")
        return result
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        # Report the failure so zero rows can be told apart from real results.
        print(f"safe_analyze: {name} failed ({exc!r}); using zeros")
        return [0] * (13 if name == "analyze_original" else 24)

# ============================================================
# ORIGINAL METRICS
# ============================================================
def analyze_original(text):
    """Compute the 13 baseline metrics for one article.

    Values are returned in the order of ``original_cols`` (after ``URL_ID``
    and ``URL``): positive, negative and compound VADER scores, pos + neg,
    words per sentence, fraction of complex words, fog index, words per
    sentence again, complex word count, word count, syllables per word,
    personal pronoun count and average word length.

    Word-based figures use the tokens kept by ``clean_tokens``; a word is
    complex when ``count_syllables`` reports more than two syllables. Ratios
    are 0 when their denominator is 0.
    """
    sentence_total = len(sent_tokenize(text))
    words = clean_tokens(text)
    word_total = len(words)
    scores = sia.polarity_scores(text)

    syllables_per_token = [count_syllables(w) for w in words]
    complex_total = sum(1 for n in syllables_per_token if n > 2)

    if sentence_total:
        words_per_sentence = word_total / sentence_total
    else:
        words_per_sentence = 0

    if word_total:
        complex_share = complex_total / word_total
        syllables_per_word = sum(syllables_per_token) / word_total
        chars_per_word = sum(map(len, words)) / word_total
    else:
        complex_share = syllables_per_word = chars_per_word = 0

    fog_index = 0.4 * (words_per_sentence + complex_share)

    return [
        scores['pos'],
        scores['neg'],
        scores['compound'],
        scores['pos'] + scores['neg'],
        words_per_sentence,
        complex_share,
        fog_index,
        words_per_sentence,
        complex_total,
        word_total,
        syllables_per_word,
        count_pronouns(text),
        chars_per_word,
    ]

# ============================================================
# EXTENDED METRICS
# ============================================================
def analyze_extended(text, url):
    """Compute the 24 extended metrics for one article.

    Values are returned in the order of ``extended_cols`` (after ``URL_ID``
    and ``URL``), starting with the network location of ``url`` and ending
    with the length category ("Short" below 500 kept words, "Medium" up to
    1200, otherwise "Long").

    Word-based figures use the tokens kept by ``clean_tokens``; paragraphs are
    the non-blank pieces of ``text`` split on newlines. Ratios over the word
    count are 0 when no words were kept.
    """
    sentence_total = len(sent_tokenize(text))
    paragraph_total = sum(1 for chunk in text.split("\n") if chunk.strip())
    words = clean_tokens(text)
    word_total = len(words)

    scores = sia.polarity_scores(text)
    emotionality = scores['pos'] + scores['neg']
    distinct_total = len(set(words))
    syllables_per_token = [count_syllables(w) for w in words]
    complex_total = sum(1 for n in syllables_per_token if n > 2)
    syllable_total = sum(syllables_per_token)

    if sentence_total:
        words_per_sentence = word_total / sentence_total
    else:
        words_per_sentence = 0

    if word_total:
        complex_share = complex_total / word_total
        reading_ease = 206.835 - 1.015*words_per_sentence - 84.6*(syllable_total/word_total)
        type_token_ratio = distinct_total / word_total
        chars_per_word = sum(map(len, words)) / word_total
    else:
        complex_share = reading_ease = type_token_ratio = chars_per_word = 0

    if word_total < 500:
        length_label = "Short"
    elif word_total <= 1200:
        length_label = "Medium"
    else:
        length_label = "Long"

    return [
        urlparse(url).netloc,
        scores['pos'],
        scores['neg'],
        scores['neu'],
        scores['compound'],
        abs(scores['compound']),
        emotionality,
        emotionality/(word_total+1e-6),
        word_total,
        distinct_total,
        type_token_ratio,
        sentence_total,
        words_per_sentence,
        paragraph_total,
        complex_total,
        complex_share,
        0.4*(words_per_sentence + complex_share),
        reading_ease,
        word_total/225,
        count_pronouns(text),
        text.count("?"),
        text.count("!"),
        chars_per_word,
        length_label,
    ]

# ============================================================
# COLUMNS
# ============================================================
# Header for the baseline output: URL_ID and URL followed by the 13 values
# returned by analyze_original, in the same order.
original_cols = [
    "URL_ID","URL","POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE",
    "SUBJECTIVITY SCORE","AVG SENTENCE LENGTH","PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT",
    "WORD COUNT","SYLLABLE PER WORD","PERSONAL PRONOUNS","AVG WORD LENGTH"
]

# Header for the extended output: URL_ID and URL followed by the 24 values
# returned by analyze_extended, in the same order.
extended_cols = [
    "URL_ID","URL","DOMAIN","POSITIVE SCORE","NEGATIVE SCORE","NEUTRAL SCORE",
    "POLARITY SCORE","SENTIMENT INTENSITY","EMOTIONALITY SCORE",
    "SUBJECTIVITY SCORE","WORD COUNT","UNIQUE WORD COUNT","TYPE TOKEN RATIO",
    "SENTENCE COUNT","AVG SENTENCE LENGTH","PARAGRAPH COUNT",
    "COMPLEX WORD COUNT","PERCENTAGE OF COMPLEX WORDS","FOG INDEX",
    "FLESCH READING EASE","READING TIME (MIN)","PERSONAL PRONOUNS",
    "QUESTION COUNT","EXCLAMATION COUNT","AVG WORD LENGTH",
    "ARTICLE LENGTH CATEGORY"
]

# ============================================================
# 1. FETCH 200 REAL ARTICLES
# ============================================================
RSS_FEEDS = [
    "http://feeds.bbci.co.uk/news/technology/rss.xml",
    "https://www.theverge.com/rss/index.xml",
    "https://www.reuters.com/rssFeed/technologyNews"
]

MAX_ARTICLES = 200
urls, texts = [], []
seen_urls = set()  # every URL already attempted, accepted or not

for feed_url in RSS_FEEDS:
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        url = entry.get("link")  # tolerate entries without a link
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)
        text = extract_article_text(url)
        time.sleep(0.1)  # polite scraping: pause between page downloads
        if len(text.split()) > 50:
            urls.append(url)
            texts.append(text)
        if len(urls) >= MAX_ARTICLES:
            break
    if len(urls) >= MAX_ARTICLES:
        break

print(f"Fetched {len(urls)} valid articles with text >50 words")

# ============================================================
# 2. SAVE INPUT.XLSX
# ============================================================
# Build the identifiers once so Input.xlsx and both outputs share them.
url_ids = [f"AUTO_{i+1:03d}" for i in range(len(urls))]
df_input = pd.DataFrame({"URL_ID": url_ids, "URL": urls})
df_input.to_excel("Input.xlsx", index=False)

# ============================================================
# 3. GENERATE METRICS
# ============================================================
# The article texts are already in memory, so this loop makes no network
# requests and needs no delay between iterations.
orig_rows, ext_rows = [], []

for url_id, url, text in zip(url_ids, urls, texts):
    orig_rows.append([url_id, url] + safe_analyze(analyze_original, text))
    ext_rows.append([url_id, url] + safe_analyze(analyze_extended, text, url))

df_output = pd.DataFrame(orig_rows, columns=original_cols)
df_extended = pd.DataFrame(ext_rows, columns=extended_cols)

# ============================================================
# 4. SAVE XLSX + CSV
# ============================================================
df_output.to_excel("Output.xlsx", index=False)
df_extended.to_excel("Output_Extended.xlsx", index=False)
df_output.to_csv("Output.csv", index=False)
df_extended.to_csv("Output_Extended.csv", index=False)

# ============================================================
# 5. VERIFY
# ============================================================
print("Original columns:", list(df_output.columns))
print("Extended columns:", list(df_extended.columns))
print("Sample of extended metrics:")
print(df_extended.head(60))

In [None]:
# Number of rows in the extended metrics table (one per processed article)
print("Total articles processed:", len(df_extended))

# Show the last 5 rows (DataFrame.tail default)
print(df_extended.tail())

# Summary statistics; describe() covers only the numeric columns by default
print(df_extended.describe())

In [None]:
# Download the NLTK resources used by the tokenizers, stop-word list and
# VADER analyzer. 'punkt_tab' is the tokenizer resource newer NLTK releases
# load in place of 'punkt'; fetching both works on either.
for resource in ("punkt", "punkt_tab", "stopwords", "vader_lexicon"):
    nltk.download(resource)

In [None]:
# ============================================================
# INSTALL DEPENDENCIES (Colab only)
# ============================================================
# !pip install pandas feedparser requests beautifulsoup4 nltk openpyxl

# ============================================================
# IMPORTS
# ============================================================
import pandas as pd
import feedparser
import requests
import nltk
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import time

# ============================================================
# NLTK SETUP
# ============================================================

# English stop-word set used by clean_tokens, and the VADER analyzer used by
# the analyze_* functions. Both need their NLTK data ('stopwords',
# 'vader_lexicon') to be downloaded before this cell runs.
# NOTE(review): this rebinds `stop_words`, which an earlier cell filled from
# the custom StopWords files; confirm the replacement is intended.
stop_words = set(stopwords.words("english"))
sia = SentimentIntensityAnalyzer()

# ============================================================
# HELPER FUNCTIONS
# ============================================================
def extract_article_text(url):
    """Download a page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the article page.

    Returns:
        The non-empty paragraphs, one per line (``"\\n"``-separated), or ``""``
        when the download fails, the server answers with an HTTP error status,
        or parsing fails.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx bodies as failures instead of analysing error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Collapse whitespace inside each paragraph so that every paragraph
        # occupies exactly one line of the result.
        paragraphs = (" ".join(p.get_text().split()) for p in soup.find_all("p"))
        # Newline-separated so analyze_extended's split on "\n" can count
        # paragraphs; word- and whitespace-based metrics are unaffected.
        return "\n".join(p for p in paragraphs if p)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return ""

def clean_tokens(text):
    """Tokenize lower-cased ``text`` and keep alphabetic, non-stop-word tokens.

    Returns:
        A list of tokens in document order; tokens that are not purely
        alphabetic or that appear in ``stop_words`` are dropped.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def count_syllables(word):
    """Estimate the number of syllables in ``word`` by counting vowel groups.

    Each run of consecutive vowels (a, e, i, o, u) counts as one syllable; one
    is subtracted for words ending in "es" or "ed". The comparison is
    case-insensitive.

    Args:
        word: A single word.

    Returns:
        0 for an empty string, otherwise an estimate of at least 1.
    """
    word = word.lower()
    if not word:
        # Previously raised IndexError on word[0].
        return 0

    vowels = "aeiou"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    if word.endswith(("es", "ed")):
        count -= 1
    return max(count, 1)

def count_pronouns(text):
    """Count the personal pronouns I, we, my, ours and us in ``text``.

    Matching is case-insensitive and restricted to whole words, except that
    the all-capitals form "US" is skipped so the country abbreviation is not
    counted as the pronoun "us".

    Returns:
        The number of pronoun occurrences.
    """
    matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    return sum(1 for match in matches if match != "US")

def safe_analyze(func, *args):
    """Run a metric function, substituting a row of zeros if it fails.

    Args:
        func: Callable to run; expected to return a list of metrics.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        ``func(*args)`` when it succeeds with a non-None result. Otherwise a
        list of zeros: 13 entries when ``func`` is named "analyze_original",
        24 entries for anything else.
    """
    name = getattr(func, "__name__", "")
    try:
        result = func(*args)
        # Explicit check instead of `assert`, which is removed under -O.
        if result is None:
            raise ValueError(f"{name} returned None")
        return result
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        # Report the failure so zero rows can be told apart from real results.
        print(f"safe_analyze: {name} failed ({exc!r}); using zeros")
        return [0] * (13 if name == "analyze_original" else 24)

# ============================================================
# ORIGINAL METRICS
# ============================================================
def analyze_original(text):
    """Compute the 13 baseline metrics for one article.

    Values are returned in the order of ``original_cols`` (after ``URL_ID``
    and ``URL``): positive, negative and compound VADER scores, pos + neg,
    words per sentence, fraction of complex words, fog index, words per
    sentence again, complex word count, word count, syllables per word,
    personal pronoun count and average word length.

    Word-based figures use the tokens kept by ``clean_tokens``; a word is
    complex when ``count_syllables`` reports more than two syllables. Ratios
    are 0 when their denominator is 0.
    """
    sentence_total = len(sent_tokenize(text))
    words = clean_tokens(text)
    word_total = len(words)
    scores = sia.polarity_scores(text)

    syllables_per_token = [count_syllables(w) for w in words]
    complex_total = sum(1 for n in syllables_per_token if n > 2)

    if sentence_total:
        words_per_sentence = word_total / sentence_total
    else:
        words_per_sentence = 0

    if word_total:
        complex_share = complex_total / word_total
        syllables_per_word = sum(syllables_per_token) / word_total
        chars_per_word = sum(map(len, words)) / word_total
    else:
        complex_share = syllables_per_word = chars_per_word = 0

    fog_index = 0.4 * (words_per_sentence + complex_share)

    return [
        scores['pos'],
        scores['neg'],
        scores['compound'],
        scores['pos'] + scores['neg'],
        words_per_sentence,
        complex_share,
        fog_index,
        words_per_sentence,
        complex_total,
        word_total,
        syllables_per_word,
        count_pronouns(text),
        chars_per_word,
    ]

# ============================================================
# EXTENDED METRICS
# ============================================================
def analyze_extended(text, url):
    """Compute the 24 extended metrics for one article.

    Values are returned in the order of ``extended_cols`` (after ``URL_ID``
    and ``URL``), starting with the network location of ``url`` and ending
    with the length category ("Short" below 500 kept words, "Medium" up to
    1200, otherwise "Long").

    Word-based figures use the tokens kept by ``clean_tokens``; paragraphs are
    the non-blank pieces of ``text`` split on newlines. Ratios over the word
    count are 0 when no words were kept.
    """
    sentence_total = len(sent_tokenize(text))
    paragraph_total = sum(1 for chunk in text.split("\n") if chunk.strip())
    words = clean_tokens(text)
    word_total = len(words)

    scores = sia.polarity_scores(text)
    emotionality = scores['pos'] + scores['neg']
    distinct_total = len(set(words))
    syllables_per_token = [count_syllables(w) for w in words]
    complex_total = sum(1 for n in syllables_per_token if n > 2)
    syllable_total = sum(syllables_per_token)

    if sentence_total:
        words_per_sentence = word_total / sentence_total
    else:
        words_per_sentence = 0

    if word_total:
        complex_share = complex_total / word_total
        reading_ease = 206.835 - 1.015*words_per_sentence - 84.6*(syllable_total/word_total)
        type_token_ratio = distinct_total / word_total
        chars_per_word = sum(map(len, words)) / word_total
    else:
        complex_share = reading_ease = type_token_ratio = chars_per_word = 0

    if word_total < 500:
        length_label = "Short"
    elif word_total <= 1200:
        length_label = "Medium"
    else:
        length_label = "Long"

    return [
        urlparse(url).netloc,
        scores['pos'],
        scores['neg'],
        scores['neu'],
        scores['compound'],
        abs(scores['compound']),
        emotionality,
        emotionality/(word_total+1e-6),
        word_total,
        distinct_total,
        type_token_ratio,
        sentence_total,
        words_per_sentence,
        paragraph_total,
        complex_total,
        complex_share,
        0.4*(words_per_sentence + complex_share),
        reading_ease,
        word_total/225,
        count_pronouns(text),
        text.count("?"),
        text.count("!"),
        chars_per_word,
        length_label,
    ]

# ============================================================
# COLUMNS
# ============================================================
# Header for the baseline output: URL_ID and URL followed by the 13 values
# returned by analyze_original, in the same order.
original_cols = [
    "URL_ID","URL","POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE",
    "SUBJECTIVITY SCORE","AVG SENTENCE LENGTH","PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT",
    "WORD COUNT","SYLLABLE PER WORD","PERSONAL PRONOUNS","AVG WORD LENGTH"
]

# Header for the extended output: URL_ID and URL followed by the 24 values
# returned by analyze_extended, in the same order.
extended_cols = [
    "URL_ID","URL","DOMAIN","POSITIVE SCORE","NEGATIVE SCORE","NEUTRAL SCORE",
    "POLARITY SCORE","SENTIMENT INTENSITY","EMOTIONALITY SCORE",
    "SUBJECTIVITY SCORE","WORD COUNT","UNIQUE WORD COUNT","TYPE TOKEN RATIO",
    "SENTENCE COUNT","AVG SENTENCE LENGTH","PARAGRAPH COUNT",
    "COMPLEX WORD COUNT","PERCENTAGE OF COMPLEX WORDS","FOG INDEX",
    "FLESCH READING EASE","READING TIME (MIN)","PERSONAL PRONOUNS",
    "QUESTION COUNT","EXCLAMATION COUNT","AVG WORD LENGTH",
    "ARTICLE LENGTH CATEGORY"
]

# ============================================================
# 1. FETCH 200 REAL ARTICLES
# ============================================================
RSS_FEEDS = [
    # BBC Technology
    "http://feeds.bbci.co.uk/news/technology/rss.xml",
    # The Verge
    "https://www.theverge.com/rss/index.xml",
    # Reuters Technology
    "https://www.reuters.com/rssFeed/technologyNews",
    # New York Times Technology
    "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    # Wired
    "https://www.wired.com/feed/rss",
    # Ars Technica
    "http://feeds.arstechnica.com/arstechnica/index",
    # TechCrunch
    "http://feeds.feedburner.com/TechCrunch/",
    # CNET
    "https://www.cnet.com/rss/news/",
    # Engadget
    "https://www.engadget.com/rss.xml",
    # MIT Technology Review
    "https://www.technologyreview.com/feed/"
]

MAX_ARTICLES = 200
urls, texts = [], []
seen_urls = set()  # every URL already attempted, accepted or not

for feed_url in RSS_FEEDS:
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        url = entry.get("link")  # tolerate entries without a link
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)
        text = extract_article_text(url)
        time.sleep(0.1)  # polite scraping: pause between page downloads
        if len(text.split()) > 50:
            urls.append(url)
            texts.append(text)
        if len(urls) >= MAX_ARTICLES:
            break
    if len(urls) >= MAX_ARTICLES:
        break

print(f"Fetched {len(urls)} valid articles with text >50 words")

# ============================================================
# 2. SAVE INPUT.XLSX
# ============================================================
# Build the identifiers once so Input.xlsx and both outputs share them.
url_ids = [f"AUTO_{i+1:03d}" for i in range(len(urls))]
df_input = pd.DataFrame({"URL_ID": url_ids, "URL": urls})
df_input.to_excel("Input.xlsx", index=False)

# ============================================================
# 3. GENERATE METRICS
# ============================================================
# The article texts are already in memory, so this loop makes no network
# requests and needs no delay between iterations.
orig_rows, ext_rows = [], []

for url_id, url, text in zip(url_ids, urls, texts):
    orig_rows.append([url_id, url] + safe_analyze(analyze_original, text))
    ext_rows.append([url_id, url] + safe_analyze(analyze_extended, text, url))

df_output = pd.DataFrame(orig_rows, columns=original_cols)
df_extended = pd.DataFrame(ext_rows, columns=extended_cols)

# ============================================================
# 4. SAVE XLSX + CSV
# ============================================================
df_output.to_excel("Output.xlsx", index=False)
df_extended.to_excel("Output_Extended.xlsx", index=False)
df_output.to_csv("Output.csv", index=False)
df_extended.to_csv("Output_Extended.csv", index=False)

# ============================================================
# 5. VERIFY
# ============================================================
print("Original columns:", list(df_output.columns))
print("Extended columns:", list(df_extended.columns))
print("Sample of extended metrics:")
print(df_extended.head(3))

In [None]:
# Number of rows in the extended metrics table (one per processed article)
print("Total articles processed:", len(df_extended))

# Show the last 2 rows
print(df_extended.tail(2))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# ============================================================
# 1. LOAD OUTPUT FILES
# ============================================================
# Reload both result files written by the metrics cell.
df_orig = pd.read_excel("Output.xlsx")
df_ext = pd.read_excel("Output_Extended.xlsx")

# ============================================================
# 2. MERGE ON URL_ID
# ============================================================
# URL_ID is the only join key, so every other column name the two files share
# (URL included) appears twice, suffixed with _orig and _ext.
df_compare = pd.merge(df_orig, df_ext, on="URL_ID", suffixes=("_orig", "_ext"))

# ============================================================
# 3. COMPUTE DIFFERENCES
# ============================================================
# A one-element list wrapping the tuple of metric names; the names are read
# through metrics_to_compare[0] everywhere below.
metrics_to_compare = [
    ("POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE",
     "SUBJECTIVITY SCORE", "AVG SENTENCE LENGTH",
     "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX", "WORD COUNT")
]

# Store extended-minus-original as a new "<metric>_diff" column per metric
for metric in metrics_to_compare[0]:
    df_compare[f"{metric}_diff"] = df_compare[f"{metric}_ext"] - df_compare[f"{metric}_orig"]

# ============================================================
# 4. SUMMARY STATISTICS
# ============================================================
# describe() over the *_diff columns only
summary = df_compare[[f"{m}_diff" for m in metrics_to_compare[0]]].describe()
print("Comparison Summary (Extended - Original):")
print(summary)

# ============================================================
# 5. PLOT COMPARISON
# ============================================================
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
plt.figure(figsize=(12,6))

# One scatter panel per metric in a 2x4 grid (eight metrics, eight panels):
# original value on x, extended value on y.
for i, metric in enumerate(metrics_to_compare[0]):
    plt.subplot(2,4,i+1)
    plt.scatter(df_compare[f"{metric}_orig"], df_compare[f"{metric}_ext"], alpha=0.6)
    # Reference line y = x spanning the range of the original values; points
    # on it have identical original and extended values.
    plt.plot([df_compare[f"{metric}_orig"].min(), df_compare[f"{metric}_orig"].max()],
             [df_compare[f"{metric}_orig"].min(), df_compare[f"{metric}_orig"].max()],
             color='red', linestyle='--')  # 45-degree line
    plt.xlabel("Original")
    plt.ylabel("Extended")
    plt.title(metric)

plt.tight_layout()
plt.show()

# ============================================================
# 6. OPTIONAL: AVERAGE IMPROVEMENT
# ============================================================
# Mean of each *_diff column (extended minus original)
avg_improvements = df_compare[[f"{m}_diff" for m in metrics_to_compare[0]]].mean()
print("\nAverage Improvement (Extended - Original):")
print(avg_improvements)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ============================================================
# 1. LOAD EXTENDED OUTPUT
# ============================================================
# Reload the extended results written by the metrics cell.
df_ext = pd.read_excel("Output_Extended.xlsx")

# ============================================================
# 2. SUMMARY STATISTICS OF EXTENDED-ONLY METRICS
# ============================================================
# Columns summarised and plotted below.
# NOTE(review): "FOG INDEX" and "COMPLEX WORD COUNT" are also columns of the
# baseline output, so the list is not strictly extended-only.
extended_only_metrics = [
    "UNIQUE WORD COUNT", "TYPE TOKEN RATIO", "FLESCH READING EASE",
    "FOG INDEX", "READING TIME (MIN)", "QUESTION COUNT",
    "EXCLAMATION COUNT", "COMPLEX WORD COUNT"
]

print("Summary statistics for extended metrics:")
print(df_ext[extended_only_metrics].describe())

# ============================================================
# 3. DISTRIBUTION PLOTS
# ============================================================
sns.set(style="whitegrid")
plt.figure(figsize=(18,12))

# One histogram with a KDE overlay per metric; eight metrics in a 3x3 grid
# leave the last panel empty.
for i, metric in enumerate(extended_only_metrics):
    plt.subplot(3,3,i+1)
    sns.histplot(df_ext[metric], kde=True, color="skyblue", bins=20)
    plt.title(metric)
    plt.xlabel("")
    plt.ylabel("Count")

plt.tight_layout()
plt.show()

# ============================================================
# 4. ARTICLE LENGTH CATEGORY DISTRIBUTION
# ============================================================
# Bar chart of how many articles fall in each length category
plt.figure(figsize=(6,4))
sns.countplot(x="ARTICLE LENGTH CATEGORY", data=df_ext, palette="Set2")
plt.title("Distribution of Article Length Categories")
plt.ylabel("Number of Articles")
plt.show()

# ============================================================
# 5. SCATTER PLOTS TO SHOW RELATIONSHIPS
# ============================================================
plt.figure(figsize=(12,5))

# TYPE TOKEN RATIO vs FOG Index
plt.subplot(1,2,1)
sns.scatterplot(x="TYPE TOKEN RATIO", y="FOG INDEX", data=df_ext)
plt.title("Type Token Ratio vs FOG Index")
plt.xlabel("Type Token Ratio")
plt.ylabel("FOG Index")

# Flesch Reading Ease vs Reading Time
plt.subplot(1,2,2)
sns.scatterplot(x="FLESCH READING EASE", y="READING TIME (MIN)", data=df_ext)
plt.title("Flesch Reading Ease vs Reading Time")
plt.xlabel("Flesch Reading Ease")
plt.ylabel("Reading Time (minutes)")

plt.tight_layout()
plt.show()

# ============================================================
# 6. TOP 10 ARTICLES BY COMPLEX WORD COUNT
# ============================================================
# Sort descending and keep the first ten rows
top_complex = df_ext.sort_values("COMPLEX WORD COUNT", ascending=False).head(10)
print("Top 10 articles by Complex Word Count:")
print(top_complex[["URL_ID","URL","COMPLEX WORD COUNT","FOG INDEX","TYPE TOKEN RATIO"]])

In [None]:
# ============================================================
# INSTALL (Colab Only)
# ============================================================
# !pip install wordcloud matplotlib seaborn

# ============================================================
# IMPORTS
# ============================================================
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# ============================================================
# LOAD EXTENDED OUTPUT
# ============================================================
df_ext = pd.read_excel("Output_Extended.xlsx")

# ============================================================
# 1. GENERATE WORD CLOUD
# ============================================================
# Use the article text when the file has an ARTICLE_TEXT column; otherwise
# fall back to the URLs as a placeholder. The column is tested explicitly
# (instead of catching every exception) so unrelated errors are not hidden,
# and missing cells are dropped because str.join rejects NaN values.
if "ARTICLE_TEXT" in df_ext.columns:
    all_text = " ".join(df_ext["ARTICLE_TEXT"].dropna().astype(str))
else:
    print("No text column found; word cloud will use URLs as placeholder")
    all_text = " ".join(df_ext["URL"].dropna().astype(str))

# Create word cloud (collocations=False counts single words only)
wordcloud = WordCloud(width=1200, height=600, background_color='white', collocations=False).generate(all_text)

# Plot the word cloud
plt.figure(figsize=(15,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud of All Articles", fontsize=20)
plt.show()

# ============================================================
# 2. FIND MOST COMPLEX ARTICLES
# ============================================================
# Complexity score = complex word count multiplied by the fog index
df_ext['COMPLEXITY_SCORE'] = df_ext['COMPLEX WORD COUNT'] * df_ext['FOG INDEX']

# Top 10 most complex articles
top_complex = df_ext.sort_values('COMPLEXITY_SCORE', ascending=False).head(10)

print("Top 10 Most Complex Articles:")
print(top_complex[['URL_ID','URL','COMPLEX WORD COUNT','FOG INDEX','COMPLEXITY_SCORE']])


In [None]:
!pip install pandas feedparser requests beautifulsoup4 nltk openpyxl wordcloud seaborn matplotlib

In [None]:
# Download the NLTK resources used by the tokenizers, stop-word list and
# VADER analyzer. 'punkt_tab' is the tokenizer resource newer NLTK releases
# load in place of 'punkt'; fetching both works on either.
for resource in ("punkt", "punkt_tab", "stopwords", "vader_lexicon"):
    nltk.download(resource)

In [None]:

# ============================================================
# IMPORTS
# ============================================================
import pandas as pd
import feedparser
import requests
import nltk
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import time

# ============================================================
# NLTK SETUP
# ============================================================
# English stop-word set used by clean_tokens, and the VADER analyzer used by
# the analyze_* functions. Both need their NLTK data ('stopwords',
# 'vader_lexicon') to be downloaded before this cell runs.
# NOTE(review): this rebinds `stop_words`, which an earlier cell filled from
# the custom StopWords files; confirm the replacement is intended.
stop_words = set(stopwords.words("english"))
sia = SentimentIntensityAnalyzer()

# ============================================================
# HELPER FUNCTIONS
# ============================================================
def extract_article_text(url):
    """Download a page and return the text of its <p> tags joined by spaces.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The stripped paragraph text, or "" when the request fails, the server
        answers with an HTTP error status, or the page cannot be parsed.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures so that error pages ("404 not found",
        # paywall notices, ...) are not scored as if they were articles.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all("p"))
        return text.strip()
    except Exception:
        # Scraping is best effort: any network or parsing problem yields "".
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupts working while a long crawl is running.
        return ""

def clean_tokens(text):
    """Tokenize the lower-cased text and keep alphabetic tokens that are not stop words."""
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation, numbers and mixed tokens such as "covid-19".
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def count_syllables(word):
    """Estimate the number of syllables in a word by counting vowel groups.

    Every run of consecutive vowels (a, e, i, o, u) counts as one syllable.
    One syllable is subtracted for words ending in "es" or "ed", and any
    non-empty word has at least one syllable.

    Args:
        word: The word to measure; letter case is ignored.

    Returns:
        The estimated syllable count, or 0 for an empty string.
    """
    # Case-fold first so that "Apple" and "apple" give the same answer.
    word = word.lower()
    if not word:
        # An empty string has no first character to inspect and no syllables.
        return 0

    vowels = "aeiou"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a vowel run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # Trailing "es"/"ed" is treated as silent (e.g. "jumped").
    if word.endswith(("es", "ed")):
        count -= 1
    return max(count, 1)

def count_pronouns(text):
    """Count personal pronouns (I, we, my, ours, us) in the text.

    Matching is case-insensitive and limited to whole words. The all-caps
    token "US" is skipped because it is far more likely to be the country
    abbreviation than the pronoun.

    Args:
        text: The text to scan.

    Returns:
        Number of pronoun occurrences found.
    """
    matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    return sum(1 for match in matches if match != "US")

def safe_analyze(func, *args):
    """Run a metric function and fall back to a row of zeros if it fails.

    Args:
        func: Metric function to call (analyze_original or analyze_extended).
        *args: Positional arguments forwarded to func.

    Returns:
        The result of func(*args). If func raises an Exception or returns
        None, a list of zeros is returned instead: 13 zeros when the function
        is named "analyze_original", 24 zeros otherwise.
    """
    # Resolve the fallback up front. getattr keeps this safe for callables that
    # have no __name__ (e.g. functools.partial objects).
    is_original = getattr(func, "__name__", "") == "analyze_original"
    fallback = [0] * (13 if is_original else 24)

    try:
        result = func(*args)
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return fallback

    # Explicit check instead of `assert`, which is stripped when Python runs with -O.
    if result is None:
        return fallback
    return result

# ============================================================
# METRIC FUNCTIONS
# ============================================================
def analyze_original(text):
    """Compute the 13 base sentiment and readability metrics for one article.

    Sentiment values come from the shared `sia` analyzer; word-based values
    are computed on the tokens returned by clean_tokens. A word is "complex"
    when count_syllables reports more than two syllables.

    Args:
        text: Full article text.

    Returns:
        A 13-item list: positive, negative, compound, positive + negative,
        words per sentence, share of complex words, 0.4 * (words per sentence
        + share of complex words), words per sentence (again), complex word
        count, word count, syllables per word, pronoun count, mean word length.
        Ratios are 0 when their denominator is 0.
    """
    sentence_total = len(sent_tokenize(text))
    words = clean_tokens(text)
    word_total = len(words)
    polarity = sia.polarity_scores(text)

    # Syllable count per token, shared by the complex-word and per-word figures.
    syllables_by_word = [count_syllables(w) for w in words]
    complex_total = sum(1 for n in syllables_by_word if n > 2)

    if sentence_total:
        words_per_sentence = word_total / sentence_total
    else:
        words_per_sentence = 0

    if word_total:
        complex_share = complex_total / word_total
        syllables_per_word = sum(syllables_by_word) / word_total
        mean_word_length = sum(map(len, words)) / word_total
    else:
        complex_share = 0
        syllables_per_word = 0
        mean_word_length = 0

    positive = polarity['pos']
    negative = polarity['neg']
    fog = 0.4*(words_per_sentence + complex_share)

    metrics = [positive, negative, polarity['compound'], positive + negative]
    metrics += [words_per_sentence, complex_share, fog, words_per_sentence]
    metrics += [complex_total, word_total, syllables_per_word]
    metrics += [count_pronouns(text), mean_word_length]
    return metrics

def analyze_extended(text, url):
    """Compute the 24 extended metrics for one article.

    Args:
        text: Full article text.
        url: Article URL; only its network location (domain) is reported.

    Returns:
        A 24-item list: domain, the four analyzer scores (pos, neg, neu,
        compound), |compound|, pos + neg, (pos + neg) / word count, word
        count, unique word count, type-token ratio, sentence count, words per
        sentence, paragraph count, complex word count, share of complex
        words, 0.4 * (words per sentence + share of complex words), Flesch
        reading ease, word count / 225, pronoun count, "?" count, "!" count,
        mean word length, and a Short / Medium / Long length label.
    """
    sentence_total = len(sent_tokenize(text))
    # Paragraphs are the non-blank pieces between newline characters.
    # NOTE(review): extract_article_text joins <p> blocks with spaces, so scraped
    # text may hold few newlines and this count may stay low - confirm intent.
    paragraph_total = sum(1 for chunk in text.split("\n") if chunk.strip())
    words = clean_tokens(text)
    word_total = len(words)

    polarity = sia.polarity_scores(text)
    distinct_total = len(set(words))
    syllables_by_word = [count_syllables(w) for w in words]
    complex_total = sum(1 for n in syllables_by_word if n > 2)
    syllable_total = sum(syllables_by_word)

    if sentence_total:
        words_per_sentence = word_total / sentence_total
    else:
        words_per_sentence = 0

    if word_total:
        complex_share = complex_total / word_total
        reading_ease = 206.835 - 1.015*words_per_sentence - 84.6*(syllable_total/word_total)
        type_token_ratio = distinct_total / word_total
        mean_word_length = sum(map(len, words)) / word_total
    else:
        complex_share = 0
        reading_ease = 0
        type_token_ratio = 0
        mean_word_length = 0

    if word_total < 500:
        length_label = "Short"
    elif word_total <= 1200:
        length_label = "Medium"
    else:
        length_label = "Long"

    positive = polarity['pos']
    negative = polarity['neg']
    compound = polarity['compound']

    metrics = [urlparse(url).netloc]
    metrics += [positive, negative, polarity['neu'], compound, abs(compound)]
    # The tiny epsilon avoids dividing by zero for articles without usable words.
    metrics += [positive + negative, (positive + negative)/(word_total+1e-6)]
    metrics += [word_total, distinct_total, type_token_ratio]
    metrics += [sentence_total, words_per_sentence, paragraph_total]
    metrics += [complex_total, complex_share, 0.4*(words_per_sentence + complex_share)]
    metrics += [reading_ease, word_total/225]
    metrics += [count_pronouns(text), text.count("?"), text.count("!")]
    metrics += [mean_word_length, length_label]
    return metrics

# ============================================================
# COLUMN DEFINITIONS
# ============================================================
# Header for Output.xlsx / Output.csv: URL_ID and URL followed by the 13 values
# returned by analyze_original, in the same order. AVG SENTENCE LENGTH and
# AVG NUMBER OF WORDS PER SENTENCE receive the same value from that function.
original_cols = [
    "URL_ID","URL","POSITIVE SCORE","NEGATIVE SCORE","POLARITY SCORE",
    "SUBJECTIVITY SCORE","AVG SENTENCE LENGTH","PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT",
    "WORD COUNT","SYLLABLE PER WORD","PERSONAL PRONOUNS","AVG WORD LENGTH"
]

# Header for Output_Extended.*: URL_ID, URL and ARTICLE_TEXT followed by the
# 24 values returned by analyze_extended, in the same order. Keep the two in
# sync: pd.DataFrame(..., columns=...) rejects rows of a different width.
extended_cols = [
    "URL_ID","URL","ARTICLE_TEXT","DOMAIN","POSITIVE SCORE","NEGATIVE SCORE","NEUTRAL SCORE",
    "POLARITY SCORE","SENTIMENT INTENSITY","EMOTIONALITY SCORE",
    "SUBJECTIVITY SCORE","WORD COUNT","UNIQUE WORD COUNT","TYPE TOKEN RATIO",
    "SENTENCE COUNT","AVG SENTENCE LENGTH","PARAGRAPH COUNT",
    "COMPLEX WORD COUNT","PERCENTAGE OF COMPLEX WORDS","FOG INDEX",
    "FLESCH READING EASE","READING TIME (MIN)","PERSONAL PRONOUNS",
    "QUESTION COUNT","EXCLAMATION COUNT","AVG WORD LENGTH",
    "ARTICLE LENGTH CATEGORY"
]

# ============================================================
# RSS FEEDS (Expanded for 200+ articles)
# ============================================================
# Feeds to crawl, in priority order. Each feed is listed once (the Reuters
# technology feed used to appear twice, which only caused repeat downloads).
RSS_FEEDS = [
    "http://feeds.bbci.co.uk/news/technology/rss.xml",
    "https://www.theverge.com/rss/index.xml",
    "https://www.reuters.com/rssFeed/technologyNews",
    "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    "https://www.wired.com/feed/rss",
    "http://feeds.arstechnica.com/arstechnica/index",
    "http://feeds.feedburner.com/TechCrunch/",
    "https://www.cnet.com/rss/news/",
    "https://www.engadget.com/rss.xml",
    "https://www.technologyreview.com/feed/"
]

# Stop crawling once this many usable articles have been collected.
MAX_ARTICLES = 200
# Parallel lists: texts[i] is the extracted text of urls[i].
urls, texts = [], []
# Every link already attempted, including ones rejected as too short, so that
# no page is downloaded twice when it shows up in several feeds.
seen_urls = set()

for feed_url in RSS_FEEDS:
    if len(urls) >= MAX_ARTICLES:
        break
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        if len(urls) >= MAX_ARTICLES:
            break
        # Some feed entries carry no link; skip them instead of raising.
        url = entry.get("link")
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)
        text = extract_article_text(url)
        # Keep only pages that yielded more than 50 whitespace-separated words.
        if len(text.split()) > 50:
            urls.append(url)
            texts.append(text)

print(f"Fetched {len(urls)} valid articles with text >50 words")

# ============================================================
# SAVE INPUT.XLSX
# ============================================================
# One row per fetched article: a generated ID (AUTO_001, AUTO_002, ...) and its URL.
# NOTE: this rebinds `df_input`, which an earlier cell loaded from the
# downloaded Input.xlsx, and writes a new Input.xlsx into the working directory.
df_input = pd.DataFrame({
    "URL_ID": [f"AUTO_{i+1:03d}" for i in range(len(urls))],
    "URL": urls
})
df_input.to_excel("Input.xlsx", index=False)

# ============================================================
# GENERATE METRICS
# ============================================================
# Row buffers for the two output tables; converted to DataFrames in one step
# after the loop instead of growing a DataFrame row by row.
orig_rows, ext_rows = [], []

# Article texts were already downloaded above, so this loop is pure computation
# and needs no pause between iterations.
for i, (url, text) in enumerate(zip(urls, texts), start=1):
    # Same ID scheme as the Input.xlsx rows: AUTO_001, AUTO_002, ...
    url_id = f"AUTO_{i:03d}"
    orig_metrics = safe_analyze(analyze_original, text)
    ext_metrics = safe_analyze(analyze_extended, text, url)
    orig_rows.append([url_id, url] + orig_metrics)
    ext_rows.append([url_id, url, text] + ext_metrics)

df_output = pd.DataFrame(orig_rows, columns=original_cols)
df_extended = pd.DataFrame(ext_rows, columns=extended_cols)

# ============================================================
# SAVE OUTPUT FILES
# ============================================================
# Write both tables as Excel and CSV into the working directory.
# index=False leaves the DataFrame row index out of the files.
# Later cells read Output_Extended.xlsx back in, so that file name matters.
df_output.to_excel("Output.xlsx", index=False)
df_extended.to_excel("Output_Extended.xlsx", index=False)
df_output.to_csv("Output.csv", index=False)
df_extended.to_csv("Output_Extended.csv", index=False)

# ============================================================
# 1. GENERATE WORD CLOUD
# ============================================================
# Concatenate every article into one string as input for the word cloud.
all_text = " ".join(df_extended['ARTICLE_TEXT'])
# The rendering below is currently disabled (commented out); only `all_text`
# is computed by this section.
#wordcloud = WordCloud(width=1200, height=600, background_color='white', collocations=False).generate(all_text)

#plt.figure(figsize=(15,7))
#plt.imshow(wordcloud, interpolation='bilinear')
#plt.axis('off')
#plt.title("Word Cloud of All Articles", fontsize=20)
#plt.show()

# ============================================================
# 2. MOST COMPLEX ARTICLES
# ============================================================
# Complexity metric = COMPLEX WORD COUNT multiplied by FOG INDEX.
# Added after the output files were written, so COMPLEXITY_SCORE exists only on
# the in-memory df_extended and not in Output_Extended.xlsx / .csv.
df_extended['COMPLEXITY_SCORE'] = df_extended['COMPLEX WORD COUNT'] * df_extended['FOG INDEX']
# Ten rows with the highest score, highest first.
top_complex = df_extended.sort_values('COMPLEXITY_SCORE', ascending=False).head(10)

print("Top 10 Most Complex Articles:")
print(top_complex[['URL_ID','URL','COMPLEX WORD COUNT','FOG INDEX','COMPLEXITY_SCORE']])

In [None]:
# ============================================================
# INSTALL TRANSFORMERS
# ============================================================
# !pip install transformers torch

# ============================================================
# IMPORTS
# ============================================================
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# ============================================================
# LOAD EXTENDED DATA
# ============================================================
# Reload the extended table from disk; this rebinds `df_ext` for the rest of the notebook.
df_ext = pd.read_excel("Output_Extended.xlsx")

# ============================================================
# INITIALIZE HUGGING FACE SENTIMENT ANALYZER
# ============================================================
# The model weights are downloaded on first use.
sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# ============================================================
# 1. GET MODEL PREDICTIONS
# ============================================================
# One label per row of df_ext, in row order.
pred_labels = []

for text in tqdm(df_ext['ARTICLE_TEXT'], desc="Analyzing Sentiment with Model"):
    # Cells that are not strings (e.g. empty cells read back as NaN) or are blank
    # get a placeholder label without calling the model.
    if not isinstance(text, str) or len(text.strip()) == 0:
        pred_labels.append("NEUTRAL")  # handle empty text
    else:
        # NOTE: the slice keeps the first 512 *characters*, not tokens, so only
        # the opening of each article is classified.
        res = sentiment_model(text[:512])[0]
        pred_labels.append(res['label'].upper())  # "POSITIVE" or "NEGATIVE"

df_ext['MODEL_SENTIMENT'] = pred_labels

# ============================================================
# 2. VADER SENTIMENT LABEL
# ============================================================
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Two-way VADER label per article: a compound score of zero or above counts as
# POSITIVE, anything below zero as NEGATIVE. str() guards against non-string cells.
vader_labels = [
    "POSITIVE" if sia.polarity_scores(str(article))['compound'] >= 0 else "NEGATIVE"
    for article in df_ext['ARTICLE_TEXT']
]

df_ext['VADER_SENTIMENT'] = vader_labels

# ============================================================
# 3. COMPUTE ACCURACY
# ============================================================
# Share of rows where both label columns are equal. Both columns are
# predictions, so this is an agreement rate between VADER and the model
# rather than accuracy against ground truth.
accuracy = (df_ext['MODEL_SENTIMENT'] == df_ext['VADER_SENTIMENT']).mean()
print(f"Accuracy of VADER sentiment vs pre-trained model: {accuracy*100:.2f}%")

# ============================================================
# 4. OPTIONAL: SHOW MISMATCHES
# ============================================================
# Rows where the two methods disagree. Rows labelled "NEUTRAL" by the model
# step (blank text) always land here, because the VADER step in this cell
# only produces POSITIVE or NEGATIVE.
mismatches = df_ext[df_ext['MODEL_SENTIMENT'] != df_ext['VADER_SENTIMENT']]
print(f"\nNumber of mismatches: {len(mismatches)}")
print(mismatches[['URL_ID','URL','VADER_SENTIMENT','MODEL_SENTIMENT']])


In [None]:
from sklearn.metrics import classification_report
# First argument is treated as the reference labels (model output), second as
# the predictions (VADER): the report scores VADER against the model.
print(classification_report(df_ext['MODEL_SENTIMENT'], df_ext['VADER_SENTIMENT']))


In [None]:
# ============================================================
# INSTALL DEPENDENCIES (if not already installed)
# ============================================================
# !pip install transformers torch pandas nltk openpyxl tqdm

# ============================================================
# IMPORTS
# ============================================================
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Lexicon required by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')

# ============================================================
# LOAD EXTENDED DATA
# ============================================================
# Reload from disk so the columns added by earlier cells are discarded.
df_ext = pd.read_excel("Output_Extended.xlsx")

# ============================================================
# INITIALIZE SENTIMENT MODELS
# ============================================================
# Hugging Face pre-trained sentiment model
model_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# ============================================================
# 1. MODEL PREDICTIONS
# ============================================================
# One label per row of df_ext, in row order.
model_labels = []

for text in tqdm(df_ext['ARTICLE_TEXT'], desc="Analyzing Sentiment with Model"):
    # Non-string or blank cells get a placeholder label without calling the model.
    if not isinstance(text, str) or len(text.strip()) == 0:
        model_labels.append("NEUTRAL")
    else:
        # Only the first 512 characters of the article are classified.
        res = model_analyzer(text[:512])[0]  # limit for speed
        model_labels.append(res['label'].upper())

df_ext['MODEL_SENTIMENT'] = model_labels

# ============================================================
# 2. VADER STRONG SENTIMENT LABELS
# ============================================================
# Three-way VADER label per article: compound above 0.05 is POSITIVE, below
# -0.05 is NEGATIVE, and everything in between is NEUTRAL.
vader_labels = []

for article in df_ext['ARTICLE_TEXT']:
    # str() guards against non-string cells such as NaN.
    score = sia.polarity_scores(str(article))['compound']
    label = "POSITIVE" if score > 0.05 else ("NEGATIVE" if score < -0.05 else "NEUTRAL")
    vader_labels.append(label)

df_ext['VADER_SENTIMENT'] = vader_labels

# ============================================================
# 3. ACCURACY AGAINST MODEL
# ============================================================
# Share of rows where both label columns are equal. Both columns are
# predictions, so this is an agreement rate, not accuracy against ground truth.
# NOTE(review): the model labels seen in this notebook are POSITIVE/NEGATIVE
# (plus the NEUTRAL placeholder for blank text), so articles VADER rates
# NEUTRAL presumably always count as disagreements - confirm this is intended.
accuracy = (df_ext['MODEL_SENTIMENT'] == df_ext['VADER_SENTIMENT']).mean()
print(f"Accuracy of VADER (strong sentiment) vs pre-trained model: {accuracy*100:.2f}%")

# ============================================================
# 4. OPTIONAL: SHOW MISMATCHES
# ============================================================
# Rows where the two methods assign different labels.
mismatches = df_ext[df_ext['MODEL_SENTIMENT'] != df_ext['VADER_SENTIMENT']]
print(f"\nNumber of mismatches: {len(mismatches)}")
print(mismatches[['URL_ID','URL','VADER_SENTIMENT','MODEL_SENTIMENT']])

In [None]:
from sklearn.metrics import classification_report
# First argument is treated as the reference labels (model output), second as
# the predictions (VADER): the report scores VADER against the model.
print(classification_report(df_ext['MODEL_SENTIMENT'], df_ext['VADER_SENTIMENT']))

In [None]:
# ============================================================
# 1. GET MODEL PREDICTIONS AND VADER SCORES
# ============================================================
# Parallel per-article result lists; each one becomes a df_ext column below.
model_probs = []
model_labels = []
vader_compound = []
vader_labels = []

for article in tqdm(df_ext['ARTICLE_TEXT'], desc="Analyzing Sentiment"):
    has_text = isinstance(article, str) and len(article.strip()) != 0

    if not has_text:
        # Missing or blank article: neutral placeholders for both methods.
        ml_label, ml_prob = "NEUTRAL", 0.5
        lexicon_score, lexicon_label = 0, "NEUTRAL"
    else:
        # Transformer prediction on the first 512 characters of the article.
        prediction = model_analyzer(article[:512])[0]
        ml_label = prediction['label'].upper()
        confidence = prediction['score']
        # Store the score as the probability of POSITIVE: the reported score
        # belongs to the predicted label, so it is flipped for other labels.
        ml_prob = confidence if ml_label == 'POSITIVE' else 1-confidence

        # VADER compound score with the +/-0.05 neutral band.
        lexicon_score = sia.polarity_scores(article)['compound']
        lexicon_label = (
            "POSITIVE" if lexicon_score > 0.05
            else "NEGATIVE" if lexicon_score < -0.05
            else "NEUTRAL"
        )

    model_labels.append(ml_label)
    model_probs.append(ml_prob)
    vader_compound.append(lexicon_score)
    vader_labels.append(lexicon_label)

df_ext['MODEL_SENTIMENT'] = model_labels
df_ext['MODEL_PROB'] = model_probs
df_ext['VADER_SENTIMENT'] = vader_labels
df_ext['VADER_COMPOUND'] = vader_compound

# ============================================================
# 2. SCATTER PLOT: VADER compound vs MODEL probability
# ============================================================
#plt.figure(figsize=(10,6))
#sns.scatterplot(x='VADER_COMPOUND', y='MODEL_PROB', hue='MODEL_SENTIMENT', data=df_ext, alpha=0.7)
#plt.axvline(0.05, color='red', linestyle='--', label='VADER POS threshold')
#plt.axvline(-0.05, color='blue', linestyle='--', label='VADER NEG threshold')
#plt.xlabel("VADER Compound Score")
#plt.ylabel("ML Model Positive Probability")
#plt.title("VADER Compound Score vs ML Model Sentiment Probability")
#plt.legend()
#plt.show()

# ============================================================
# 3. MISMATCHES
# ============================================================
# Rows where VADER and the model disagree. .copy() makes this an independent
# frame, so adding a column below neither triggers SettingWithCopyWarning nor
# depends on whether pandas returned a view of df_ext.
mismatches = df_ext[df_ext['VADER_SENTIMENT'] != df_ext['MODEL_SENTIMENT']].copy()
print(f"Total mismatches: {len(mismatches)}")
display_cols = ['URL_ID','URL','VADER_SENTIMENT','VADER_COMPOUND','MODEL_SENTIMENT','MODEL_PROB']
print(mismatches[display_cols].head(10))  # first 10 mismatches, in table order

# ============================================================
# 4. TOP 5 MOST EXTREME MISMATCHES
# ============================================================
# Extreme mismatch = VADER strongly opposite to model.
# MODEL_PROB lies in [0, 1]; prob*2-1 rescales it to [-1, 1] so it is directly
# comparable with the VADER compound score before taking the absolute gap.
model_score = mismatches['MODEL_PROB']*2 - 1
mismatches['EXTREME_DIFF'] = (mismatches['VADER_COMPOUND'] - model_score).abs()
top_extreme = mismatches.sort_values('EXTREME_DIFF', ascending=False).head(5)
print("\nTop 5 Extreme Mismatches:")
print(top_extreme[display_cols])