<a href="https://colab.research.google.com/github/ASantra-star/Scrapping_Sentiment_Analysis/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas feedparser requests beautifulsoup4 nltk openpyxl wordcloud seaborn matplotlib

In [None]:
# Import here so this cell works on a fresh kernel: the main import cell runs
# only after this one.
import nltk

nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer data from 'punkt_tab'; fetching
# both keeps word_tokenize/sent_tokenize working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('vader_lexicon')

In [None]:

# ============================================================
# IMPORTS
# ============================================================
import pandas as pd
import feedparser
import requests
import nltk
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import time

# ============================================================
# NLTK SETUP
# ============================================================
# English stop-word set; clean_tokens() drops any token found in it.
stop_words = set(stopwords.words("english"))
# Shared VADER analyzer used by analyze_original() and analyze_extended().
sia = SentimentIntensityAnalyzer()

# ============================================================
# HELPER FUNCTIONS
# ============================================================
def extract_article_text(url):
    """Download an article page and return the text of its <p> elements.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The paragraph texts joined by single spaces and stripped of
        surrounding whitespace, or "" when the page cannot be fetched
        (network error, timeout, HTTP error status) or parsed.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx replies as failures so that error pages are not
        # analysed as if they were article text.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all("p"))
    except Exception:
        # Scraping is best-effort: any failure simply yields no text. Unlike a
        # bare `except:`, this still lets KeyboardInterrupt/SystemExit through,
        # so a long scrape can be interrupted.
        return ""
    return text.strip()

def clean_tokens(text):
    """Tokenize the lower-cased text and keep only alphabetic, non-stop-word tokens."""
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            # Drops punctuation, numbers and mixed tokens.
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def count_syllables(word):
    """Estimate the number of syllables in a word.

    Each run of consecutive vowels (a, e, i, o, u) counts as one syllable and
    a trailing "es" or "ed" removes one. The match is case-insensitive.

    Args:
        word: The word to inspect.

    Returns:
        0 for an empty string, otherwise the estimate, never less than 1.
    """
    if not word:
        # Nothing to count; the original indexing would raise IndexError here.
        return 0

    vowels = "aeiou"
    word = word.lower()  # so "Apple" and "apple" give the same answer

    count = 0
    prev_is_vowel = False
    for ch in word:
        is_vowel = ch in vowels
        if is_vowel and not prev_is_vowel:
            # A vowel that starts a new vowel run opens a new syllable.
            count += 1
        prev_is_vowel = is_vowel

    if word.endswith(("es", "ed")):
        count -= 1
    return max(count, 1)

def count_pronouns(text):
    """Count whole-word, case-insensitive occurrences of I, we, my, ours and us."""
    pronoun_pattern = re.compile(r'\b(I|we|my|ours|us)\b', re.I)
    return sum(1 for _ in pronoun_pattern.finditer(text))

def safe_analyze(func, *args):
    """Call a metric function and fall back to a zero row if it fails.

    Args:
        func: The metric function to run.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        The result of ``func(*args)``. If the call raises or returns None, a
        list of zeros is returned instead: 13 zeros when ``func`` is named
        "analyze_original", 24 zeros for any other callable. These lengths
        match the lists returned by analyze_original and analyze_extended.
    """
    try:
        result = func(*args)
    except Exception:
        # One bad article must not abort the whole run; KeyboardInterrupt and
        # SystemExit still propagate, unlike with a bare `except:`.
        result = None

    if result is not None:
        return result

    # getattr keeps the fallback working for callables without __name__
    # (e.g. functools.partial objects).
    if getattr(func, "__name__", "") == "analyze_original":
        return [0] * 13
    return [0] * 24

# ============================================================
# METRIC FUNCTIONS
# ============================================================
def analyze_original(text):
    """Compute the 13 base metrics for one article.

    Word-level figures are based on the tokens returned by clean_tokens(),
    i.e. after non-alphabetic tokens and stop words have been removed.

    Returns:
        A list in the order of ``original_cols`` after URL_ID and URL:
        positive, negative, compound, positive + negative, average sentence
        length, share of complex words, fog index, average sentence length
        (again), complex word count, word count, syllables per word,
        pronoun count and average word length.
    """
    sentence_total = len(sent_tokenize(text))
    tokens = clean_tokens(text)
    word_total = len(tokens)
    polarity = sia.polarity_scores(text)

    # A word is "complex" when it has more than two estimated syllables.
    syllables_per_token = [count_syllables(tok) for tok in tokens]
    complex_total = sum(1 for n in syllables_per_token if n > 2)

    if sentence_total:
        avg_sentence_len = word_total / sentence_total
    else:
        avg_sentence_len = 0

    # Every per-word ratio collapses to 0 when no tokens survived cleaning.
    if word_total:
        pct_complex = complex_total / word_total
        syllables_per_word = sum(syllables_per_token) / word_total
        avg_word_len = sum(len(tok) for tok in tokens) / word_total
    else:
        pct_complex = 0
        syllables_per_word = 0
        avg_word_len = 0

    metrics = [
        polarity['pos'],
        polarity['neg'],
        polarity['compound'],
        polarity['pos'] + polarity['neg'],
        avg_sentence_len,
        pct_complex,
        0.4*(avg_sentence_len + pct_complex),
        avg_sentence_len,
        complex_total,
        word_total,
        syllables_per_word,
        count_pronouns(text),
        avg_word_len,
    ]
    return metrics

def analyze_extended(text, url):
    """Compute the 24 extended metrics for one article.

    Word-level figures are based on the tokens returned by clean_tokens().
    Paragraphs are the non-blank pieces of ``text`` split on newlines.

    Args:
        text: Article text.
        url: Article address; only its network location is reported.

    Returns:
        A list in the order of ``extended_cols`` after URL_ID, URL and
        ARTICLE_TEXT, starting with the domain and ending with the length
        category ("Short", "Medium" or "Long").
    """
    sentence_total = len(sent_tokenize(text))
    paragraph_total = sum(1 for chunk in text.split("\n") if chunk.strip())
    tokens = clean_tokens(text)
    word_total = len(tokens)

    polarity = sia.polarity_scores(text)
    distinct_total = len(set(tokens))

    # One syllable estimate per token serves both the complex-word count
    # (more than two syllables) and the syllable total.
    syllables_per_token = [count_syllables(tok) for tok in tokens]
    complex_total = sum(1 for n in syllables_per_token if n > 2)
    syllables = sum(syllables_per_token)

    avg_sentence_len = word_total / sentence_total if sentence_total else 0

    # Every per-word figure collapses to 0 when no tokens survived cleaning.
    if word_total:
        pct_complex = complex_total / word_total
        flesch = 206.835 - 1.015*avg_sentence_len - 84.6*(syllables/word_total)
        type_token_ratio = distinct_total / word_total
        avg_word_len = sum(len(tok) for tok in tokens) / word_total
    else:
        pct_complex = 0
        flesch = 0
        type_token_ratio = 0
        avg_word_len = 0

    if word_total < 500:
        length_category = "Short"
    elif word_total <= 1200:
        length_category = "Medium"
    else:
        length_category = "Long"

    emotionality = polarity['pos'] + polarity['neg']

    return [
        urlparse(url).netloc,
        polarity['pos'],
        polarity['neg'],
        polarity['neu'],
        polarity['compound'],
        abs(polarity['compound']),
        emotionality,
        # The tiny constant avoids division by zero for empty token lists.
        emotionality/(word_total+1e-6),
        word_total,
        distinct_total,
        type_token_ratio,
        sentence_total,
        avg_sentence_len,
        paragraph_total,
        complex_total,
        pct_complex,
        0.4*(avg_sentence_len + pct_complex),
        flesch,
        # Reading time in minutes at 225 words per minute.
        word_total/225,
        count_pronouns(text),
        text.count("?"),
        text.count("!"),
        avg_word_len,
        length_category,
    ]

# ============================================================
# COLUMN DEFINITIONS
# ============================================================
# Header of Output.xlsx / Output.csv: two identifier columns followed by the
# 13 values returned by analyze_original(), in the same order.
original_cols = [
    "URL_ID",
    "URL",
    "POSITIVE SCORE",
    "NEGATIVE SCORE",
    "POLARITY SCORE",
    "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH",
    "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT",
    "WORD COUNT",
    "SYLLABLE PER WORD",
    "PERSONAL PRONOUNS",
    "AVG WORD LENGTH",
]

# Header of Output_Extended.xlsx / .csv: identifiers and the article text
# followed by the 24 values returned by analyze_extended(), in the same order.
extended_cols = [
    "URL_ID",
    "URL",
    "ARTICLE_TEXT",
    "DOMAIN",
    "POSITIVE SCORE",
    "NEGATIVE SCORE",
    "NEUTRAL SCORE",
    "POLARITY SCORE",
    "SENTIMENT INTENSITY",
    "EMOTIONALITY SCORE",
    "SUBJECTIVITY SCORE",
    "WORD COUNT",
    "UNIQUE WORD COUNT",
    "TYPE TOKEN RATIO",
    "SENTENCE COUNT",
    "AVG SENTENCE LENGTH",
    "PARAGRAPH COUNT",
    "COMPLEX WORD COUNT",
    "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "FLESCH READING EASE",
    "READING TIME (MIN)",
    "PERSONAL PRONOUNS",
    "QUESTION COUNT",
    "EXCLAMATION COUNT",
    "AVG WORD LENGTH",
    "ARTICLE LENGTH CATEGORY",
]

# ============================================================
# RSS FEEDS (Expanded for 200+ articles)
# ============================================================
# Feeds are read in this order until MAX_ARTICLES usable articles are found.
# Each feed is listed once; a repeated entry only caused a redundant download.
RSS_FEEDS = [
    "http://feeds.bbci.co.uk/news/technology/rss.xml",
    "https://www.theverge.com/rss/index.xml",
    "https://www.reuters.com/rssFeed/technologyNews",
    "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    "https://www.wired.com/feed/rss",
    "http://feeds.arstechnica.com/arstechnica/index",
    "http://feeds.feedburner.com/TechCrunch/",
    "https://www.cnet.com/rss/news/",
    "https://www.engadget.com/rss.xml",
    "https://www.technologyreview.com/feed/",
]

MAX_ARTICLES = 200
# urls[i] and texts[i] always describe the same article.
urls, texts = [], []
# Every link already attempted, so that no page is downloaded twice and the
# duplicate check stays O(1).
seen_urls = set()

for feed_url in RSS_FEEDS:
    if len(urls) >= MAX_ARTICLES:
        break
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        # Some feed entries carry no link; skip them instead of raising.
        url = getattr(entry, "link", None)
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)
        text = extract_article_text(url)
        # Keep only pages that yielded more than 50 whitespace-separated words.
        if len(text.split()) > 50:
            urls.append(url)
            texts.append(text)
        if len(urls) >= MAX_ARTICLES:
            break

print(f"Fetched {len(urls)} valid articles with text >50 words")

# ============================================================
# SAVE INPUT.XLSX
# ============================================================
# One row per fetched article: a sequential AUTO_001-style identifier (numbered
# from 1) next to its URL, written to Input.xlsx without the index column.
df_input = pd.DataFrame(
    {
        "URL_ID": [f"AUTO_{position:03d}" for position, _ in enumerate(urls, start=1)],
        "URL": urls,
    }
)
df_input.to_excel("Input.xlsx", index=False)

# ============================================================
# GENERATE METRICS
# ============================================================
# Build one base row and one extended row per article. Everything in this loop
# is computed locally, so no delay between iterations is needed.
orig_rows, ext_rows = [], []

for i, (url, text) in enumerate(zip(urls, texts)):
    # Same identifier scheme as Input.xlsx: AUTO_001, AUTO_002, ...
    url_id = f"AUTO_{i+1:03d}"
    # safe_analyze substitutes a zero row if a metric function fails.
    orig_metrics = safe_analyze(analyze_original, text)
    ext_metrics = safe_analyze(analyze_extended, text, url)
    orig_rows.append([url_id, url] + orig_metrics)
    ext_rows.append([url_id, url, text] + ext_metrics)

df_output = pd.DataFrame(orig_rows, columns=original_cols)
df_extended = pd.DataFrame(ext_rows, columns=extended_cols)

# ============================================================
# SAVE OUTPUT FILES
# ============================================================
# Write both result tables as Excel workbooks first, then as CSV files,
# always without the index column.
for frame, out_path in ((df_output, "Output.xlsx"), (df_extended, "Output_Extended.xlsx")):
    frame.to_excel(out_path, index=False)
for frame, out_path in ((df_output, "Output.csv"), (df_extended, "Output_Extended.csv")):
    frame.to_csv(out_path, index=False)

# ============================================================
# 1. GENERATE WORD CLOUD
# ============================================================
# Join every article into one space-separated string. It is the input of the
# word cloud below, whose generation and plotting are currently commented out.
all_text = " ".join(df_extended['ARTICLE_TEXT'])
#wordcloud = WordCloud(width=1200, height=600, background_color='white', collocations=False).generate(all_text)

#plt.figure(figsize=(15,7))
#plt.imshow(wordcloud, interpolation='bilinear')
#plt.axis('off')
#plt.title("Word Cloud of All Articles", fontsize=20)
#plt.show()

# ============================================================
# 2. MOST COMPLEX ARTICLES
# ============================================================
# Complexity score = complex word count x fog index; list the ten highest.
df_extended['COMPLEXITY_SCORE'] = df_extended['COMPLEX WORD COUNT'].mul(df_extended['FOG INDEX'])
ranked_by_complexity = df_extended.sort_values(by='COMPLEXITY_SCORE', ascending=False)
top_complex = ranked_by_complexity.head(10)

report_columns = ['URL_ID','URL','COMPLEX WORD COUNT','FOG INDEX','COMPLEXITY_SCORE']
print("Top 10 Most Complex Articles:")
print(top_complex[report_columns])

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# Reload the extended metrics (including the article text) from disk.
df_ext = pd.read_excel("Output_Extended.xlsx")

sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# --- Label every article with the pre-trained model -------------------------
pred_labels = []
for text in tqdm(df_ext['ARTICLE_TEXT'], desc="Analyzing Sentiment with Model"):
    has_text = isinstance(text, str) and len(text.strip()) != 0
    if has_text:
        # Only the first 512 characters (a string slice, not tokens) are
        # scored, which keeps the run fast.
        top_prediction = sentiment_model(text[:512])[0]
        pred_labels.append(top_prediction['label'].upper())
    else:
        # Missing or blank text gets a placeholder label.
        pred_labels.append("NEUTRAL")

df_ext['MODEL_SENTIMENT'] = pred_labels

# --- Label every article with VADER (two classes, split at compound 0) ------
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

vader_labels = [
    "POSITIVE" if sia.polarity_scores(str(text))['compound'] >= 0 else "NEGATIVE"
    for text in df_ext['ARTICLE_TEXT']
]
df_ext['VADER_SENTIMENT'] = vader_labels

# --- Agreement between the two labelings ------------------------------------
accuracy = df_ext['MODEL_SENTIMENT'].eq(df_ext['VADER_SENTIMENT']).mean()
print(f"Accuracy of VADER sentiment vs pre-trained model: {accuracy*100:.2f}%")

mismatches = df_ext[df_ext['MODEL_SENTIMENT'].ne(df_ext['VADER_SENTIMENT'])]
print(f"\nNumber of mismatches: {len(mismatches)}")
print(mismatches[['URL_ID','URL','VADER_SENTIMENT','MODEL_SENTIMENT']])


In [None]:
from sklearn.metrics import classification_report

# The model labels serve as the reference and the VADER labels as predictions.
report_text = classification_report(
    y_true=df_ext['MODEL_SENTIMENT'],
    y_pred=df_ext['VADER_SENTIMENT'],
)
print(report_text)


In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')
df_ext = pd.read_excel("Output_Extended.xlsx")

# Pre-trained Hugging Face classifier and the VADER analyzer being compared.
model_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
sia = SentimentIntensityAnalyzer()

# --- Model labels ------------------------------------------------------------
model_labels = []
for text in tqdm(df_ext['ARTICLE_TEXT'], desc="Analyzing Sentiment with Model"):
    has_text = isinstance(text, str) and len(text.strip()) != 0
    if has_text:
        # Only the first 512 characters are scored, for speed.
        top_prediction = model_analyzer(text[:512])[0]
        model_labels.append(top_prediction['label'].upper())
    else:
        model_labels.append("NEUTRAL")

df_ext['MODEL_SENTIMENT'] = model_labels

# --- VADER labels: compound above 0.05 is positive, below -0.05 negative,
# anything in between neutral ------------------------------------------------
vader_labels = []
for text in df_ext['ARTICLE_TEXT']:
    compound = sia.polarity_scores(str(text))['compound']
    label = "POSITIVE" if compound > 0.05 else "NEGATIVE" if compound < -0.05 else "NEUTRAL"
    vader_labels.append(label)

df_ext['VADER_SENTIMENT'] = vader_labels

# --- Agreement between the two labelings ------------------------------------
accuracy = df_ext['MODEL_SENTIMENT'].eq(df_ext['VADER_SENTIMENT']).mean()
print(f"Accuracy of VADER (strong sentiment) vs pre-trained model: {accuracy*100:.2f}%")

mismatches = df_ext[df_ext['MODEL_SENTIMENT'].ne(df_ext['VADER_SENTIMENT'])]
print(f"\nNumber of mismatches: {len(mismatches)}")
print(mismatches[['URL_ID','URL','VADER_SENTIMENT','MODEL_SENTIMENT']])

In [None]:
from sklearn.metrics import classification_report

# The model labels serve as the reference and the VADER labels as predictions.
report_text = classification_report(
    y_true=df_ext['MODEL_SENTIMENT'],
    y_pred=df_ext['VADER_SENTIMENT'],
)
print(report_text)

In [None]:

# Per-article results, collected in parallel lists (one entry per row of df_ext).
model_probs, model_labels, vader_compound, vader_labels = [], [], [], []

for text in tqdm(df_ext['ARTICLE_TEXT'], desc="Analyzing Sentiment"):
    if not isinstance(text, str) or not text.strip():
        # Missing or blank text: record neutral placeholders for both methods.
        model_probs.append(0.5)
        model_labels.append("NEUTRAL")
        vader_compound.append(0)
        vader_labels.append("NEUTRAL")
        continue

    # Hugging Face model prediction on the first 512 characters of the text.
    res = model_analyzer(text[:512])[0]
    label = res['label'].upper()
    model_labels.append(label)
    # Store the probability of the POSITIVE class: the reported score belongs
    # to the predicted label, so it is flipped for any other label.
    model_probs.append(res['score'] if label == 'POSITIVE' else 1-res['score'])

    # VADER: compound above 0.05 is positive, below -0.05 negative, else neutral.
    s = sia.polarity_scores(text)
    compound = s['compound']
    vader_compound.append(compound)
    if compound > 0.05:
        vader_labels.append("POSITIVE")
    elif compound < -0.05:
        vader_labels.append("NEGATIVE")
    else:
        vader_labels.append("NEUTRAL")

df_ext['MODEL_SENTIMENT'] = model_labels
df_ext['MODEL_PROB'] = model_probs
df_ext['VADER_SENTIMENT'] = vader_labels
df_ext['VADER_COMPOUND'] = vader_compound


#plt.figure(figsize=(10,6))
#sns.scatterplot(x='VADER_COMPOUND', y='MODEL_PROB', hue='MODEL_SENTIMENT', data=df_ext, alpha=0.7)
#plt.axvline(0.05, color='red', linestyle='--', label='VADER POS threshold')
#plt.axvline(-0.05, color='blue', linestyle='--', label='VADER NEG threshold')
#plt.xlabel("VADER Compound Score")
#plt.ylabel("ML Model Positive Probability")
#plt.title("VADER Compound Score vs ML Model Sentiment Probability")
#plt.legend()
#plt.show()


# Take an explicit copy: a column is added below, and assigning into a
# filtered slice of df_ext would trigger pandas' SettingWithCopyWarning.
mismatches = df_ext[df_ext['VADER_SENTIMENT'] != df_ext['MODEL_SENTIMENT']].copy()
print(f"Total mismatches: {len(mismatches)}")
display_cols = ['URL_ID','URL','VADER_SENTIMENT','VADER_COMPOUND','MODEL_SENTIMENT','MODEL_PROB']
print(mismatches[display_cols].head(10))  # first 10 mismatches, in file order


# Extreme mismatch = VADER strongly opposite to model. The model probability
# (0..1) is rescaled to -1..1 so it is comparable with the compound score.
mismatches['EXTREME_DIFF'] = (mismatches['VADER_COMPOUND'] - (mismatches['MODEL_PROB']*2-1)).abs()
top_extreme = mismatches.sort_values('EXTREME_DIFF', ascending=False).head(5)
print("\nTop 5 Extreme Mismatches:")
print(top_extreme[display_cols])