In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
BASE_URL = 'https://edition.cnn.com/'

In [3]:

# Function to scrape news articles and categories from CNN
def scrape_categories(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    categories = {}
    
    # Example: Extract article text and category
    for article in soup.find_all('a', class_='subnav__section-link'):

        # Getting Category and its Page url
        text = article.get_text().strip()
        url = article.get("href")
        
        categories[text] = url
        
    
    return categories

categories_urls = scrape_categories(BASE_URL)

categories_urls

{'World': 'https://edition.cnn.com/world',
 'US Politics': 'https://edition.cnn.com/politics',
 'Business': 'https://edition.cnn.com/business',
 'Markets': 'https://edition.cnn.com/markets',
 'Health': 'https://edition.cnn.com/health',
 'Entertainment': 'https://edition.cnn.com/entertainment',
 'Tech': 'https://edition.cnn.com/business/tech',
 'Style': 'https://edition.cnn.com/style',
 'Travel': 'https://edition.cnn.com/travel',
 'Sports': 'https://edition.cnn.com/sport',
 'Videos': 'https://edition.cnn.com/videos',
 'Features': 'https://edition.cnn.com/specials',
 'Weather': 'https://edition.cnn.com/weather',
 'More': 'https://edition.cnn.com/more'}

In [4]:
try:
  del categories_urls["Videos"]
  del categories_urls["Features"]
  del categories_urls["More"]
except Exception as e:
  pass

categories_urls

{'World': 'https://edition.cnn.com/world',
 'US Politics': 'https://edition.cnn.com/politics',
 'Business': 'https://edition.cnn.com/business',
 'Markets': 'https://edition.cnn.com/markets',
 'Health': 'https://edition.cnn.com/health',
 'Entertainment': 'https://edition.cnn.com/entertainment',
 'Tech': 'https://edition.cnn.com/business/tech',
 'Style': 'https://edition.cnn.com/style',
 'Travel': 'https://edition.cnn.com/travel',
 'Sports': 'https://edition.cnn.com/sport',
 'Weather': 'https://edition.cnn.com/weather'}

In [7]:

def extract_articles(categories_urls):
    headlines = []
    article_text = []
    categories = []

    for category, url in categories_urls.items():

        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        articles = soup.find_all('a', {"data-link-type": "article"})
        
        for article in articles:
            try:
                article_url = BASE_URL + article.get("href").strip("/")

                try:
                    article_page = requests.get(article_url)
                    crawler = BeautifulSoup(article_page.text, 'html.parser')

                    headline = crawler.find('h1', class_='headline__text').get_text().strip()

                    text_container = crawler.find('div', class_='article__content')
                    paragraphs = text_container.find_all("p", class_='paragraph')

                    content = ""
                    for para in paragraphs:
                        content += " "+ para.get_text().strip()

                    if headline:
                        headlines.append(headline)
                    else:
                        headlines.append("")
                    if content:
                        article_text.append(content)
                    else:
                        article_text.append("")

                    categories.append(category)

                except Exception as e:
                    continue
            except Exception as e:
                continue
    
    return headlines, article_text, categories

headlines, article_text, categories = extract_articles(categories_urls)
data = pd.DataFrame({'Headline': headlines, "Content": article_text, 'Category': categories})




In [8]:
data.shape

(871, 3)

In [9]:
data.sample(10)

Unnamed: 0,Headline,Content,Category
585,Han Kwang Song: North Korean striker scores in...,After more than three years of going missing ...,Sports
244,After more reports of illnesses from recalled ...,After more reports of illnesses potentially l...,Health
705,The driest place in North America has sprung t...,California’s Death Valley is the hottest plac...,Weather
673,2023 Atlantic Hurricane Season Fast Facts,Here is a look at the 2023 Atlantic hurricane...,Weather
701,Hurricane Otis’ explosive intensification is a...,The rapid intensification Hurricane Otis unde...,Weather
82,For hundreds of migrant children living in she...,Estefanía Rebellón knows the trauma that stem...,World
262,The most powerful rocket ever built just went ...,"SpaceX’s gargantuan deep-space rocket system,...",Health
502,First-ever World of Frozen opens at Hong Kong ...,"Elsa, Anna, Olaf and Sven have a new home on ...",Travel
604,"Philadelphia Eagles win Super Bowl rematch, de...",The Philadelphia Eagles avenged their Super B...,Sports
632,Six bull sharks inadvertently made their home ...,"For golfers, staying out of the water could b...",Sports


In [10]:
data.isnull().sum()

Headline    0
Content     0
Category    0
dtype: int64

In [11]:
data.duplicated().sum()

361

In [12]:
data = data.drop_duplicates()
data.duplicated().sum()

0

In [58]:
data.shape

(510, 3)

In [59]:
data["Category"].value_counts()

Weather          104
World             67
Style             58
Business          50
US Politics       49
Travel            44
Sports            44
Entertainment     40
Health            31
Tech              16
Markets            7
Name: Category, dtype: int64

In [60]:
# Save the scraped data
data.to_csv('cnn_data_with_content.csv', index=False)

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [62]:
data.sample(10)

Unnamed: 0,Headline,Content,Category
372,US government’s proposal to boost EV sales is ...,The US government is planning to change auto ...,Tech
449,A new docuseries follows Juul’s catastrophic c...,"As Juul’s popularity began to surge in 2016, ...",Style
441,"Mushrooms, snails and plant roots: The surpris...","Madder, cochineal, Mauveine; these words may ...",Style
45,Civilians caught in the crossfire as fighting ...,Renewed hostilities between the Myanmar milit...,World
190,Why aren’t more women leading US companies? Am...,"Women have made huge strides in business, asc...",Business
427,Look of the week: Emily Ratajkowski and the ul...,"In the not too distant past, the silhouette t...",Style
789,Hurricane Lee strengthens to Category 4 storm ...,Hurricane Lee has strengthened into a major C...,Weather
101,"Takeaways from the Biden-Xi summit, where low ...","It happened, therefore it was a success. Pres...",US Politics
78,Meet the people who are making the world a bet...,The ideas can be simple: converting a vehicle...,World
107,Fact checking the 3rd GOP debate of the 2024 e...,Five of the Republican presidential candidate...,US Politics


In [63]:
dataset = data.iloc[:, 1:]
dataset

Unnamed: 0,Content,Category
0,Israel and Hamas have reached a deal for a fo...,World
2,Dutch voters cast their ballots on Wednesday ...,World
3,A highly venomous snake is on the loose after...,World
4,"Nearly seven weeks into the war, Israel and H...",World
5,Israel and Hamas reached an agreement early W...,World
...,...,...
861,Life-threatening flash flooding forced evacua...,Weather
863,One of the most exceptional heat streaks in U...,Weather
865,Substantial fires can make their own weather ...,Weather
867,An out-of-control blaze burning in northern W...,Weather


In [64]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

In [65]:
train_data.shape, test_data.shape

((408, 2), (102, 2))

In [66]:
# Text vectorization using Bag-of-Words
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['Content'])
X_test = vectorizer.transform(test_data['Content'])


In [67]:
# Training multiple classifiers
# classifiers = {
#     'Naive Bayes': MultinomialNB(),
# #     'Logistic Regression': LogisticRegression(max_iter=1000),
# #     'Random Forest': RandomForestClassifier(),
# #     "SVM": LinearSVC(max_iter=1000)
# }

In [68]:
# results = {}

# for clf_name, classifier in classifiers.items():
#     # Training the classifier
#     classifier.fit(X_train, train_data['Category'])

#     # Make predictions on the test set
#     predictions = classifier.predict(X_test)

#     # Evaluate the model
#     accuracy = accuracy_score(test_data['Category'], predictions)
#     classification_rep = classification_report(test_data['Category'], predictions, zero_division=1)

#     # Save the results
#     results[clf_name] = {
#         'Accuracy': accuracy,
#         'Classification Report': classification_rep
#     }

In [69]:
# # Display the results
# for clf_name, metrics in results.items():
#     print(f'\nResults for {clf_name}:')
#     print(f'Accuracy: {metrics["Accuracy"]:.2f}')
#     print('Classification Report:')
#     print(metrics['Classification Report'])


Results for Naive Bayes:
Accuracy: 0.64
Classification Report:
               precision    recall  f1-score   support

     Business       0.27      0.43      0.33         7
Entertainment       1.00      0.17      0.29         6
       Health       1.00      0.67      0.80         6
      Markets       1.00      0.00      0.00         2
       Sports       0.75      1.00      0.86         6
        Style       0.50      0.38      0.43         8
         Tech       1.00      0.00      0.00         5
       Travel       0.38      0.50      0.43        10
  US Politics       1.00      0.67      0.80        15
      Weather       0.86      1.00      0.93        25
        World       0.40      0.67      0.50        12

     accuracy                           0.64       102
    macro avg       0.74      0.50      0.49       102
 weighted avg       0.73      0.64      0.62       102



In [71]:
# Train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, train_data["Category"])


In [74]:

# Make predictions on the test set
predictions = nb_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(test_data['Category'], predictions)
precision = precision_score(test_data['Category'], predictions, average='micro') 
recall = recall_score(test_data['Category'], predictions, average='micro')
f1 = f1_score(test_data['Category'], predictions, average='micro') 

classification_rep = classification_report(test_data['Category'], predictions)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
# Display the results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print('Classification Report:')
print(classification_rep)

Accuracy: 0.64
Precision: 0.64
Recall: 0.64
F1 Score: 0.64
Classification Report:
               precision    recall  f1-score   support

     Business       0.27      0.43      0.33         7
Entertainment       1.00      0.17      0.29         6
       Health       1.00      0.67      0.80         6
      Markets       0.00      0.00      0.00         2
       Sports       0.75      1.00      0.86         6
        Style       0.50      0.38      0.43         8
         Tech       0.00      0.00      0.00         5
       Travel       0.38      0.50      0.43        10
  US Politics       1.00      0.67      0.80        15
      Weather       0.86      1.00      0.93        25
        World       0.40      0.67      0.50        12

     accuracy                           0.64       102
    macro avg       0.56      0.50      0.49       102
 weighted avg       0.66      0.64      0.62       102



In [78]:
# Save evaluation results to a CSV file
evaluation_results = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Value': [accuracy, precision, recall, f1]
})

evaluation_results.to_csv('evaluation_results.csv', index=False)
evaluation_results

Unnamed: 0,Metric,Value
0,Accuracy,0.637255
1,Precision,0.637255
2,Recall,0.637255
3,F1 Score,0.637255
