<a href="https://colab.research.google.com/github/ChaithanyaSaiB/UMBC-DATA606-Capstone/blob/main/notebooks/Dataset_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing necessary libraries
import pandas as pd
import re
import os
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Import the tqdm library for progress bar
from tqdm import tqdm

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load CNN health data
cnnhealth = pd.read_csv("/content/cnnhealth.txt", sep=r"(?<!\s)[|](?!\s)", names=['ID', 'DateTime', 'Article Title and Link'], engine="python")
cnnhealth.head()

Unnamed: 0,ID,DateTime,Article Title and Link
0,576880531301801984,Sat Mar 14 23:00:11 +0000 2015,An abundance of online info can turn us into e...
1,576820122666471424,Sat Mar 14 19:00:08 +0000 2015,A plant-based diet that incorporates fish may ...
2,576744652717461504,Sat Mar 14 14:00:15 +0000 2015,It doesn't take much to damage your hearing at...
3,576736754436304896,Sat Mar 14 13:28:52 +0000 2015,RT @CNN: Forever young? Discover this island’s...
4,576736614766010368,Sat Mar 14 13:28:18 +0000 2015,RT @CNN: Is post-traumatic stress disorder in ...


In [3]:
# Define the directory path containing the .txt files
directory = '/content'

# Initialize an empty list to store DataFrames
dfs = []

# Iterate over all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        # Read the .txt file into a DataFrame
        df_name = os.path.splitext(filename)[0]  # Extract filename without extension
        try:
            df = pd.read_csv(os.path.join(directory, filename),
                             sep=r"(?<!\s)[|](?!\s)",
                             names=['ID', 'DateTime', 'Article Title and Link'],
                             engine="python")
            # Add the DataFrame to the list
            dfs.append(df)
            print(f"DataFrame '{df_name}' loaded with {len(df)} rows.")
        except UnicodeDecodeError:
            print(f"Error reading {filename}: UnicodeDecodeError. Skipping this file.")

# Concatenate all DataFrames in the list into one DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Display the combined DataFrame
print("Combined DataFrame:")
print(combined_df)

DataFrame 'usnewshealth' loaded with 1400 rows.
DataFrame 'nprhealth' loaded with 4837 rows.
Error reading foxnewshealth.txt: UnicodeDecodeError. Skipping this file.
DataFrame 'latimeshealth' loaded with 4171 rows.
DataFrame 'cnnhealth' loaded with 4061 rows.
Error reading msnhealthnews.txt: UnicodeDecodeError. Skipping this file.
DataFrame 'nytimeshealth' loaded with 6245 rows.
DataFrame 'goodhealth' loaded with 7864 rows.
DataFrame 'reuters_health' loaded with 4719 rows.
Error reading NBChealth.txt: UnicodeDecodeError. Skipping this file.
Error reading KaiserHealthNews.txt: UnicodeDecodeError. Skipping this file.
Error reading wsjhealth.txt: UnicodeDecodeError. Skipping this file.
DataFrame 'cbchealth' loaded with 3741 rows.
DataFrame 'everydayhealth' loaded with 3239 rows.
DataFrame 'gdnhealthcare' loaded with 2997 rows.
DataFrame 'bbchealth' loaded with 3929 rows.
Combined DataFrame:
                       ID                        DateTime  \
0      586278450392133633  Thu Apr 09 

In [4]:
# Extract all URLs from article titles and links
cnnhealth['URL'] = [re.findall(r'https?://\S+', text) for text in cnnhealth['Article Title and Link']]

# Drop rows with multiple or no URLs
drop_indices = cnnhealth['Article Title and Link'][cnnhealth['URL'].apply(len) != 1].index
cnnhealth.drop(drop_indices, inplace=True)

# Extract CNN URL for articles
cnnhealth['URL'] = [url[0] for url in cnnhealth['URL']]

In [5]:
# Function to fetch content from a URL
def fetch_content(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except Exception as e:
        print("Error fetching content:", e)
        return None

# Function to extract body content from HTML using BeautifulSoup
def extract_body(html_content):
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')
        article_body = soup.findAll('p', class_='paragraph')
        if article_body:
            return article_body
        else:
            return None
    else:
        return None

# Apply web scraping to extract content and store it in a new column
def extract_content_with_progress(url_list):
    content_list = []
    for url in tqdm(url_list, desc='Extracting content'):
        content = extract_body(fetch_content(url))
        content_list.append(content)
    return content_list

# Apply web scraping to extract content and store it in a new column
cnnhealth['Content'] = extract_content_with_progress(cnnhealth['URL'])

  k = self.parse_starttag(i)
Extracting content:   3%|▎         | 113/3299 [01:15<27:07,  1.96it/s]

Error fetching content: Failed to parse: https://t…


Extracting content:   6%|▌         | 200/3299 [02:04<41:49,  1.23it/s]

Error fetching content: Failed to parse: http://t.c…


Extracting content:   7%|▋         | 246/3299 [02:29<24:43,  2.06it/s]

Error fetching content: Failed to parse: http://…


Extracting content:   8%|▊         | 254/3299 [02:35<39:31,  1.28it/s]

Error fetching content: Failed to parse: http://t.co…


Extracting content:  11%|█         | 362/3299 [03:43<2:42:08,  3.31s/it]

Error fetching content: HTTPSConnectionPool(host='flipboard.com', port=443): Read timed out. (read timeout=10)


Extracting content:  11%|█         | 363/3299 [03:53<4:23:01,  5.38s/it]

Error fetching content: HTTPSConnectionPool(host='flipboard.com', port=443): Read timed out. (read timeout=10)


Extracting content:  11%|█         | 365/3299 [04:04<4:44:46,  5.82s/it]

Error fetching content: HTTPSConnectionPool(host='flipboard.com', port=443): Read timed out. (read timeout=10)


Extracting content:  12%|█▏        | 386/3299 [04:18<57:15,  1.18s/it]  

Error fetching content: Failed to parse: http://…


Extracting content:  12%|█▏        | 395/3299 [04:31<2:38:49,  3.28s/it]

Error fetching content: HTTPSConnectionPool(host='flipboard.com', port=443): Read timed out. (read timeout=10)


Extracting content:  12%|█▏        | 406/3299 [04:46<2:51:27,  3.56s/it]

Error fetching content: HTTPSConnectionPool(host='flipboard.com', port=443): Read timed out. (read timeout=10)


Extracting content:  17%|█▋        | 549/3299 [05:59<27:12,  1.68it/s]

Error fetching content: Failed to parse: http://t.co…


Extracting content:  19%|█▉        | 634/3299 [06:50<2:30:45,  3.39s/it]

Error fetching content: HTTPConnectionPool(host='upwave.co', port=80): Max retries exceeded with url: /1kTcVP2 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45757fb820>, 'Connection to upwave.co timed out. (connect timeout=10)'))


Extracting content:  21%|██        | 683/3299 [07:08<30:36,  1.42it/s]

Error fetching content: Failed to parse: http://t.co…
Error fetching content: Failed to parse: http://…


Extracting content:  22%|██▏       | 721/3299 [07:34<2:26:31,  3.41s/it]

Error fetching content: HTTPConnectionPool(host='www.kansascity.com', port=80): Read timed out. (read timeout=10)


Extracting content:  23%|██▎       | 747/3299 [07:45<22:26,  1.90it/s]

Error fetching content: Failed to parse: http://t…


Extracting content:  23%|██▎       | 751/3299 [07:56<2:03:53,  2.92s/it]

Error fetching content: HTTPConnectionPool(host='upwave.co', port=80): Max retries exceeded with url: /1duFGv0 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b455a8766b0>, 'Connection to upwave.co timed out. (connect timeout=10)'))


Extracting content:  24%|██▎       | 782/3299 [08:09<20:42,  2.03it/s]

Error fetching content: Failed to parse: http://…


Extracting content:  24%|██▍       | 791/3299 [08:12<12:19,  3.39it/s]

Error fetching content: Failed to parse: http://t…


Extracting content:  24%|██▍       | 798/3299 [08:15<16:20,  2.55it/s]

Error fetching content: Failed to parse: http://t.co…


Extracting content:  27%|██▋       | 907/3299 [09:06<07:47,  5.12it/s]

Error fetching content: Failed to parse: http://t.c…


Extracting content:  29%|██▉       | 956/3299 [09:28<13:19,  2.93it/s]

Error fetching content: HTTPConnectionPool(host='shpe.co', port=80): Max retries exceeded with url: /1eAdojE (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7b4544e66f20>: Failed to resolve 'shpe.co' ([Errno -2] Name or service not known)"))


Extracting content:  29%|██▉       | 960/3299 [09:30<18:14,  2.14it/s]

Error fetching content: Failed to parse: http://t.…


Extracting content:  30%|██▉       | 984/3299 [09:49<29:09,  1.32it/s]

Error fetching content: Failed to parse: http://t…


Extracting content:  30%|███       | 994/3299 [09:53<19:04,  2.01it/s]

Error fetching content: Failed to parse: http://t.co…


Extracting content:  32%|███▏      | 1055/3299 [10:19<16:16,  2.30it/s]

Error fetching content: HTTPConnectionPool(host='healthysummer.me', port=80): Max retries exceeded with url: /2013/12/19/yoga-forget-what-you-think-you-know/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7b453b031db0>: Failed to resolve 'healthysummer.me' ([Errno -2] Name or service not known)"))


Extracting content:  32%|███▏      | 1058/3299 [10:19<09:24,  3.97it/s]

Error fetching content: HTTPConnectionPool(host='shpe.co', port=80): Max retries exceeded with url: /1c1TMq4 (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7b453b0325f0>: Failed to resolve 'shpe.co' ([Errno -2] Name or service not known)"))


Extracting content:  32%|███▏      | 1067/3299 [10:33<2:03:09,  3.31s/it]

Error fetching content: HTTPConnectionPool(host='upwave.co', port=80): Max retries exceeded with url: /1cArLCl (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b453a2b6590>, 'Connection to upwave.co timed out. (connect timeout=10)'))


Extracting content:  36%|███▌      | 1175/3299 [11:24<1:55:21,  3.26s/it]

Error fetching content: HTTPConnectionPool(host='upwave.co', port=80): Max retries exceeded with url: /17J2Wb9 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4530196e60>, 'Connection to upwave.co timed out. (connect timeout=10)'))


Extracting content:  38%|███▊      | 1259/3299 [12:06<17:03,  1.99it/s]

Error fetching content: Failed to parse: http://t.co…


Extracting content:  40%|███▉      | 1318/3299 [12:34<1:39:03,  3.00s/it]

Error fetching content: HTTPConnectionPool(host='upwave.co', port=80): Max retries exceeded with url: /18pyCjM (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b452856c5e0>, 'Connection to upwave.co timed out. (connect timeout=10)'))


Extracting content:  40%|████      | 1320/3299 [12:35<55:38,  1.69s/it]  

Error fetching content: Failed to parse: http://…


Extracting content:  40%|████      | 1325/3299 [12:45<1:33:13,  2.83s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1bwg1Co (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4527ea4bb0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  40%|████      | 1329/3299 [12:56<1:56:10,  3.54s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1aG810k (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45279a4c40>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  40%|████      | 1333/3299 [13:07<2:02:58,  3.75s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1agl2fP (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45274a4970>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  40%|████      | 1334/3299 [13:17<2:55:50,  5.37s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1cFft0A (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45274a51b0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1340/3299 [13:29<2:06:38,  3.88s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1bgB37P (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4526adb370>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1341/3299 [13:39<3:05:53,  5.70s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /15ruChD (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4526adba90>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1343/3299 [13:49<3:11:44,  5.88s/it]

Error fetching content: HTTPConnectionPool(host='upwave.co', port=80): Max retries exceeded with url: /18SjdaS (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b452675e7a0>, 'Connection to upwave.co timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1344/3299 [13:59<3:52:03,  7.12s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /16y2hab (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b452675eec0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1346/3299 [14:10<3:33:28,  6.56s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /16y2hab (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45265d9ab0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1347/3299 [14:20<4:07:02,  7.59s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /16xeLyX (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45265da2f0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1350/3299 [14:30<3:05:33,  5.71s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /17eEngB (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45260ed1e0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1352/3299 [14:40<3:10:12,  5.86s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1coPDhd (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4525d68cd0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1353/3299 [14:50<3:50:31,  7.11s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1cn3Ew2 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4525d694e0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1355/3299 [15:01<3:33:19,  6.58s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1cn3Ew2 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4525be0c70>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1356/3299 [15:11<4:06:29,  7.61s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /19ZTATf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4525be14b0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  41%|████      | 1360/3299 [15:21<2:40:59,  4.98s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1dYpxQ9 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45256a3550>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))
Error fetching content: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Extracting content:  41%|████▏     | 1363/3299 [15:25<1:33:18,  2.89s/it]

Error fetching content: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error fetching content: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Extracting content:  41%|████▏     | 1365/3299 [15:35<1:56:10,  3.60s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /16inogx (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45256a35b0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  42%|████▏     | 1381/3299 [15:50<1:42:22,  3.20s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /16Aj9sg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4523c2a530>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  42%|████▏     | 1384/3299 [16:01<2:13:02,  4.17s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1dGBBp7 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4523ad8ca0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  42%|████▏     | 1386/3299 [16:11<2:42:49,  5.11s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /18S6e5D (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45238e7550>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  42%|████▏     | 1389/3299 [16:22<2:36:20,  4.91s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1aPZ31W (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45232700d0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  42%|████▏     | 1395/3299 [16:33<1:46:00,  3.34s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /16mwTYg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45229b7940>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  43%|████▎     | 1404/3299 [16:47<1:28:45,  2.81s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1goZl0q (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45218aac50>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  43%|████▎     | 1408/3299 [16:48<33:42,  1.07s/it]

Error fetching content: Failed to parse: http://t.…


Extracting content:  43%|████▎     | 1413/3299 [16:59<1:35:21,  3.03s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1d8UHE9 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4520ca0d60>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  43%|████▎     | 1419/3299 [17:10<1:28:50,  2.84s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /17qLpkp (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45205d1570>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  43%|████▎     | 1421/3299 [17:21<2:12:45,  4.24s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /17pNFZd (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4520253190>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  43%|████▎     | 1428/3299 [17:34<1:54:07,  3.66s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /14DyRmC (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451f6d0580>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  43%|████▎     | 1430/3299 [17:44<2:31:58,  4.88s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /18B0Gyi (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451f353760>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  43%|████▎     | 1431/3299 [17:44<1:49:20,  3.51s/it]

Error fetching content: Failed to parse: http://t…


Extracting content:  43%|████▎     | 1434/3299 [17:55<2:03:12,  3.96s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /17CqV97 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451eefb250>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  44%|████▎     | 1437/3299 [18:05<2:16:47,  4.41s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /14ZxqlW (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451eb7f520>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  44%|████▎     | 1441/3299 [18:16<2:09:58,  4.20s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /17CA9QU (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451e347850>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  44%|████▍     | 1448/3299 [18:28<1:52:33,  3.65s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /14kEIx0 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451d935540>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  44%|████▍     | 1452/3299 [18:39<2:03:28,  4.01s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /14HGWdj (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451d26ac50>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  44%|████▍     | 1453/3299 [18:49<2:58:47,  5.81s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1ckDfMr (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451d26b490>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  44%|████▍     | 1465/3299 [19:03<1:43:16,  3.38s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /17fASZX (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451b7fc2b0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  44%|████▍     | 1467/3299 [19:14<2:24:09,  4.72s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /14VinpN (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451b48d780>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  44%|████▍     | 1468/3299 [19:24<3:12:34,  6.31s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /13XdD2S (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451b48dfc0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  45%|████▍     | 1478/3299 [19:36<1:26:35,  2.85s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1cVdUwc (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451a406b60>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  45%|████▍     | 1480/3299 [19:47<2:10:47,  4.31s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1d3wr8I (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451a085990>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  45%|████▍     | 1481/3299 [19:47<1:36:30,  3.18s/it]

Error fetching content: Failed to parse: http://t.…


Extracting content:  45%|████▌     | 1488/3299 [19:58<1:42:12,  3.39s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /18A1BQf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4519181ea0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  45%|████▌     | 1489/3299 [20:08<2:39:56,  5.30s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /18zNhah (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4519182740>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  45%|████▌     | 1491/3299 [20:19<2:36:00,  5.18s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /16q8AgT (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4519182980>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  45%|████▌     | 1493/3299 [20:29<2:46:47,  5.54s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /14BTmPt (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4518ffd900>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  45%|████▌     | 1497/3299 [20:40<1:58:22,  3.94s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1a61Ypn (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4518ae7c10>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▌     | 1505/3299 [20:52<1:22:39,  2.76s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1cIwDdA (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4517c2e170>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▌     | 1508/3299 [21:02<1:56:51,  3.91s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /14emLPm (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451772cac0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▌     | 1509/3299 [21:12<2:47:25,  5.61s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /14e3Rs3 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451772d1e0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▌     | 1511/3299 [21:23<2:53:53,  5.84s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /13n9THC (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4517572620>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▌     | 1514/3299 [21:33<2:32:13,  5.12s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /145lev5 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4517603790>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▌     | 1518/3299 [21:44<2:06:33,  4.26s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /15KDAFB (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45170ad1b0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▌     | 1519/3299 [21:54<2:52:43,  5.82s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /15LPu1X (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45170ad9f0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▌     | 1523/3299 [22:05<2:12:04,  4.46s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /12V7EQL (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4516d25090>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▋     | 1527/3299 [22:15<2:00:16,  4.07s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1ehdA6x (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4516bafeb0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▋     | 1528/3299 [22:25<2:46:03,  5.63s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /19BXRBn (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4516be88b0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▋     | 1529/3299 [22:35<3:21:12,  6.82s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /19BPLZx (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4516be90f0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  46%|████▋     | 1531/3299 [22:46<3:09:27,  6.43s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1aZuSVO (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451682a7a0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  47%|████▋     | 1537/3299 [22:57<1:43:30,  3.52s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1ebuQdp (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4515e25360>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  47%|████▋     | 1540/3299 [23:07<2:04:48,  4.26s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /13i1ZCE (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b451591c2e0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  47%|████▋     | 1542/3299 [23:18<2:30:51,  5.15s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /13X0Hg2 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45157980a0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  47%|████▋     | 1543/3299 [23:28<3:12:21,  6.57s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /13shzba (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45157988e0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  48%|████▊     | 1583/3299 [23:50<1:09:44,  2.44s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1bJzyAH (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4510e80ee0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  50%|████▉     | 1637/3299 [24:01<07:09,  3.87it/s]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /15fu7nP (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45105f7730>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  51%|█████     | 1671/3299 [24:02<02:16, 11.90it/s]

Error fetching content: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error fetching content: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Extracting content:  52%|█████▏    | 1724/3299 [24:02<00:36, 43.46it/s]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /11pVr3P (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45105f78b0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  53%|█████▎    | 1737/3299 [24:12<06:43,  3.87it/s]

Error fetching content: Failed to parse: http://t.co…


Extracting content:  53%|█████▎    | 1742/3299 [24:22<13:11,  1.97it/s]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /16KclLy (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4510280eb0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  53%|█████▎    | 1743/3299 [24:24<14:04,  1.84it/s]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /102LiJF (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45102819c0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  53%|█████▎    | 1745/3299 [24:44<38:46,  1.50s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /102JZKw (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4510282620>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  53%|█████▎    | 1746/3299 [24:54<53:34,  2.07s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /1038PKd (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4510282ce0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  53%|█████▎    | 1747/3299 [25:04<1:11:32,  2.77s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /102IGeK (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4510283520>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  53%|█████▎    | 1748/3299 [25:06<1:08:47,  2.66s/it]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /102IGeK (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45102827d0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  53%|█████▎    | 1763/3299 [25:26<35:12,  1.38s/it]  

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /102KWCK (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b4510282c80>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  58%|█████▊    | 1903/3299 [25:38<06:38,  3.50it/s]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /10plI1Z (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b45102c5c00>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  59%|█████▉    | 1954/3299 [25:50<06:57,  3.22it/s]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /10IBAw0 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b450ff42f80>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  61%|██████▏   | 2027/3299 [25:53<01:26, 14.78it/s]

Error fetching content: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Extracting content:  62%|██████▏   | 2046/3299 [26:03<05:03,  4.13it/s]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /15ApskV (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b450fbfe230>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))


Extracting content:  76%|███████▌  | 2500/3299 [26:29<00:23, 34.11it/s]

Error fetching content: HTTPConnectionPool(host='t.c', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7b450f5a4fd0>: Failed to resolve 't.c' ([Errno -2] Name or service not known)"))


Extracting content:  76%|███████▋  | 2521/3299 [26:29<00:20, 38.39it/s]

Error fetching content: HTTPSConnectionPool(host='makingcancerhistory.com', port=443): Max retries exceeded with url: /WhyMDA/Myths.html (Caused by SSLError(SSLError(1, '[SSL: WRONG_SIGNATURE_TYPE] wrong signature type (_ssl.c:1007)')))


Extracting content:  80%|███████▉  | 2639/3299 [26:34<00:12, 51.48it/s]

Error fetching content: HTTPConnectionPool(host='t.', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7b450f139e40>: Failed to resolve 't' ([Errno -2] Name or service not known)"))


Extracting content:  81%|████████  | 2662/3299 [26:34<00:13, 47.87it/s]

Error fetching content: HTTPConnectionPool(host='t.c', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7b450f13b610>: Failed to resolve 't.c' ([Errno -2] Name or service not known)"))


Extracting content: 100%|██████████| 3299/3299 [26:55<00:00,  2.04it/s]

Error fetching content: HTTPConnectionPool(host='on.cnn.com', port=80): Max retries exceeded with url: /SO1PxT (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7b450dbd85b0>, 'Connection to on.cnn.com timed out. (connect timeout=10)'))





In [6]:
# Drop rows with missing content
cnnhealth.dropna(subset=['Content'], inplace=True)

# Combine content paragraphs into a single string
cnnhealth['Content'] = [" ".join([p_tag.text.strip() for p_tag in content]) for content in cnnhealth['Content']]

In [7]:
# Clean the content data
def clean_text(text):
    text = re.sub(r'-', ' ', text)  # Substitute hyphens with empty spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert text to lowercase
    return text

cnnhealth['Content'] = cnnhealth['Content'].apply(clean_text)

In [8]:
# Convert to wordnet tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None  # Use default POS for lemmatization

In [9]:
# Tokenize and preprocess text data
def preprocess_text(text):
    tokens = word_tokenize(text)  # Tokenize text
    lemmatizer = WordNetLemmatizer()  # Initialize lemmatizer
    pos_tags = nltk.pos_tag(tokens)  # Get part-of-speech tags
    for i, (token, tag) in enumerate(pos_tags):
        pos = get_wordnet_pos(tag)  # Convert NLTK POS tags to WordNet POS tags
        if pos:
            tokens[i] = lemmatizer.lemmatize(token, pos=pos)  # Lemmatize tokens
        else:
            tokens[i] = lemmatizer.lemmatize(token)  # Use default POS for lemmatization
    stop_words = set(stopwords.words('english'))  # Get stopwords
    custom_stopwords = [    # Custom stopwords
    "patient", "doctor", "say", "year", "state", "day", "need", "come", "well",
    "make", "think", "know", "go", "use", "one", "like", "people", "may",
    "many", "still", "even", "two", "way", "good", "much", "back", "new",
    "time", "first", "really",
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
    "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
    ]
    tokens = [token for token in tokens if token not in stop_words and token not in custom_stopwords]  # Remove stopwords
    return ' '.join(tokens)

cnnhealth['Content'] = cnnhealth['Content'].apply(preprocess_text)

In [10]:
# Save preprocessed data to CSV
cnnhealth.to_csv('cnnhealth_preprocessed.csv', index=False)